feature_selection_param.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. import copy
  19. from federatedml.param.base_param import BaseParam, deprecated_param
  20. from federatedml.util import consts
  21. class UniqueValueParam(BaseParam):
  22. """
  23. Use the difference between max-value and min-value to judge.
  24. Parameters
  25. ----------
  26. eps : float, default: 1e-5
  27. The column(s) will be filtered if its difference is smaller than eps.
  28. """
  29. def __init__(self, eps=1e-5):
  30. self.eps = eps
  31. def check(self):
  32. descr = "Unique value param's"
  33. self.check_positive_number(self.eps, descr)
  34. return True
  35. class IVValueSelectionParam(BaseParam):
  36. """
  37. Use information values to select features.
  38. Parameters
  39. ----------
  40. value_threshold: float, default: 1.0
  41. Used if iv_value_thres method is used in feature selection.
  42. host_thresholds: List of float or None, default: None
  43. Set threshold for different host. If None, use same threshold as guest. If provided, the order should map with
  44. the host id setting.
  45. """
  46. def __init__(self, value_threshold=0.0, host_thresholds=None, local_only=False):
  47. super().__init__()
  48. self.value_threshold = value_threshold
  49. self.host_thresholds = host_thresholds
  50. self.local_only = local_only
  51. def check(self):
  52. if not isinstance(self.value_threshold, (float, int)):
  53. raise ValueError("IV selection param's value_threshold should be float or int")
  54. if self.host_thresholds is not None:
  55. if not isinstance(self.host_thresholds, list):
  56. raise ValueError("IV selection param's host_threshold should be list or None")
  57. if not isinstance(self.local_only, bool):
  58. raise ValueError("IV selection param's local_only should be bool")
  59. return True
  60. class IVPercentileSelectionParam(BaseParam):
  61. """
  62. Use information values to select features.
  63. Parameters
  64. ----------
  65. percentile_threshold: float
  66. 0 <= percentile_threshold <= 1.0, default: 1.0, Percentile threshold for iv_percentile method
  67. """
  68. def __init__(self, percentile_threshold=1.0, local_only=False):
  69. super().__init__()
  70. self.percentile_threshold = percentile_threshold
  71. self.local_only = local_only
  72. def check(self):
  73. descr = "IV selection param's"
  74. if self.percentile_threshold != 0 or self.percentile_threshold != 1:
  75. self.check_decimal_float(self.percentile_threshold, descr)
  76. self.check_boolean(self.local_only, descr)
  77. return True
  78. class IVTopKParam(BaseParam):
  79. """
  80. Use information values to select features.
  81. Parameters
  82. ----------
  83. k: int
  84. should be greater than 0, default: 10, Percentile threshold for iv_percentile method
  85. """
  86. def __init__(self, k=10, local_only=False):
  87. super().__init__()
  88. self.k = k
  89. self.local_only = local_only
  90. def check(self):
  91. descr = "IV selection param's"
  92. self.check_positive_integer(self.k, descr)
  93. self.check_boolean(self.local_only, descr)
  94. return True
  95. class VarianceOfCoeSelectionParam(BaseParam):
  96. """
  97. Use coefficient of variation to select features. When judging, the absolute value will be used.
  98. Parameters
  99. ----------
  100. value_threshold: float, default: 1.0
  101. Used if coefficient_of_variation_value_thres method is used in feature selection. Filter those
  102. columns who has smaller coefficient of variance than the threshold.
  103. """
  104. def __init__(self, value_threshold=1.0):
  105. self.value_threshold = value_threshold
  106. def check(self):
  107. descr = "Coff of Variances param's"
  108. self.check_positive_number(self.value_threshold, descr)
  109. return True
  110. class OutlierColsSelectionParam(BaseParam):
  111. """
  112. Given percentile and threshold. Judge if this quantile point is larger than threshold. Filter those larger ones.
  113. Parameters
  114. ----------
  115. percentile: float, [0., 1.] default: 1.0
  116. The percentile points to compare.
  117. upper_threshold: float, default: 1.0
  118. Percentile threshold for coefficient_of_variation_percentile method
  119. """
  120. def __init__(self, percentile=1.0, upper_threshold=1.0):
  121. self.percentile = percentile
  122. self.upper_threshold = upper_threshold
  123. def check(self):
  124. descr = "Outlier Filter param's"
  125. self.check_decimal_float(self.percentile, descr)
  126. self.check_defined_type(self.upper_threshold, descr, ['float', 'int'])
  127. return True
  128. class CommonFilterParam(BaseParam):
  129. """
  130. All of the following parameters can set with a single value or a list of those values.
  131. When setting one single value, it means using only one metric to filter while
  132. a list represent for using multiple metrics.
  133. Please note that if some of the following values has been set as list, all of them
  134. should have same length. Otherwise, error will be raised. And if there exist a list
  135. type parameter, the metrics should be in list type.
  136. Parameters
  137. ----------
  138. metrics: str or list, default: depends on the specific filter
  139. Indicate what metrics are used in this filter
  140. filter_type: str, default: threshold
  141. Should be one of "threshold", "top_k" or "top_percentile"
  142. take_high: bool, default: True
  143. When filtering, taking highest values or not.
  144. threshold: float or int, default: 1
  145. If filter type is threshold, this is the threshold value.
  146. If it is "top_k", this is the k value.
  147. If it is top_percentile, this is the percentile threshold.
  148. host_thresholds: List of float or List of List of float or None, default: None
  149. Set threshold for different host. If None, use same threshold as guest. If provided, the order should map with
  150. the host id setting.
  151. select_federated: bool, default: True
  152. Whether select federated with other parties or based on local variables
  153. """
  154. def __init__(self, metrics, filter_type='threshold', take_high=True, threshold=1,
  155. host_thresholds=None, select_federated=True):
  156. super().__init__()
  157. self.metrics = metrics
  158. self.filter_type = filter_type
  159. self.take_high = take_high
  160. self.threshold = threshold
  161. self.host_thresholds = host_thresholds
  162. self.select_federated = select_federated
  163. def check(self):
  164. self._convert_to_list(param_names=["filter_type", "take_high",
  165. "threshold", "select_federated"])
  166. for v in self.filter_type:
  167. if v not in ["threshold", "top_k", "top_percentile"]:
  168. raise ValueError('filter_type should be one of '
  169. '"threshold", "top_k", "top_percentile"')
  170. descr = "hetero feature selection param's"
  171. for v in self.take_high:
  172. self.check_boolean(v, descr)
  173. for idx, v in enumerate(self.threshold):
  174. if self.filter_type[idx] == "threshold":
  175. if not isinstance(v, (float, int)):
  176. raise ValueError(descr + f"{v} should be a float or int")
  177. elif self.filter_type[idx] == 'top_k':
  178. self.check_positive_integer(v, descr)
  179. else:
  180. if not (v == 0 or v == 1):
  181. self.check_decimal_float(v, descr)
  182. if self.host_thresholds is not None:
  183. if not isinstance(self.host_thresholds, list):
  184. self.host_thresholds = [self.host_thresholds]
  185. # raise ValueError("selection param's host_thresholds should be list or None")
  186. assert isinstance(self.select_federated, list)
  187. for v in self.select_federated:
  188. self.check_boolean(v, descr)
  189. def _convert_to_list(self, param_names):
  190. if not isinstance(self.metrics, list):
  191. for value_name in param_names:
  192. v = getattr(self, value_name)
  193. if isinstance(v, list):
  194. raise ValueError(f"{value_name}: {v} should not be a list when "
  195. f"metrics: {self.metrics} is not a list")
  196. setattr(self, value_name, [v])
  197. setattr(self, "metrics", [self.metrics])
  198. else:
  199. expected_length = len(self.metrics)
  200. for value_name in param_names:
  201. v = getattr(self, value_name)
  202. if isinstance(v, list):
  203. if len(v) != expected_length:
  204. raise ValueError(f"The parameter {v} should have same length "
  205. f"with metrics")
  206. else:
  207. new_v = [v] * expected_length
  208. setattr(self, value_name, new_v)
  209. class IVFilterParam(CommonFilterParam):
  210. """
  211. Parameters
  212. ----------
  213. mul_class_merge_type: str or list, default: "average"
  214. Indicate how to merge multi-class iv results. Support "average", "min" and "max".
  215. """
  216. def __init__(self, filter_type='threshold', threshold=1,
  217. host_thresholds=None, select_federated=True, mul_class_merge_type="average"):
  218. super().__init__(metrics='iv', filter_type=filter_type, take_high=True, threshold=threshold,
  219. host_thresholds=host_thresholds, select_federated=select_federated)
  220. self.mul_class_merge_type = mul_class_merge_type
  221. def check(self):
  222. super(IVFilterParam, self).check()
  223. self._convert_to_list(param_names=["mul_class_merge_type"])
  224. class CorrelationFilterParam(BaseParam):
  225. """
  226. This filter follow this specific rules:
  227. 1. Sort all the columns from high to low based on specific metric, eg. iv.
  228. 2. Traverse each sorted column. If there exists other columns with whom the
  229. absolute values of correlation are larger than threshold, they will be filtered.
  230. Parameters
  231. ----------
  232. sort_metric: str, default: iv
  233. Specify which metric to be used to sort features.
  234. threshold: float or int, default: 0.1
  235. Correlation threshold
  236. select_federated: bool, default: True
  237. Whether select federated with other parties or based on local variables
  238. """
  239. def __init__(self, sort_metric='iv', threshold=0.1, select_federated=True):
  240. super().__init__()
  241. self.sort_metric = sort_metric
  242. self.threshold = threshold
  243. self.select_federated = select_federated
  244. def check(self):
  245. descr = "Correlation Filter param's"
  246. self.sort_metric = self.sort_metric.lower()
  247. support_metrics = ['iv']
  248. if self.sort_metric not in support_metrics:
  249. raise ValueError(f"sort_metric in Correlation Filter should be one of {support_metrics}")
  250. self.check_positive_number(self.threshold, descr)
  251. class PercentageValueParam(BaseParam):
  252. """
  253. Filter the columns that have a value that exceeds a certain percentage.
  254. Parameters
  255. ----------
  256. upper_pct: float, [0.1, 1.], default: 1.0
  257. The upper percentage threshold for filtering, upper_pct should not be less than 0.1.
  258. """
  259. def __init__(self, upper_pct=1.0):
  260. super().__init__()
  261. self.upper_pct = upper_pct
  262. def check(self):
  263. descr = "Percentage Filter param's"
  264. if self.upper_pct not in [0, 1]:
  265. self.check_decimal_float(self.upper_pct, descr)
  266. if self.upper_pct < consts.PERCENTAGE_VALUE_LIMIT:
  267. raise ValueError(descr + f" {self.upper_pct} not supported,"
  268. f" should not be smaller than {consts.PERCENTAGE_VALUE_LIMIT}")
  269. return True
  270. class ManuallyFilterParam(BaseParam):
  271. """
  272. Specified columns that need to be filtered. If exist, it will be filtered directly, otherwise, ignore it.
  273. Both Filter_out or left parameters only works for this specific filter. For instances, if you set some columns left
  274. in this filter but those columns are filtered by other filters, those columns will NOT left in final.
  275. Please note that (left_col_indexes & left_col_names) cannot use with (filter_out_indexes & filter_out_names) simultaneously.
  276. Parameters
  277. ----------
  278. filter_out_indexes: list of int, default: None
  279. Specify columns' indexes to be filtered out
  280. Note tha columns specified by `filter_out_indexes` and `filter_out_names` will be combined.
  281. filter_out_names : list of string, default: None
  282. Specify columns' names to be filtered out
  283. Note tha columns specified by `filter_out_indexes` and `filter_out_names` will be combined.
  284. left_col_indexes: list of int, default: None
  285. Specify left_col_index
  286. Note tha columns specified by `left_col_indexes` and `left_col_names` will be combined.
  287. left_col_names: list of string, default: None
  288. Specify left col names
  289. Note tha columns specified by `left_col_indexes` and `left_col_names` will be combined.
  290. """
  291. def __init__(self, filter_out_indexes=None, filter_out_names=None, left_col_indexes=None,
  292. left_col_names=None):
  293. super().__init__()
  294. self.filter_out_indexes = filter_out_indexes
  295. self.filter_out_names = filter_out_names
  296. self.left_col_indexes = left_col_indexes
  297. self.left_col_names = left_col_names
  298. def check(self):
  299. descr = "Manually Filter param's"
  300. self.check_defined_type(self.filter_out_indexes, descr, ['list', 'NoneType'])
  301. self.check_defined_type(self.filter_out_names, descr, ['list', 'NoneType'])
  302. self.check_defined_type(self.left_col_indexes, descr, ['list', 'NoneType'])
  303. self.check_defined_type(self.left_col_names, descr, ['list', 'NoneType'])
  304. if (self.filter_out_indexes or self.filter_out_names) is not None and \
  305. (self.left_col_names or self.left_col_indexes) is not None:
  306. raise ValueError("(left_col_indexes & left_col_names) cannot use with"
  307. " (filter_out_indexes & filter_out_names) simultaneously")
  308. return True
# Names of the FeatureSelectionParam sub-params handed to the
# `deprecated_param` decorator applied to that class.
deprecated_param_list = ["iv_value_param", "iv_percentile_param",
                         "iv_top_k_param", "variance_coe_param", "unique_param",
                         "outlier_param"]
  312. @deprecated_param(*deprecated_param_list)
  313. class FeatureSelectionParam(BaseParam):
  314. """
  315. Define the feature selection parameters.
  316. Parameters
  317. ----------
  318. select_col_indexes: list or int, default: -1
  319. Specify which columns need to calculated. -1 represent for all columns.
  320. Note tha columns specified by `select_col_indexes` and `select_names` will be combined.
  321. select_names : list of string, default: []
  322. Specify which columns need to calculated. Each element in the list represent for a column name in header.
  323. Note tha columns specified by `select_col_indexes` and `select_names` will be combined.
  324. filter_methods: list of ["manually", "iv_filter", "statistic_filter", "psi_filter",
  325. “hetero_sbt_filter", "homo_sbt_filter", "hetero_fast_sbt_filter", "percentage_value",
  326. "vif_filter", "correlation_filter"], default: ["manually"].
  327. The following methods will be deprecated in future version:
  328. "unique_value", "iv_value_thres", "iv_percentile",
  329. "coefficient_of_variation_value_thres", "outlier_cols"
  330. Specify the filter methods used in feature selection. The orders of filter used is depended on this list.
  331. Please be notified that, if a percentile method is used after some certain filter method,
  332. the percentile represent for the ratio of rest features.
  333. e.g. If you have 10 features at the beginning. After first filter method, you have 8 rest. Then, you want
  334. top 80% highest iv feature. Here, we will choose floor(0.8 * 8) = 6 features instead of 8.
  335. unique_param: UniqueValueParam
  336. filter the columns if all values in this feature is the same
  337. iv_value_param: IVValueSelectionParam
  338. Use information value to filter columns. If this method is set, a float threshold need to be provided.
  339. Filter those columns whose iv is smaller than threshold. Will be deprecated in the future.
  340. iv_percentile_param: IVPercentileSelectionParam
  341. Use information value to filter columns. If this method is set, a float ratio threshold
  342. need to be provided. Pick floor(ratio * feature_num) features with higher iv. If multiple features around
  343. the threshold are same, all those columns will be keep. Will be deprecated in the future.
  344. variance_coe_param: VarianceOfCoeSelectionParam
  345. Use coefficient of variation to judge whether filtered or not.
  346. Will be deprecated in the future.
  347. outlier_param: OutlierColsSelectionParam
  348. Filter columns whose certain percentile value is larger than a threshold.
  349. Will be deprecated in the future.
  350. percentage_value_param: PercentageValueParam
  351. Filter the columns that have a value that exceeds a certain percentage.
  352. iv_param: IVFilterParam
  353. Setting how to filter base on iv. It support take high mode only. All of "threshold",
  354. "top_k" and "top_percentile" are accepted. Check more details in CommonFilterParam. To
  355. use this filter, hetero-feature-binning module has to be provided.
  356. statistic_param: CommonFilterParam
  357. Setting how to filter base on statistic values. All of "threshold",
  358. "top_k" and "top_percentile" are accepted. Check more details in CommonFilterParam.
  359. To use this filter, data_statistic module has to be provided.
  360. psi_param: CommonFilterParam
  361. Setting how to filter base on psi values. All of "threshold",
  362. "top_k" and "top_percentile" are accepted. Its take_high properties should be False
  363. to choose lower psi features. Check more details in CommonFilterParam.
  364. To use this filter, data_statistic module has to be provided.
  365. use_anonymous: bool, default False
  366. whether to interpret 'select_names' as anonymous names.
  367. need_run: bool, default True
  368. Indicate if this module needed to be run
  369. """
  370. def __init__(self, select_col_indexes=-1, select_names=None, filter_methods=None,
  371. unique_param=UniqueValueParam(),
  372. iv_value_param=IVValueSelectionParam(),
  373. iv_percentile_param=IVPercentileSelectionParam(),
  374. iv_top_k_param=IVTopKParam(),
  375. variance_coe_param=VarianceOfCoeSelectionParam(),
  376. outlier_param=OutlierColsSelectionParam(),
  377. manually_param=ManuallyFilterParam(),
  378. percentage_value_param=PercentageValueParam(),
  379. iv_param=IVFilterParam(),
  380. statistic_param=CommonFilterParam(metrics=consts.MEAN),
  381. psi_param=CommonFilterParam(metrics=consts.PSI,
  382. take_high=False),
  383. vif_param=CommonFilterParam(metrics=consts.VIF,
  384. threshold=5.0,
  385. take_high=False),
  386. sbt_param=CommonFilterParam(metrics=consts.FEATURE_IMPORTANCE),
  387. correlation_param=CorrelationFilterParam(),
  388. use_anonymous=False,
  389. need_run=True
  390. ):
  391. super(FeatureSelectionParam, self).__init__()
  392. self.correlation_param = correlation_param
  393. self.vif_param = vif_param
  394. self.select_col_indexes = select_col_indexes
  395. if select_names is None:
  396. self.select_names = []
  397. else:
  398. self.select_names = select_names
  399. if filter_methods is None:
  400. self.filter_methods = [consts.MANUALLY_FILTER]
  401. else:
  402. self.filter_methods = filter_methods
  403. # deprecate in the future
  404. self.unique_param = copy.deepcopy(unique_param)
  405. self.iv_value_param = copy.deepcopy(iv_value_param)
  406. self.iv_percentile_param = copy.deepcopy(iv_percentile_param)
  407. self.iv_top_k_param = copy.deepcopy(iv_top_k_param)
  408. self.variance_coe_param = copy.deepcopy(variance_coe_param)
  409. self.outlier_param = copy.deepcopy(outlier_param)
  410. self.percentage_value_param = copy.deepcopy(percentage_value_param)
  411. self.manually_param = copy.deepcopy(manually_param)
  412. self.iv_param = copy.deepcopy(iv_param)
  413. self.statistic_param = copy.deepcopy(statistic_param)
  414. self.psi_param = copy.deepcopy(psi_param)
  415. self.sbt_param = copy.deepcopy(sbt_param)
  416. self.need_run = need_run
  417. self.use_anonymous = use_anonymous
  418. def check(self):
  419. descr = "hetero feature selection param's"
  420. self.check_defined_type(self.filter_methods, descr, ['list'])
  421. for idx, method in enumerate(self.filter_methods):
  422. method = method.lower()
  423. self.check_valid_value(method, descr, [consts.UNIQUE_VALUE, consts.IV_VALUE_THRES, consts.IV_PERCENTILE,
  424. consts.COEFFICIENT_OF_VARIATION_VALUE_THRES, consts.OUTLIER_COLS,
  425. consts.MANUALLY_FILTER, consts.PERCENTAGE_VALUE,
  426. consts.IV_FILTER, consts.STATISTIC_FILTER, consts.IV_TOP_K,
  427. consts.PSI_FILTER, consts.HETERO_SBT_FILTER,
  428. consts.HOMO_SBT_FILTER, consts.HETERO_FAST_SBT_FILTER,
  429. consts.VIF_FILTER, consts.CORRELATION_FILTER])
  430. self.filter_methods[idx] = method
  431. self.check_defined_type(self.select_col_indexes, descr, ['list', 'int'])
  432. self.unique_param.check()
  433. self.iv_value_param.check()
  434. self.iv_percentile_param.check()
  435. self.iv_top_k_param.check()
  436. self.variance_coe_param.check()
  437. self.outlier_param.check()
  438. self.manually_param.check()
  439. self.percentage_value_param.check()
  440. self.iv_param.check()
  441. for th in self.iv_param.take_high:
  442. if not th:
  443. raise ValueError("Iv filter should take higher iv features")
  444. for m in self.iv_param.metrics:
  445. if m != consts.IV:
  446. raise ValueError("For iv filter, metrics should be 'iv'")
  447. self.statistic_param.check()
  448. self.psi_param.check()
  449. for th in self.psi_param.take_high:
  450. if th:
  451. raise ValueError("PSI filter should take lower psi features")
  452. for m in self.psi_param.metrics:
  453. if m != consts.PSI:
  454. raise ValueError("For psi filter, metrics should be 'psi'")
  455. self.sbt_param.check()
  456. for th in self.sbt_param.take_high:
  457. if not th:
  458. raise ValueError("SBT filter should take higher feature_importance features")
  459. for m in self.sbt_param.metrics:
  460. if m != consts.FEATURE_IMPORTANCE:
  461. raise ValueError("For SBT filter, metrics should be 'feature_importance'")
  462. self.vif_param.check()
  463. for m in self.vif_param.metrics:
  464. if m != consts.VIF:
  465. raise ValueError("For VIF filter, metrics should be 'vif'")
  466. self.correlation_param.check()
  467. self.check_boolean(self.use_anonymous, f"{descr} use_anonymous")
  468. self._warn_to_deprecate_param("iv_value_param", descr, "iv_param")
  469. self._warn_to_deprecate_param("iv_percentile_param", descr, "iv_param")
  470. self._warn_to_deprecate_param("iv_top_k_param", descr, "iv_param")
  471. self._warn_to_deprecate_param("variance_coe_param", descr, "statistic_param")
  472. self._warn_to_deprecate_param("unique_param", descr, "statistic_param")
  473. self._warn_to_deprecate_param("outlier_param", descr, "statistic_param")