  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. import bisect
  18. import functools
  19. import math
  20. import random
  21. import copy
  22. import numpy as np
  23. from federatedml.feature.binning.bin_inner_param import BinInnerParam
  24. from federatedml.feature.binning.bin_result import BinColResults, SplitPointsResult
  25. from federatedml.statistic.data_overview import get_header, get_anonymous_header
  26. from federatedml.feature.sparse_vector import SparseVector
  27. from federatedml.param.feature_binning_param import FeatureBinningParam
  28. from federatedml.statistic import data_overview
  29. from federatedml.util import LOGGER
  30. from federatedml.feature.fate_element_type import NoneType
  31. # from federatedml.statistic import statics
  32. class BaseBinning(object):
  33. """
  34. This is use for discrete data so that can transform data or use information for feature selection.
  35. """
  36. def __init__(self, params=None, abnormal_list=None, labels=None):
  37. self.bin_inner_param: BinInnerParam = None
  38. self.is_multi_class = False
  39. self.bin_results = SplitPointsResult()
  40. if params is None:
  41. return
  42. if isinstance(params, FeatureBinningParam):
  43. self.params = params
  44. else:
  45. self.params = None
  46. self.bin_num = params.bin_num
  47. if abnormal_list is None:
  48. self.abnormal_list = []
  49. else:
  50. self.abnormal_list = abnormal_list
  51. self.split_points = None
  52. @property
  53. def header(self):
  54. return self.bin_inner_param.header
  55. # @property
  56. # def split_points(self):
  57. # return self.bin_results.all_split_points
  58. def _default_setting(self, header, anonymous_header):
  59. if self.bin_inner_param is not None:
  60. return
  61. self.bin_inner_param = BinInnerParam()
  62. self.bin_inner_param.set_header(header, anonymous_header)
  63. if self.params.bin_indexes == -1:
  64. self.bin_inner_param.set_bin_all()
  65. else:
  66. self.bin_inner_param.add_bin_indexes(self.params.bin_indexes)
  67. self.bin_inner_param.add_bin_names(self.params.bin_names)
  68. self.bin_inner_param.add_category_indexes(self.params.category_indexes)
  69. self.bin_inner_param.add_category_names(self.params.category_names)
  70. if self.params.transform_param.transform_cols == -1:
  71. self.bin_inner_param.set_transform_all()
  72. else:
  73. self.bin_inner_param.add_transform_bin_indexes(self.params.transform_param.transform_cols)
  74. self.bin_inner_param.add_transform_bin_names(self.params.transform_param.transform_names)
  75. def fit_split_points(self, data_instances):
  76. """
  77. Get split points
  78. Parameters
  79. ----------
  80. data_instances : Table
  81. The input data
  82. Returns
  83. -------
  84. split_points : dict.
  85. Each value represent for the split points for a feature. The element in each row represent for
  86. the corresponding split point.
  87. e.g.
  88. split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...], # The first feature
  89. 'x2': [1, 2, 3, 4, ...], # The second feature
  90. ...] # Other features
  91. """
  92. pass
  93. def fit_category_features(self, data_instances):
  94. is_sparse = data_overview.is_sparse_data(data_instances)
  95. if len(self.bin_inner_param.category_indexes) > 0:
  96. statics_obj = data_overview.DataStatistics()
  97. category_col_values = statics_obj.static_all_values(data_instances,
  98. self.bin_inner_param.category_indexes,
  99. is_sparse)
  100. for col_name, split_points in zip(self.bin_inner_param.category_names, category_col_values):
  101. self.bin_results.put_col_split_points(col_name, split_points)
  102. self.bin_results.put_col_optimal_metric_array(col_name, None)
  103. def set_bin_inner_param(self, bin_inner_param):
  104. self.bin_inner_param = bin_inner_param
  105. def transform(self, data_instances, transform_type):
  106. # self._init_cols(data_instances)
  107. for col_name in self.bin_inner_param.transform_bin_names:
  108. if col_name not in self.bin_inner_param.col_name_maps:
  109. raise ValueError("Transform col_name: {} is not existed".format(col_name))
  110. if transform_type == 'bin_num':
  111. data_instances, _, _ = self.convert_feature_to_bin(data_instances)
  112. elif transform_type == 'woe':
  113. data_instances = self.convert_feature_to_woe(data_instances)
  114. return data_instances
  115. @staticmethod
  116. def get_data_bin(data_instances, split_points, bin_cols_map):
  117. """
  118. Apply the binning method
  119. Parameters
  120. ----------
  121. data_instances : Table
  122. The input data
  123. split_points : dict.
  124. Each value represent for the split points for a feature. The element in each row represent for
  125. the corresponding split point.
  126. e.g.
  127. split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...], # The first feature
  128. 'x2': [1, 2, 3, 4, ...], # The second feature
  129. ...] # Other features
  130. Returns
  131. -------
  132. data_bin_table : Table.
  133. Each element represent for the corresponding bin number this feature belongs to.
  134. e.g. it could be:
  135. [{'x1': 1, 'x2': 5, 'x3': 2}
  136. ...
  137. ]
  138. """
  139. # self._init_cols(data_instances)
  140. is_sparse = data_overview.is_sparse_data(data_instances)
  141. header = data_instances.schema.get('header')
  142. f = functools.partial(BaseBinning.bin_data,
  143. split_points=split_points,
  144. cols_dict=bin_cols_map,
  145. header=header,
  146. is_sparse=is_sparse)
  147. data_bin_dict = data_instances.mapValues(f)
  148. return data_bin_dict
  149. def convert_feature_to_woe(self, data_instances):
  150. is_sparse = data_overview.is_sparse_data(data_instances)
  151. schema = data_instances.schema
  152. if is_sparse:
  153. f = functools.partial(self._convert_sparse_data,
  154. bin_inner_param=self.bin_inner_param,
  155. bin_results=self.bin_results,
  156. abnormal_list=self.abnormal_list,
  157. convert_type='woe'
  158. )
  159. new_data = data_instances.mapValues(f)
  160. else:
  161. f = functools.partial(self._convert_dense_data,
  162. bin_inner_param=self.bin_inner_param,
  163. bin_results=self.bin_results,
  164. abnormal_list=self.abnormal_list,
  165. convert_type='woe')
  166. new_data = data_instances.mapValues(f)
  167. new_data.schema = schema
  168. return new_data
  169. def convert_feature_to_bin(self, data_instances, split_points=None):
  170. is_sparse = data_overview.is_sparse_data(data_instances)
  171. schema = data_instances.schema
  172. if split_points is None:
  173. split_points = self.bin_results.all_split_points
  174. else:
  175. for col_name, sp in split_points.items():
  176. self.bin_results.put_col_split_points(col_name, sp)
  177. if is_sparse:
  178. f = functools.partial(self._convert_sparse_data,
  179. bin_inner_param=self.bin_inner_param,
  180. bin_results=self.bin_results,
  181. abnormal_list=self.abnormal_list,
  182. convert_type='bin_num'
  183. )
  184. new_data = data_instances.mapValues(f)
  185. else:
  186. f = functools.partial(self._convert_dense_data,
  187. bin_inner_param=self.bin_inner_param,
  188. bin_results=self.bin_results,
  189. abnormal_list=self.abnormal_list,
  190. convert_type='bin_num')
  191. new_data = data_instances.mapValues(f)
  192. new_data.schema = schema
  193. header = get_header(data_instances)
  194. bin_sparse = self.get_sparse_bin(self.bin_inner_param.transform_bin_indexes, split_points, header)
  195. split_points_result = self.bin_results.get_split_points_array(self.bin_inner_param.transform_bin_names)
  196. return new_data, split_points_result, bin_sparse
  197. def _setup_bin_inner_param(self, data_instances, params):
  198. if self.bin_inner_param is not None:
  199. return
  200. self.bin_inner_param = BinInnerParam()
  201. header = get_header(data_instances)
  202. anonymous_header = get_anonymous_header(data_instances)
  203. LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(len(self.header)))
  204. self.schema = data_instances.schema
  205. self.bin_inner_param.set_header(header, anonymous_header)
  206. if params.bin_indexes == -1:
  207. self.bin_inner_param.set_bin_all()
  208. else:
  209. self.bin_inner_param.add_bin_indexes(params.bin_indexes)
  210. self.bin_inner_param.add_bin_names(params.bin_names)
  211. self.bin_inner_param.add_category_indexes(params.category_indexes)
  212. self.bin_inner_param.add_category_names(params.category_names)
  213. if params.transform_param.transform_cols == -1:
  214. self.bin_inner_param.set_transform_all()
  215. else:
  216. self.bin_inner_param.add_transform_bin_indexes(params.transform_param.transform_cols)
  217. self.bin_inner_param.add_transform_bin_names(params.transform_param.transform_names)
  218. self.set_bin_inner_param(self.bin_inner_param)
  219. @staticmethod
  220. def _convert_sparse_data(instances, bin_inner_param: BinInnerParam, bin_results: SplitPointsResult,
  221. abnormal_list: list, convert_type: str = 'bin_num'):
  222. instances = copy.deepcopy(instances)
  223. all_data = instances.features.get_all_data()
  224. data_shape = instances.features.get_shape()
  225. indice = []
  226. sparse_value = []
  227. transform_cols_idx_set = bin_inner_param.transform_bin_indexes_added_set
  228. split_points_dict = bin_results.all_split_points
  229. for col_idx, col_value in all_data:
  230. if col_idx in transform_cols_idx_set:
  231. if col_value in abnormal_list:
  232. indice.append(col_idx)
  233. sparse_value.append(col_value)
  234. continue
  235. # Maybe it is because missing value add in sparse value, but
  236. col_name = bin_inner_param.header[col_idx]
  237. split_points = split_points_dict[col_name]
  238. bin_num = BaseBinning.get_bin_num(col_value, split_points)
  239. indice.append(col_idx)
  240. if convert_type == 'bin_num':
  241. sparse_value.append(bin_num)
  242. else:
  243. sparse_value.append(col_value)
  244. else:
  245. indice.append(col_idx)
  246. sparse_value.append(col_value)
  247. sparse_vector = SparseVector(indice, sparse_value, data_shape)
  248. instances.features = sparse_vector
  249. return instances
  250. @staticmethod
  251. def get_sparse_bin(transform_cols_idx, split_points_dict, header):
  252. """
  253. Get which bins the 0 located at for each column.
  254. Returns
  255. -------
  256. Dict of sparse bin num
  257. {0: 2, 1: 3, 2:5 ... }
  258. """
  259. result = {}
  260. for col_idx in transform_cols_idx:
  261. col_name = header[col_idx]
  262. split_points = split_points_dict[col_name]
  263. sparse_bin_num = BaseBinning.get_bin_num(0, split_points)
  264. result[col_idx] = sparse_bin_num
  265. return result
  266. @staticmethod
  267. def _convert_dense_data(instances, bin_inner_param: BinInnerParam, bin_results: SplitPointsResult,
  268. abnormal_list: list, convert_type: str = 'bin_num'):
  269. instances = copy.deepcopy(instances)
  270. features = instances.features
  271. transform_cols_idx_set = bin_inner_param.transform_bin_indexes_added_set
  272. split_points_dict = bin_results.all_split_points
  273. for col_idx, col_value in enumerate(features):
  274. if col_idx in transform_cols_idx_set:
  275. if col_value in abnormal_list:
  276. features[col_idx] = col_value
  277. continue
  278. col_name = bin_inner_param.header[col_idx]
  279. split_points = split_points_dict[col_name]
  280. bin_num = BaseBinning.get_bin_num(col_value, split_points)
  281. if convert_type == 'bin_num':
  282. features[col_idx] = bin_num
  283. else:
  284. features[col_idx] = col_value
  285. instances.features = features
  286. return instances
  287. @staticmethod
  288. def convert_bin_counts_table(result_counts, idx):
  289. """
  290. Given event count information calculate iv information
  291. Parameters
  292. ----------
  293. result_counts: table.
  294. It is like:
  295. ('x1': [[label_0_count, label_1_count, ...], [label_0_count, label_1_count, ...] ... ],
  296. 'x2': [[label_0_count, label_1_count, ...], [label_0_count, label_1_count, ...] ... ],
  297. ...
  298. )
  299. idx: int
  300. Returns
  301. -------
  302. ('x1': [[event_count, non_event_count], [event_count, non_event_count] ... ],
  303. 'x2': [[event_count, non_event_count], [event_count, non_event_count] ... ],
  304. ...
  305. )
  306. """
  307. def _convert(list_counts):
  308. res = []
  309. for c_array in list_counts:
  310. event_count = c_array[idx]
  311. non_event_count = np.sum(c_array) - event_count
  312. res.append([event_count, non_event_count])
  313. return res
  314. return result_counts.mapValues(_convert)
  315. @staticmethod
  316. def fill_sparse_result(col_name, static_nums, sparse_bin_points, label_counts):
  317. """
  318. Parameters
  319. ----------
  320. static_nums : list.
  321. It is like:
  322. [[event_count, total_num], [event_count, total_num] ... ]
  323. sparse_bin_points : dict
  324. Dict of sparse bin num
  325. {"x1": 2, "x2": 3, "x3": 5 ... }
  326. label_counts: np.array
  327. eg. [100, 200, ...]
  328. Returns
  329. -------
  330. The format is same as result_counts.
  331. """
  332. curt_all = functools.reduce(lambda x, y: x + y, static_nums)
  333. sparse_bin = sparse_bin_points.get(col_name)
  334. static_nums[sparse_bin] = label_counts - curt_all
  335. return col_name, static_nums
  336. @staticmethod
  337. def bin_data(instance, split_points, cols_dict, header, is_sparse):
  338. """
  339. Apply the binning method
  340. Parameters
  341. ----------
  342. instance : Table
  343. The input data
  344. split_points : dict.
  345. Each value represent for the split points for a feature. The element in each row represent for
  346. the corresponding split point.
  347. e.g.
  348. split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...], # The first feature
  349. 'x2': [1, 2, 3, 4, ...], # The second feature
  350. ...] # Other features
  351. cols_dict: dict
  352. Record key, value pairs where key is cols' name, and value is cols' index.
  353. header: list
  354. header of Table
  355. is_sparse: bool
  356. Specify whether it is sparse data or not
  357. Returns
  358. -------
  359. result_bin_dict : dict.
  360. Each element represent for the corresponding bin number this feature belongs to.
  361. e.g. it could be:
  362. [{1: 1, 2: 5, 3: 2}
  363. ...
  364. ] # Each number represent for the bin number it belongs to.
  365. """
  366. result_bin_nums = {}
  367. if is_sparse:
  368. sparse_data = instance.features.get_all_data()
  369. for col_idx, col_value in sparse_data:
  370. col_name = header[col_idx]
  371. if col_name in cols_dict:
  372. col_split_points = split_points[col_name]
  373. col_bin_num = BaseBinning.get_bin_num(col_value, col_split_points)
  374. result_bin_nums[col_name] = col_bin_num
  375. return result_bin_nums
  376. # For dense data
  377. for col_name, col_index in cols_dict.items():
  378. col_split_points = split_points[col_name]
  379. value = instance.features[col_index]
  380. col_bin_num = BaseBinning.get_bin_num(value, col_split_points)
  381. result_bin_nums[col_name] = col_bin_num
  382. return result_bin_nums
  383. @staticmethod
  384. def get_bin_num(value, split_points):
  385. if np.isnan(value):
  386. return len(split_points)
  387. sp = split_points[:-1]
  388. col_bin_num = bisect.bisect_left(sp, value)
  389. # col_bin_num = bisect.bisect_left(split_points, value)
  390. return col_bin_num
  391. @staticmethod
  392. def add_label_in_partition_bak(data_bin_with_table, sparse_bin_points):
  393. """
  394. Add all label, so that become convenient to calculate woe and iv
  395. Parameters
  396. ----------
  397. data_bin_with_table : Table
  398. The input data, the Table is like:
  399. (id, {'x1': 1, 'x2': 5, 'x3': 2}, y)
  400. sparse_bin_points: dict
  401. Dict of sparse bin num
  402. {0: 2, 1: 3, 2:5 ... }
  403. Returns
  404. -------
  405. result_sum: the result Table. It is like:
  406. {'x1': [[event_count, total_num], [event_count, total_num] ... ],
  407. 'x2': [[event_count, total_num], [event_count, total_num] ... ],
  408. ...
  409. }
  410. """
  411. result_sum = {}
  412. for _, datas in data_bin_with_table:
  413. bin_idx_dict = datas[0]
  414. y = datas[1]
  415. # y = y_combo[0]
  416. # inverse_y = y_combo[1]
  417. for col_name, bin_idx in bin_idx_dict.items():
  418. result_sum.setdefault(col_name, [])
  419. col_sum = result_sum[col_name]
  420. while bin_idx >= len(col_sum):
  421. col_sum.append([0, 0])
  422. if bin_idx == sparse_bin_points[col_name]:
  423. continue
  424. label_sum = col_sum[bin_idx]
  425. label_sum[0] = label_sum[0] + y
  426. label_sum[1] = label_sum[1] + 1
  427. col_sum[bin_idx] = label_sum
  428. result_sum[col_name] = col_sum
  429. return list(result_sum.items())