selection_properties.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. import operator
  18. from federatedml.protobuf.generated import feature_selection_param_pb2
  19. from federatedml.util import LOGGER
  20. class SelectionProperties(object):
  21. def __init__(self):
  22. self.header = []
  23. self.anonymous_header = []
  24. self.anonymous_col_name_maps = {}
  25. self.col_name_maps = {}
  26. self.last_left_col_indexes = []
  27. self.select_col_indexes = []
  28. self.select_col_names = []
  29. # self.anonymous_select_col_names = []
  30. self.left_col_indexes_added = set()
  31. self.left_col_indexes = []
  32. self.left_col_names = []
  33. # self.anonymous_left_col_names = []
  34. self.feature_values = {}
  35. def load_properties_with_new_header(self, header, feature_values, left_cols_obj, new_header_dict):
  36. self.set_header(list(new_header_dict.values()))
  37. self.set_last_left_col_indexes([header.index(i) for i in left_cols_obj.original_cols])
  38. self.add_select_col_names([new_header_dict.get(col) for col in left_cols_obj.original_cols])
  39. for col_name, _ in feature_values.items():
  40. self.add_feature_value(new_header_dict.get(col_name), feature_values.get(col_name))
  41. left_cols_dict = dict(left_cols_obj.left_cols)
  42. # LOGGER.info(f"left_cols_dict: {left_cols_dict}")
  43. for col_name, _ in left_cols_dict.items():
  44. if left_cols_dict.get(col_name):
  45. self.add_left_col_name(new_header_dict.get(col_name))
  46. # LOGGER.info(f"select properties all left cols names: {self.all_left_col_names}")
  47. return self
  48. def load_properties(self, header, feature_values, left_cols_obj):
  49. self.set_header(header)
  50. self.set_last_left_col_indexes([header.index(i) for i in left_cols_obj.original_cols])
  51. self.add_select_col_names(left_cols_obj.original_cols)
  52. for col_name, _ in feature_values.items():
  53. self.add_feature_value(col_name, feature_values[col_name])
  54. left_cols_dict = dict(left_cols_obj.left_cols)
  55. for col_name, _ in left_cols_dict.items():
  56. if left_cols_dict[col_name]:
  57. self.add_left_col_name(col_name)
  58. return self
  59. def set_header(self, header):
  60. self.header = header
  61. for idx, col_name in enumerate(self.header):
  62. self.col_name_maps[col_name] = idx
  63. def set_anonymous_header(self, anonymous_header):
  64. self.anonymous_header = anonymous_header
  65. if self.anonymous_header:
  66. for idx, col_name in enumerate(self.anonymous_header):
  67. self.anonymous_col_name_maps[col_name] = idx
  68. def set_last_left_col_indexes(self, left_cols):
  69. self.last_left_col_indexes = left_cols.copy()
  70. def set_select_all_cols(self):
  71. self.select_col_indexes = [i for i in range(len(self.header))]
  72. self.select_col_names = self.header
  73. # self.anonymous_select_col_names = self.anonymous_header
  74. def add_select_col_indexes(self, select_col_indexes):
  75. last_left_col_indexes = set(self.last_left_col_indexes)
  76. added_select_col_index = set(self.select_col_indexes)
  77. for idx in select_col_indexes:
  78. if idx >= len(self.header):
  79. LOGGER.warning("Adding an index out of header's bound")
  80. continue
  81. if idx not in last_left_col_indexes:
  82. continue
  83. if idx not in added_select_col_index:
  84. self.select_col_indexes.append(idx)
  85. self.select_col_names.append(self.header[idx])
  86. # self.anonymous_select_col_names.append(self.anonymous_header[idx])
  87. added_select_col_index.add(idx)
  88. def add_select_col_names(self, select_col_names):
  89. last_left_col_indexes = set(self.last_left_col_indexes)
  90. added_select_col_indexes = set(self.select_col_indexes)
  91. for col_name in select_col_names:
  92. idx = self.col_name_maps.get(col_name)
  93. if idx is None:
  94. LOGGER.warning("Adding a col_name that does not exist in header")
  95. continue
  96. if idx not in last_left_col_indexes:
  97. continue
  98. if idx not in added_select_col_indexes:
  99. self.select_col_indexes.append(idx)
  100. self.select_col_names.append(col_name)
  101. # self.anonymous_select_col_names.append(self.anonymous_header[idx])
  102. added_select_col_indexes.add(idx)
  103. def add_left_col_name(self, left_col_name):
  104. idx = self.col_name_maps.get(left_col_name)
  105. if idx is None:
  106. LOGGER.warning("Adding a col_name that does not exist in header")
  107. return
  108. if idx not in self.left_col_indexes_added:
  109. self.left_col_indexes.append(idx)
  110. self.left_col_indexes_added.add(idx)
  111. self.left_col_names.append(left_col_name)
  112. # self.anonymous_left_col_names.append(self.anonymous_header[idx])
  113. def add_feature_value(self, col_name, feature_value):
  114. self.feature_values[col_name] = feature_value
  115. @property
  116. def all_left_col_indexes(self):
  117. result = []
  118. select_col_indexes = set(self.select_col_indexes)
  119. left_col_indexes = set(self.left_col_indexes)
  120. for idx in self.last_left_col_indexes:
  121. if (idx not in select_col_indexes) or (idx in left_col_indexes):
  122. result.append(idx)
  123. # elif idx in left_col_indexes:
  124. # result.append(idx)
  125. return result
  126. @property
  127. def all_left_col_names(self):
  128. return [self.header[x] for x in self.all_left_col_indexes]
  129. @property
  130. def all_left_anonymous_col_names(self):
  131. return [self.anonymous_header[x] for x in self.all_left_col_indexes]
  132. @property
  133. def left_col_dicts(self):
  134. return {x: True for x in self.all_left_col_names}
  135. @property
  136. def last_left_col_names(self):
  137. return [self.header[x] for x in self.last_left_col_indexes]
  138. class CompletedSelectionResults(object):
  139. def __init__(self):
  140. self.header = []
  141. self.anonymous_header = []
  142. self.col_name_maps = {}
  143. self.__select_col_names = None
  144. self.filter_results = []
  145. self.__guest_pass_filter_nums = {}
  146. self.__host_pass_filter_nums_list = []
  147. self.all_left_col_indexes = []
  148. def set_header(self, header):
  149. self.header = header
  150. for idx, col_name in enumerate(self.header):
  151. self.col_name_maps[col_name] = idx
  152. def set_anonymous_header(self, anonymous_header):
  153. self.anonymous_header = anonymous_header
  154. def set_select_col_names(self, select_col_names):
  155. if self.__select_col_names is None:
  156. self.__select_col_names = select_col_names
  157. def get_select_col_names(self):
  158. return self.__select_col_names
  159. def set_all_left_col_indexes(self, left_indexes):
  160. self.all_left_col_indexes = left_indexes.copy()
  161. @property
  162. def all_left_col_names(self):
  163. return [self.header[x] for x in self.all_left_col_indexes]
  164. @property
  165. def all_left_anonymous_col_names(self):
  166. return [self.anonymous_header[x] for x in self.all_left_col_indexes]
  167. def add_filter_results(self, filter_name, select_properties: SelectionProperties, host_select_properties=None):
  168. # self.all_left_col_indexes = select_properties.all_left_col_indexes.copy()
  169. self.set_all_left_col_indexes(select_properties.all_left_col_indexes)
  170. if filter_name == 'conclusion':
  171. return
  172. if host_select_properties is None:
  173. host_select_properties = []
  174. host_feature_values = []
  175. host_left_cols = []
  176. for idx, host_result in enumerate(host_select_properties):
  177. host_all_left_col_names = set(host_result.all_left_col_names)
  178. if idx >= len(self.__host_pass_filter_nums_list):
  179. _host_pass_filter_nums = {}
  180. self.__host_pass_filter_nums_list.append(_host_pass_filter_nums)
  181. else:
  182. _host_pass_filter_nums = self.__host_pass_filter_nums_list[idx]
  183. host_last_left_col_names = host_result.last_left_col_names
  184. for col_name in host_last_left_col_names:
  185. _host_pass_filter_nums.setdefault(col_name, 0)
  186. if col_name in host_all_left_col_names:
  187. _host_pass_filter_nums[col_name] += 1
  188. feature_value_pb = feature_selection_param_pb2.FeatureValue(feature_values=host_result.feature_values)
  189. host_feature_values.append(feature_value_pb)
  190. left_col_pb = feature_selection_param_pb2.LeftCols(original_cols=host_last_left_col_names,
  191. left_cols=host_result.left_col_dicts)
  192. host_left_cols.append(left_col_pb)
  193. # for col_name in select_properties.all_left_col_names:
  194. self_all_left_col_names = set(select_properties.all_left_col_names)
  195. self_last_left_col_names = select_properties.last_left_col_names
  196. for col_name in self_last_left_col_names:
  197. self.__guest_pass_filter_nums.setdefault(col_name, 0)
  198. if col_name in self_all_left_col_names:
  199. self.__guest_pass_filter_nums[col_name] += 1
  200. left_cols_pb = feature_selection_param_pb2.LeftCols(original_cols=self_last_left_col_names,
  201. left_cols=select_properties.left_col_dicts)
  202. this_filter_result = {
  203. 'feature_values': select_properties.feature_values,
  204. 'host_feature_values': host_feature_values,
  205. 'left_cols': left_cols_pb,
  206. 'host_left_cols': host_left_cols,
  207. 'filter_name': filter_name
  208. }
  209. this_filter_result = feature_selection_param_pb2.FeatureSelectionFilterParam(**this_filter_result)
  210. self.filter_results.append(this_filter_result)
  211. def get_sorted_col_names(self):
  212. result = sorted(self.__guest_pass_filter_nums.items(), key=operator.itemgetter(1), reverse=True)
  213. return [x for x, _ in result]
  214. def get_host_sorted_col_names(self):
  215. result = []
  216. for pass_name_dict in self.__host_pass_filter_nums_list:
  217. sorted_list = sorted(pass_name_dict.items(), key=operator.itemgetter(1), reverse=True)
  218. result.append([x for x, _ in sorted_list])
  219. return result