feature_imputation.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. #
  2. # Copyright 2019 The FATE Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import numpy as np
  17. from federatedml.model_base import ModelBase
  18. from federatedml.feature.imputer import Imputer
  19. from federatedml.protobuf.generated.feature_imputation_meta_pb2 import FeatureImputationMeta, FeatureImputerMeta
  20. from federatedml.protobuf.generated.feature_imputation_param_pb2 import FeatureImputationParam, FeatureImputerParam
  21. from federatedml.statistic.data_overview import get_header
  22. from federatedml.util import LOGGER
  23. from federatedml.util.io_check import assert_io_num_rows_equal
  24. class FeatureImputation(ModelBase):
  25. def __init__(self):
  26. super(FeatureImputation, self).__init__()
  27. self.summary_obj = None
  28. self.missing_impute_rate = None
  29. self.skip_cols = []
  30. self.cols_replace_method = None
  31. self.header = None
  32. from federatedml.param.feature_imputation_param import FeatureImputationParam
  33. self.model_param = FeatureImputationParam()
  34. self.model_param_name = 'FeatureImputationParam'
  35. self.model_meta_name = 'FeatureImputationMeta'
  36. def _init_model(self, model_param):
  37. self.missing_fill_method = model_param.missing_fill_method
  38. self.col_missing_fill_method = model_param.col_missing_fill_method
  39. self.default_value = model_param.default_value
  40. self.missing_impute = model_param.missing_impute
  41. def get_summary(self):
  42. missing_summary = dict()
  43. missing_summary["missing_value"] = list(self.missing_impute)
  44. missing_summary["missing_impute_value"] = dict(zip(self.header, self.default_value))
  45. missing_summary["missing_impute_rate"] = dict(zip(self.header, self.missing_impute_rate))
  46. missing_summary["skip_cols"] = self.skip_cols
  47. return missing_summary
  48. def load_model(self, model_dict):
  49. param_obj = list(model_dict.get('model').values())[0].get(self.model_param_name)
  50. meta_obj = list(model_dict.get('model').values())[0].get(self.model_meta_name)
  51. self.header = param_obj.header
  52. self.missing_fill, self.missing_fill_method, \
  53. self.missing_impute, self.default_value, self.skip_cols = load_feature_imputer_model(self.header,
  54. "Imputer",
  55. meta_obj.imputer_meta,
  56. param_obj.imputer_param)
  57. def save_model(self):
  58. meta_obj, param_obj = save_feature_imputer_model(missing_fill=True,
  59. missing_replace_method=self.missing_fill_method,
  60. cols_replace_method=self.cols_replace_method,
  61. missing_impute=self.missing_impute,
  62. missing_fill_value=self.default_value,
  63. missing_replace_rate=self.missing_impute_rate,
  64. header=self.header,
  65. skip_cols=self.skip_cols)
  66. return meta_obj, param_obj
  67. def export_model(self):
  68. missing_imputer_meta, missing_imputer_param = self.save_model()
  69. meta_obj = FeatureImputationMeta(need_run=self.need_run,
  70. imputer_meta=missing_imputer_meta)
  71. param_obj = FeatureImputationParam(header=self.header,
  72. imputer_param=missing_imputer_param)
  73. model_dict = {
  74. self.model_meta_name: meta_obj,
  75. self.model_param_name: param_obj
  76. }
  77. return model_dict
  78. @assert_io_num_rows_equal
  79. def fit(self, data):
  80. LOGGER.info(f"Enter Feature Imputation fit")
  81. imputer_processor = Imputer(self.missing_impute)
  82. self.header = get_header(data)
  83. if self.col_missing_fill_method:
  84. for k in self.col_missing_fill_method.keys():
  85. if k not in self.header:
  86. raise ValueError(f"{k} not found in data header. Please check col_missing_fill_method keys.")
  87. imputed_data, self.default_value = imputer_processor.fit(data,
  88. replace_method=self.missing_fill_method,
  89. replace_value=self.default_value,
  90. col_replace_method=self.col_missing_fill_method)
  91. if self.missing_impute is None:
  92. self.missing_impute = imputer_processor.get_missing_value_list()
  93. self.missing_impute_rate = imputer_processor.get_impute_rate("fit")
  94. # self.header = get_header(imputed_data)
  95. self.cols_replace_method = imputer_processor.cols_replace_method
  96. self.skip_cols = imputer_processor.get_skip_cols()
  97. self.set_summary(self.get_summary())
  98. return imputed_data
  99. @assert_io_num_rows_equal
  100. def transform(self, data):
  101. LOGGER.info(f"Enter Feature Imputation transform")
  102. imputer_processor = Imputer(self.missing_impute)
  103. imputed_data = imputer_processor.transform(data,
  104. transform_value=self.default_value,
  105. skip_cols=self.skip_cols)
  106. if self.missing_impute is None:
  107. self.missing_impute = imputer_processor.get_missing_value_list()
  108. self.missing_impute_rate = imputer_processor.get_impute_rate("transform")
  109. return imputed_data
  110. def save_feature_imputer_model(missing_fill=False,
  111. missing_replace_method=None,
  112. cols_replace_method=None,
  113. missing_impute=None,
  114. missing_fill_value=None,
  115. missing_replace_rate=None,
  116. header=None,
  117. skip_cols=None):
  118. model_meta = FeatureImputerMeta()
  119. model_param = FeatureImputerParam()
  120. model_meta.is_imputer = missing_fill
  121. if missing_fill:
  122. if missing_replace_method and cols_replace_method is None:
  123. model_meta.strategy = missing_replace_method
  124. if missing_impute is not None:
  125. model_meta.missing_value.extend(map(str, missing_impute))
  126. model_meta.missing_value_type.extend([type(v).__name__ for v in missing_impute])
  127. if missing_fill_value is not None and header is not None:
  128. fill_header = [col for col in header if col not in skip_cols]
  129. feature_value_dict = dict(zip(fill_header, map(str, missing_fill_value)))
  130. model_param.missing_replace_value.update(feature_value_dict)
  131. missing_fill_value_type = [type(v).__name__ for v in missing_fill_value]
  132. feature_value_type_dict = dict(zip(fill_header, missing_fill_value_type))
  133. model_param.missing_replace_value_type.update(feature_value_type_dict)
  134. if missing_replace_rate is not None:
  135. missing_replace_rate_dict = dict(zip(header, missing_replace_rate))
  136. model_param.missing_value_ratio.update(missing_replace_rate_dict)
  137. if cols_replace_method is not None:
  138. cols_replace_method = {k: str(v) for k, v in cols_replace_method.items()}
  139. # model_param.cols_replace_method.update(cols_replace_method)
  140. else:
  141. filled_cols = set(header) - set(skip_cols)
  142. cols_replace_method = {k: str(missing_replace_method) for k in filled_cols}
  143. model_param.cols_replace_method.update(cols_replace_method)
  144. model_param.skip_cols.extend(skip_cols)
  145. return model_meta, model_param
  146. def load_value_to_type(value, value_type):
  147. if value is None:
  148. loaded_value = None
  149. elif value_type in ["int", "int64", "long", "float", "float64", "double"]:
  150. loaded_value = getattr(np, value_type)(value)
  151. elif value_type in ["str", "_str"]:
  152. loaded_value = str(value)
  153. elif value_type.lower() in ["none", "nonetype"]:
  154. loaded_value = None
  155. else:
  156. raise ValueError(f"unknown value type: {value_type}")
  157. return loaded_value
  158. def load_feature_imputer_model(header=None,
  159. model_name="Imputer",
  160. model_meta=None,
  161. model_param=None):
  162. missing_fill = model_meta.is_imputer
  163. missing_replace_method = model_meta.strategy
  164. missing_value = list(model_meta.missing_value)
  165. missing_value_type = list(model_meta.missing_value_type)
  166. missing_fill_value = model_param.missing_replace_value
  167. missing_fill_value_type = model_param.missing_replace_value_type
  168. skip_cols = list(model_param.skip_cols)
  169. if missing_fill:
  170. if not missing_replace_method:
  171. missing_replace_method = None
  172. if not missing_value:
  173. missing_value = None
  174. else:
  175. missing_value = [load_value_to_type(missing_value[i],
  176. missing_value_type[i]) for i in range(len(missing_value))]
  177. if missing_fill_value:
  178. missing_fill_value = [load_value_to_type(missing_fill_value.get(head),
  179. missing_fill_value_type.get(head)) for head in header]
  180. else:
  181. missing_fill_value = None
  182. else:
  183. missing_replace_method = None
  184. missing_value = None
  185. missing_fill_value = None
  186. return missing_fill, missing_replace_method, missing_value, missing_fill_value, skip_cols