standard_scale.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #
  2. # Copyright 2019 The FATE Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import copy
  17. import functools
  18. import numpy as np
  19. from federatedml.protobuf.generated.feature_scale_meta_pb2 import ScaleMeta
  20. from federatedml.protobuf.generated.feature_scale_param_pb2 import ScaleParam
  21. from federatedml.protobuf.generated.feature_scale_param_pb2 import ColumnScaleParam
  22. from federatedml.feature.feature_scale.base_scale import BaseScale
  23. from federatedml.statistic.statics import MultivariateStatisticalSummary
  24. from federatedml.util import LOGGER
  25. class StandardScale(BaseScale):
  26. """
  27. Standardize features by removing the mean and scaling to unit variance. The standard score of a sample x is calculated as:
  28. z = (x - u) / s, where u is the mean of the training samples, and s is the standard deviation of the training samples
  29. """
  30. def __init__(self, params):
  31. super().__init__(params)
  32. self.with_mean = params.with_mean
  33. self.with_std = params.with_std
  34. self.mean = None
  35. self.std = None
  36. def set_param(self, mean, std):
  37. self.mean = mean
  38. self.std = std
  39. @staticmethod
  40. def __scale_with_column_range(data, column_upper, column_lower, mean, std, process_cols_list):
  41. features = np.array(data.features, dtype=float)
  42. for i in process_cols_list:
  43. value = data.features[i]
  44. if value > column_upper[i]:
  45. value = column_upper[i]
  46. elif value < column_lower[i]:
  47. value = column_lower[i]
  48. features[i] = (value - mean[i]) / std[i]
  49. _data = copy.deepcopy(data)
  50. _data.features = features
  51. return _data
  52. @staticmethod
  53. def __scale(data, mean, std, process_cols_list):
  54. features = np.array(data.features, dtype=float)
  55. for i in process_cols_list:
  56. features[i] = (data.features[i] - mean[i]) / std[i]
  57. _data = copy.deepcopy(data)
  58. _data.features = features
  59. return _data
  60. def fit(self, data):
  61. """
  62. Apply standard scale for input data
  63. Parameters
  64. ----------
  65. data: data_instance, input data
  66. Returns
  67. ----------
  68. data:data_instance, data after scale
  69. mean: list, each column mean value
  70. std: list, each column standard deviation
  71. """
  72. self.column_min_value, self.column_max_value = self._get_min_max_value(data)
  73. self.scale_column_idx = self._get_scale_column_idx(data)
  74. self.header = self._get_header(data)
  75. self.data_shape = self._get_data_shape(data)
  76. # fit column value if larger than parameter upper or less than parameter lower
  77. data = self.fit_feature_range(data)
  78. if not self.with_mean and not self.with_std:
  79. self.mean = [0 for _ in range(self.data_shape)]
  80. self.std = [1 for _ in range(self.data_shape)]
  81. else:
  82. self.summary_obj = MultivariateStatisticalSummary(data, -1)
  83. if self.with_mean:
  84. self.mean = self.summary_obj.get_mean()
  85. self.mean = [self.mean[key] for key in self.header]
  86. else:
  87. self.mean = [0 for _ in range(self.data_shape)]
  88. if self.with_std:
  89. self.std = self.summary_obj.get_std_variance()
  90. self.std = [self.std[key] for key in self.header]
  91. for i, value in enumerate(self.std):
  92. if np.abs(value - 0) < 1e-6:
  93. self.std[i] = 1
  94. else:
  95. self.std = [1 for _ in range(self.data_shape)]
  96. f = functools.partial(self.__scale, mean=self.mean, std=self.std, process_cols_list=self.scale_column_idx)
  97. fit_data = data.mapValues(f)
  98. return fit_data
  99. def transform(self, data):
  100. """
  101. Transform input data using standard scale with fit results
  102. Parameters
  103. ----------
  104. data: data_instance, input data
  105. Returns
  106. ----------
  107. transform_data:data_instance, data after transform
  108. """
  109. f = functools.partial(self.__scale_with_column_range, column_upper=self.column_max_value,
  110. column_lower=self.column_min_value,
  111. mean=self.mean, std=self.std, process_cols_list=self.scale_column_idx)
  112. transform_data = data.mapValues(f)
  113. return transform_data
  114. def _get_meta(self, need_run):
  115. if self.header:
  116. scale_column = [self.header[i] for i in self.scale_column_idx]
  117. else:
  118. scale_column = ["_".join(["col", str(i)]) for i in self.scale_column_idx]
  119. if not self.data_shape:
  120. self.data_shape = -1
  121. meta_proto_obj = ScaleMeta(method="standard_scale",
  122. area="null",
  123. scale_column=scale_column,
  124. feat_upper=self._get_upper(self.data_shape),
  125. feat_lower=self._get_lower(self.data_shape),
  126. with_mean=self.with_mean,
  127. with_std=self.with_std,
  128. need_run=need_run
  129. )
  130. return meta_proto_obj
  131. def _get_param(self):
  132. column_scale_param_dict = {}
  133. scale_column_idx_set = set(self.scale_column_idx)
  134. if self.header:
  135. for i, header in enumerate(self.header):
  136. if i in scale_column_idx_set:
  137. param_obj = ColumnScaleParam(column_upper=self.column_max_value[i],
  138. column_lower=self.column_min_value[i],
  139. mean=self.mean[i],
  140. std=self.std[i])
  141. column_scale_param_dict[header] = param_obj
  142. param_proto_obj = ScaleParam(col_scale_param=column_scale_param_dict,
  143. header=self.header)
  144. return param_proto_obj