min_max_scale.py

#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import functools

import numpy as np

from federatedml.protobuf.generated.feature_scale_meta_pb2 import ScaleMeta
from federatedml.protobuf.generated.feature_scale_param_pb2 import ScaleParam
from federatedml.protobuf.generated.feature_scale_param_pb2 import ColumnScaleParam
from federatedml.feature.feature_scale.base_scale import BaseScale


class MinMaxScale(BaseScale):
    """
    Transforms features by scaling each feature to a given range, e.g. between the minimum and maximum.
    The transformation is given by:

        X_scale = (X - X.min) / (X.max - X.min)

    where X.min is the minimum value of the feature and X.max is the maximum.
    A standalone numeric sketch of this transform appears at the bottom of this module.
    """

    def __init__(self, params):
        super().__init__(params)
        self.mode = params.mode
        self.column_range = None

    @staticmethod
    def __scale(data, max_value_list, min_value_list, scale_value_list, process_cols_list):
        """
        Scale operator for each column. The input data type is data_instance.
        """
        features = np.array(data.features, dtype=float)
        for i in process_cols_list:
            value = features[i]
            # Clip the value into the fitted [min, max] range before scaling
            if value > max_value_list[i]:
                value = max_value_list[i]
            elif value < min_value_list[i]:
                value = min_value_list[i]

            features[i] = (value - min_value_list[i]) / scale_value_list[i]

        _data = copy.deepcopy(data)
        _data.features = features
        return _data

    def fit(self, data):
        """
        Apply min-max scale to the input data.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        -------
        fit_data: data_instance, data after scale
        """
        self.column_min_value, self.column_max_value = self._get_min_max_value(data)
        self.scale_column_idx = self._get_scale_column_idx(data)
        self.header = self._get_header(data)

        self.column_range = []
        for i in range(len(self.column_max_value)):
            scale = self.column_max_value[i] - self.column_min_value[i]
            if scale < 0:
                raise ValueError("scale value should be larger than 0")
            elif np.abs(scale - 0) < 1e-6:
                # Avoid division by zero for constant columns
                scale = 1
            self.column_range.append(scale)

        f = functools.partial(MinMaxScale.__scale,
                              max_value_list=self.column_max_value,
                              min_value_list=self.column_min_value,
                              scale_value_list=self.column_range,
                              process_cols_list=self.scale_column_idx)
        fit_data = data.mapValues(f)

        return fit_data

    def transform(self, data):
        """
        Transform the input data using min-max scale with the fit results.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        -------
        transform_data: data_instance, data after transform
        """
        self.column_range = []
        for i in range(len(self.column_max_value)):
            scale = self.column_max_value[i] - self.column_min_value[i]
            if scale < 0:
                raise ValueError("scale value should be larger than 0")
            elif np.abs(scale - 0) < 1e-6:
                # Avoid division by zero for constant columns
                scale = 1
            self.column_range.append(scale)

        f = functools.partial(MinMaxScale.__scale,
                              max_value_list=self.column_max_value,
                              min_value_list=self.column_min_value,
                              scale_value_list=self.column_range,
                              process_cols_list=self.scale_column_idx)
        transform_data = data.mapValues(f)

        return transform_data

    def _get_meta(self, need_run):
        if self.header:
            scale_column = [self.header[i] for i in self.scale_column_idx]
        else:
            scale_column = ["_".join(["col", str(i)]) for i in self.scale_column_idx]

        if not self.data_shape:
            self.data_shape = -1

        meta_proto_obj = ScaleMeta(method="min_max_scale",
                                   mode=self.mode,
                                   area="null",
                                   scale_column=scale_column,
                                   feat_upper=self._get_upper(self.data_shape),
                                   feat_lower=self._get_lower(self.data_shape),
                                   need_run=need_run)
        return meta_proto_obj

    def _get_param(self):
        min_max_scale_param_dict = {}
        if self.header:
            scale_column_idx_set = set(self.scale_column_idx)
            for i, header in enumerate(self.header):
                if i in scale_column_idx_set:
                    param_obj = ColumnScaleParam(column_upper=self.column_max_value[i],
                                                 column_lower=self.column_min_value[i])
                    min_max_scale_param_dict[header] = param_obj

        param_proto_obj = ScaleParam(col_scale_param=min_max_scale_param_dict,
                                     header=self.header)
        return param_proto_obj
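

# ---------------------------------------------------------------------------
# Minimal standalone sketch of the clip-then-scale math used by
# MinMaxScale.__scale, for illustration only. It operates on plain numpy
# arrays with made-up min/max values rather than FATE data_instance objects,
# and is not part of the federatedml API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    raw = np.array([1.0, 5.0, 12.0])           # hypothetical feature values
    col_min, col_max = 2.0, 10.0                # hypothetical fitted min / max
    col_range = (col_max - col_min) or 1        # fall back to 1 when the range is 0
    clipped = np.clip(raw, col_min, col_max)    # out-of-range values are clipped first
    scaled = (clipped - col_min) / col_range
    print(scaled)                               # [0.    0.375 1.   ]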