scale_param.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. from federatedml.param.base_param import BaseParam
  19. from federatedml.util import consts, LOGGER
  20. class ScaleParam(BaseParam):
  21. """
  22. Define the feature scale parameters.
  23. Parameters
  24. ----------
  25. method : {"standard_scale", "min_max_scale"}
  26. like scale in sklearn, now it support "min_max_scale" and "standard_scale", and will support other scale method soon.
  27. Default standard_scale, which will do nothing for scale
  28. mode : {"normal", "cap"}
  29. for mode is "normal", the feat_upper and feat_lower is the normal value like "10" or "3.1"
  30. and for "cap", feat_upper and feature_lower will between 0 and 1, which means the percentile of the column. Default "normal"
  31. feat_upper : int or float or list of int or float
  32. the upper limit in the column.
  33. If use list, mode must be "normal", and list length should equal to the number of features to scale.
  34. If the scaled value is larger than feat_upper, it will be set to feat_upper
  35. feat_lower: int or float or list of int or float
  36. the lower limit in the column.
  37. If use list, mode must be "normal", and list length should equal to the number of features to scale.
  38. If the scaled value is less than feat_lower, it will be set to feat_lower
  39. scale_col_indexes: list
  40. the idx of column in scale_column_idx will be scaled, while the idx of column is not in, it will not be scaled.
  41. scale_names : list of string
  42. Specify which columns need to scaled. Each element in the list represent for a column name in header. default: []
  43. with_mean : bool
  44. used for "standard_scale". Default True.
  45. with_std : bool
  46. used for "standard_scale". Default True.
  47. The standard scale of column x is calculated as : $z = (x - u) / s$ , where $u$ is the mean of the column and $s$ is the standard deviation of the column.
  48. if with_mean is False, $u$ will be 0, and if with_std is False, $s$ will be 1.
  49. need_run : bool
  50. Indicate if this module needed to be run, default True
  51. """
  52. def __init__(
  53. self,
  54. method="standard_scale",
  55. mode="normal",
  56. scale_col_indexes=-1,
  57. scale_names=None,
  58. feat_upper=None,
  59. feat_lower=None,
  60. with_mean=True,
  61. with_std=True,
  62. need_run=True):
  63. super().__init__()
  64. self.scale_names = [] if scale_names is None else scale_names
  65. self.method = method
  66. self.mode = mode
  67. self.feat_upper = feat_upper
  68. # LOGGER.debug("self.feat_upper:{}, type:{}".format(self.feat_upper, type(self.feat_upper)))
  69. self.feat_lower = feat_lower
  70. self.scale_col_indexes = scale_col_indexes
  71. self.with_mean = with_mean
  72. self.with_std = with_std
  73. self.need_run = need_run
  74. def check(self):
  75. if self.method is not None:
  76. descr = "scale param's method"
  77. self.method = self.check_and_change_lower(self.method,
  78. [consts.MINMAXSCALE, consts.STANDARDSCALE],
  79. descr)
  80. descr = "scale param's mode"
  81. self.mode = self.check_and_change_lower(self.mode,
  82. [consts.NORMAL, consts.CAP],
  83. descr)
  84. # LOGGER.debug("self.feat_upper:{}, type:{}".format(self.feat_upper, type(self.feat_upper)))
  85. # if type(self.feat_upper).__name__ not in ["float", "int"]:
  86. # raise ValueError("scale param's feat_upper {} not supported, should be float or int".format(
  87. # self.feat_upper))
  88. if self.scale_col_indexes != -1 and not isinstance(self.scale_col_indexes, list):
  89. raise ValueError("scale_col_indexes is should be -1 or a list")
  90. if self.scale_names is None:
  91. self.scale_names = []
  92. if not isinstance(self.scale_names, list):
  93. raise ValueError("scale_names is should be a list of string")
  94. else:
  95. for e in self.scale_names:
  96. if not isinstance(e, str):
  97. raise ValueError("scale_names is should be a list of string")
  98. self.check_boolean(self.with_mean, "scale_param with_mean")
  99. self.check_boolean(self.with_std, "scale_param with_std")
  100. self.check_boolean(self.need_run, "scale_param need_run")
  101. LOGGER.debug("Finish scale parameter check!")
  102. return True