#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import functools

import numpy as np

from federatedml.protobuf.generated.feature_scale_meta_pb2 import ScaleMeta
from federatedml.protobuf.generated.feature_scale_param_pb2 import ScaleParam
from federatedml.protobuf.generated.feature_scale_param_pb2 import ColumnScaleParam
from federatedml.feature.feature_scale.base_scale import BaseScale
from federatedml.statistic.statics import MultivariateStatisticalSummary
from federatedml.util import LOGGER


class StandardScale(BaseScale):
    """
    Standardize features by removing the mean and scaling to unit variance.

    The standard score of a sample x is calculated as:
        z = (x - u) / s
    where u is the mean of the training samples and s is the standard deviation
    of the training samples.
    """

    def __init__(self, params):
        super().__init__(params)
        self.with_mean = params.with_mean
        self.with_std = params.with_std
        self.mean = None
        self.std = None

    def set_param(self, mean, std):
        self.mean = mean
        self.std = std

    @staticmethod
    def __scale_with_column_range(data, column_upper, column_lower, mean, std, process_cols_list):
        features = np.array(data.features, dtype=float)
        for i in process_cols_list:
            # Clip the raw value to the column range recorded during fit before
            # standardizing, so out-of-range values do not produce extreme scores.
            value = data.features[i]
            if value > column_upper[i]:
                value = column_upper[i]
            elif value < column_lower[i]:
                value = column_lower[i]
            features[i] = (value - mean[i]) / std[i]

        _data = copy.deepcopy(data)
        _data.features = features
        return _data
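
    # Numeric illustration of the clip-then-scale step (hypothetical values): with
    # column_upper = 10.0, mean = 3.0 and std = 2.0, a raw value of 15.0 is first
    # clipped to 10.0 and then standardized to (10.0 - 3.0) / 2.0 = 3.5.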

    @staticmethod
    def __scale(data, mean, std, process_cols_list):
        # Standardize only the selected columns; all other features pass through unchanged.
        features = np.array(data.features, dtype=float)
        for i in process_cols_list:
            features[i] = (data.features[i] - mean[i]) / std[i]

        _data = copy.deepcopy(data)
        _data.features = features
        return _data

    def fit(self, data):
        """
        Apply standard scale to the input data.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        fit_data: data_instance, data after scaling. The fitted per-column mean
            and standard deviation are stored on the instance as self.mean and
            self.std.
        """
        self.column_min_value, self.column_max_value = self._get_min_max_value(data)
        self.scale_column_idx = self._get_scale_column_idx(data)
        self.header = self._get_header(data)
        self.data_shape = self._get_data_shape(data)

        # Clip feature values that fall outside the configured upper/lower bounds.
        data = self.fit_feature_range(data)

        if not self.with_mean and not self.with_std:
            # Neither centering nor scaling requested: use identity parameters.
            self.mean = [0 for _ in range(self.data_shape)]
            self.std = [1 for _ in range(self.data_shape)]
        else:
            self.summary_obj = MultivariateStatisticalSummary(data, -1)

            if self.with_mean:
                self.mean = self.summary_obj.get_mean()
                self.mean = [self.mean[key] for key in self.header]
            else:
                self.mean = [0 for _ in range(self.data_shape)]

            if self.with_std:
                self.std = self.summary_obj.get_std_variance()
                self.std = [self.std[key] for key in self.header]

                # Guard against division by zero for (near-)constant columns.
                for i, value in enumerate(self.std):
                    if np.abs(value - 0) < 1e-6:
                        self.std[i] = 1
            else:
                self.std = [1 for _ in range(self.data_shape)]

        f = functools.partial(self.__scale,
                              mean=self.mean,
                              std=self.std,
                              process_cols_list=self.scale_column_idx)
        fit_data = data.mapValues(f)

        return fit_data
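
    # Usage sketch (illustrative only, not the actual component wiring): within a
    # FATE job, `data` is expected to be a distributed table of Instance objects,
    # so a fit call might look roughly like
    #
    #     scaler = StandardScale(scale_param_obj)   # scale_param_obj: feature-scale parameter object (assumed name)
    #     scaled_table = scaler.fit(data_instances)
    #
    # after which scaler.mean and scaler.std hold the per-column statistics.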

    def transform(self, data):
        """
        Transform the input data with the fitted standard-scale parameters.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        transform_data: data_instance, data after transform
        """
        f = functools.partial(self.__scale_with_column_range,
                              column_upper=self.column_max_value,
                              column_lower=self.column_min_value,
                              mean=self.mean,
                              std=self.std,
                              process_cols_list=self.scale_column_idx)
        transform_data = data.mapValues(f)

        return transform_data
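
    # Usage sketch (illustrative, continuing the names assumed above): transform()
    # reuses the column range and the mean/std recorded by fit(), clipping each
    # scaled column's value into [column_min_value, column_max_value] before
    # standardizing, e.g.
    #
    #     scaled_test = scaler.transform(test_instances)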

    def _get_meta(self, need_run):
        # Build the ScaleMeta protobuf describing how the scaling was configured.
        if self.header:
            scale_column = [self.header[i] for i in self.scale_column_idx]
        else:
            scale_column = ["_".join(["col", str(i)]) for i in self.scale_column_idx]

        if not self.data_shape:
            self.data_shape = -1

        meta_proto_obj = ScaleMeta(method="standard_scale",
                                   area="null",
                                   scale_column=scale_column,
                                   feat_upper=self._get_upper(self.data_shape),
                                   feat_lower=self._get_lower(self.data_shape),
                                   with_mean=self.with_mean,
                                   with_std=self.with_std,
                                   need_run=need_run
                                   )
        return meta_proto_obj

    def _get_param(self):
        # Build the ScaleParam protobuf holding the fitted per-column statistics.
        column_scale_param_dict = {}
        scale_column_idx_set = set(self.scale_column_idx)

        if self.header:
            for i, header in enumerate(self.header):
                if i in scale_column_idx_set:
                    param_obj = ColumnScaleParam(column_upper=self.column_max_value[i],
                                                 column_lower=self.column_min_value[i],
                                                 mean=self.mean[i],
                                                 std=self.std[i])
                    column_scale_param_dict[header] = param_obj

        param_proto_obj = ScaleParam(col_scale_param=column_scale_param_dict,
                                     header=self.header)
        return param_proto_obj
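

if __name__ == "__main__":
    # Minimal illustrative sketch only; it is NOT part of the FATE component API.
    # It reproduces the per-column arithmetic used above with plain numpy, without
    # the DTable/Instance machinery, so the behaviour can be checked in isolation.
    demo = np.array([[1.0, 2.0],
                     [3.0, 2.0],
                     [5.0, 2.0]])
    col_mean = demo.mean(axis=0)            # [3.0, 2.0]
    col_std = demo.std(axis=0)              # [~1.633, 0.0]
    col_std[np.abs(col_std) < 1e-6] = 1     # zero-variance guard, as in fit()
    print((demo - col_mean) / col_std)      # column 0 standardized, column 1 all zeros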