123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 |
- #
- # Copyright 2019 The FATE Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import functools
- import numpy as np
- from federatedml.feature.binning.base_binning import BaseBinning
- from federatedml.framework import weights
- from fate_arch.session import computing_session as session
- from federatedml.param.feature_binning_param import HomoFeatureBinningParam
- from federatedml.statistic.data_statistics import MultivariateStatisticalSummary
- from federatedml.transfer_variable.transfer_class.homo_binning_transfer_variable import HomoBinningTransferVariable
- from federatedml.framework.homo.aggregator.secure_aggregator import SecureAggregatorClient, SecureAggregatorServer
- from federatedml.util import consts
class SplitPointNode(object):
    """A candidate split point plus the interval it is allowed to move within.

    Attributes set by the constructor:
        value: current candidate split value.
        min_value / max_value: lower and upper bound of the search interval.
        aim_rank: target rank for this split point (may be None until set).
        allow_error_rank: rank tolerance carried along to derived nodes.
        last_rank: stored as given; not read by any method of this class.
        fixed: starts False and becomes True once the search interval has
            become too narrow to bisect any further.
    """

    def __init__(self, value, min_value, max_value, aim_rank=None, allow_error_rank=0, last_rank=-1):
        self.value, self.min_value, self.max_value = value, min_value, max_value
        self.aim_rank = aim_rank
        self.allow_error_rank = allow_error_rank
        self.last_rank = last_rank
        self.fixed = False

    def set_aim_rank(self, rank):
        """Record the rank this split point should converge to."""
        self.aim_rank = rank

    def _pin_in_place(self, step):
        """Shift the current value by ``step``, mark the node fixed, return it."""
        self.value += step
        self.fixed = True
        return self

    def create_right_new(self):
        """Bisect towards ``max_value``.

        Returns a new node located at the midpoint of ``[value, max_value]``
        whose lower bound is the current value. If that midpoint is within
        ``consts.FLOAT_ZERO * 0.9`` of the current value, no new node is
        created: this node is nudged up by that amount, marked fixed and
        returned instead.
        """
        tolerance = consts.FLOAT_ZERO * 0.9
        midpoint = (self.value + self.max_value) / 2
        if np.fabs(midpoint - self.value) <= tolerance:
            return self._pin_in_place(tolerance)
        # Note: last_rank is not forwarded, so the new node gets the default.
        return SplitPointNode(midpoint, self.value, self.max_value,
                              self.aim_rank, self.allow_error_rank)

    def create_left_new(self):
        """Bisect towards ``min_value``.

        Returns a new node located at the midpoint of ``[min_value, value]``
        whose upper bound is the current value. If that midpoint is within
        ``consts.FLOAT_ZERO * 0.9`` of the current value, this node is marked
        fixed and returned instead.
        """
        tolerance = consts.FLOAT_ZERO * 0.9
        midpoint = (self.value + self.min_value) / 2
        if np.fabs(midpoint - self.value) <= tolerance:
            # NOTE(review): the value is nudged *upwards* here as well, exactly
            # as in create_right_new -- confirm this is intended.
            return self._pin_in_place(tolerance)
        return SplitPointNode(midpoint, self.min_value, self.value,
                              self.aim_rank, self.allow_error_rank)
class RankArray(object):
    """Per-split-point ranks with bookkeeping of which entries are settled.

    An entry counts as "fixed" when its rank differs from the corresponding
    entry of ``last_rank_array`` by less than ``error_rank``. Fixed entries
    are left untouched by ``+`` and ``+=``.
    """

    def __init__(self, rank_array, error_rank, last_rank_array=None):
        self.rank_array = rank_array
        self.last_rank_array = last_rank_array
        self.error_rank = error_rank
        self.all_fix = False
        # Nothing is fixed until a comparison against previous ranks says so.
        self.fixed_array = np.zeros(len(self.rank_array), dtype=bool)
        self._compare()

    def _compare(self):
        """Refresh ``fixed_array`` / ``all_fix`` from the previous ranks.

        Does nothing when no previous ranks are available. ``all_fix`` is
        only ever switched on here, never back off.
        """
        if self.last_rank_array is not None:
            self.fixed_array = abs(self.rank_array - self.last_rank_array) < self.error_rank
            assert isinstance(self.fixed_array, np.ndarray)
            if self.fixed_array.all():
                self.all_fix = True

    def __iadd__(self, other: 'RankArray'):
        """Add ``other``'s ranks in place to every entry that is not fixed."""
        for position, pinned in enumerate(self.fixed_array):
            if pinned:
                continue
            self.rank_array[position] += other.rank_array[position]
        self._compare()
        return self

    def __add__(self, other: 'RankArray'):
        """Return a new RankArray; fixed entries keep this object's rank."""
        combined = [
            self.rank_array[position] if pinned
            else self.rank_array[position] + other.rank_array[position]
            for position, pinned in enumerate(self.fixed_array)
        ]
        return RankArray(np.array(combined), self.error_rank, self.last_rank_array)
class Server(BaseBinning):
    """Server-side counterpart of the homo binning exchange.

    Every method pairs with the same-named method of ``Client``: the server
    collects what the clients sent under a ``(suffix, tag)`` key, combines
    it and sends the combined result back.
    """

    def __init__(self, params=None, abnormal_list=None):
        super().__init__(params, abnormal_list)
        self.aggregator: SecureAggregatorServer = None
        self.transfer_variable = HomoBinningTransferVariable()
        self.suffix = None

    def set_suffix(self, suffix):
        """Set the suffix used to tag every message of the current round."""
        self.suffix = suffix

    def set_transfer_variable(self, variable):
        """Replace the transfer variable used for the min/max exchange."""
        self.transfer_variable = variable

    def set_aggregator(self, aggregator):
        """Inject the aggregator used for counts and ranks."""
        self.aggregator = aggregator

    def _aggregate_and_broadcast(self, tag):
        """Aggregate the client models sent under ``tag`` and broadcast the result.

        Returns whatever ``aggregator.aggregate_model`` produced.
        """
        aggregated = self.aggregator.aggregate_model(suffix=(self.suffix, tag))
        self.aggregator.broadcast_model(aggregated, suffix=(self.suffix, tag))
        return aggregated

    def get_total_count(self):
        """Aggregate and broadcast the clients' instance counts; return the aggregate."""
        return self._aggregate_and_broadcast('total_count')

    def get_missing_count(self):
        """Aggregate and broadcast the clients' missing-value counts; return the aggregate."""
        return self._aggregate_and_broadcast('missing_count')

    def get_min_max(self):
        """Combine the clients' per-column extrema and send them back.

        Each client contributes a ``(max, min)`` pair; the element-wise
        maximum of the maxima and minimum of the minima are remoted to the
        clients as ``(max_values, min_values)``.

        Returns:
            ``(min_values, max_values)`` -- note the order is the reverse of
            what is sent to the clients.
        """
        received = self.transfer_variable.local_static_values.get(suffix=(self.suffix, "min-max"))
        upper_bounds, lower_bounds = [], []
        for client_max, client_min in received:
            upper_bounds.append(client_max)
            lower_bounds.append(client_min)
        max_values = np.max(upper_bounds, axis=0)
        min_values = np.min(lower_bounds, axis=0)
        self.transfer_variable.global_static_values.remote(
            (max_values, min_values), suffix=(self.suffix, "min-max"))
        return min_values, max_values

    def query_values(self):
        """Aggregate and broadcast the clients' local ranks. Returns nothing."""
        self._aggregate_and_broadcast('rank')
class Client(BaseBinning):
    """Client-side counterpart of the homo binning exchange.

    Each method sends a local statistic tagged with ``(suffix, tag)`` and
    then reads back the combined value produced by the server.
    """

    def __init__(self, params: HomoFeatureBinningParam = None, abnormal_list=None):
        super().__init__(params, abnormal_list)
        self.aggregator: SecureAggregatorClient = None
        self.transfer_variable = HomoBinningTransferVariable()
        # Global extrema, cached by get_min_max after the first exchange.
        self.max_values, self.min_values = None, None
        self.suffix = None
        self.total_count = 0

    def set_suffix(self, suffix):
        """Set the suffix used to tag every message of the current round."""
        self.suffix = suffix

    def set_transfer_variable(self, variable):
        """Replace the transfer variable used for the min/max exchange."""
        self.transfer_variable = variable

    def set_aggregator(self, aggregator):
        """Inject the aggregator used for counts and ranks."""
        self.aggregator = aggregator

    def get_total_count(self, data_inst):
        """Send the local instance count and return the aggregated count.

        Args:
            data_inst: table whose ``count()`` gives the local number of rows.

        Returns:
            The ``unboxed`` value of the aggregated model sent by the server.
        """
        count = data_inst.count()
        count_weight = weights.NumericWeights(count)
        self.aggregator.send_model(count_weight, suffix=(self.suffix, 'total_count'))
        total_count = self.aggregator.get_aggregated_model(suffix=(self.suffix, 'total_count')).unboxed
        return total_count

    def get_missing_count(self, summary_table):
        """Send per-key local missing counts and return the aggregated counts.

        Args:
            summary_table: table whose values expose a ``missing_count``
                attribute.

        Returns:
            The ``unboxed`` value of the aggregated model sent by the server.
        """
        missing_table = summary_table.mapValues(lambda x: x.missing_count)
        missing_value_counts = dict(missing_table.collect())
        missing_weight = weights.DictWeights(missing_value_counts)
        self.aggregator.send_model(missing_weight, suffix=(self.suffix, 'missing_count'))
        missing_counts = self.aggregator.get_aggregated_model(suffix=(self.suffix, 'missing_count')).unboxed
        return missing_counts

    def get_min_max(self, data_inst):
        """Exchange local extrema with the server and return the global ones.

        The local max/min of every column in ``bin_inner_param.bin_names`` is
        sent as a ``(max_list, min_list)`` pair; the server answers with the
        combined ``(max_values, min_values)``. The answer is cached, so later
        calls return it without another exchange.

        Args:
            data_inst: table the local statistics are computed from.

        Returns:
            ``(max_values, min_values)``: two sequences indexed by column
            position, in the order of ``bin_inner_param.bin_names``.
        """
        # Compare against None explicitly: the cached values are array-like,
        # and truth-testing a multi-element numpy array raises ValueError
        # (while a single-element array holding 0 would be falsy).
        if self.max_values is not None and self.min_values is not None:
            return self.max_values, self.min_values

        statistic_obj = MultivariateStatisticalSummary(data_inst,
                                                       cols_index=self.bin_inner_param.bin_indexes,
                                                       abnormal_list=self.abnormal_list,
                                                       error=self.params.error)
        max_values = statistic_obj.get_max()
        min_values = statistic_obj.get_min()
        # The statistics are keyed by column name; flatten them in bin_names order.
        max_list = [max_values[x] for x in self.bin_inner_param.bin_names]
        min_list = [min_values[x] for x in self.bin_inner_param.bin_names]
        local_min_max_values = (max_list, min_list)
        self.transfer_variable.local_static_values.remote(local_min_max_values,
                                                          suffix=(self.suffix, "min-max"))
        self.max_values, self.min_values = self.transfer_variable.global_static_values.get(
            idx=0, suffix=(self.suffix, "min-max"))
        return self.max_values, self.min_values

    def init_query_points(self, partitions, split_num, error_rank=1, need_first=True):
        """Build the initial, evenly spaced query points for every column.

        Requires ``self.max_values`` / ``self.min_values`` to be populated
        (see ``get_min_max``).

        Args:
            partitions: partition count of the resulting table.
            split_num: number of evenly spaced points between min and max
                (both ends included).
            error_rank: ``allow_error_rank`` given to every SplitPointNode.
            need_first: when False, the first point (the minimum) is dropped.

        Returns:
            A table mapping column name to a list of SplitPointNode.
        """
        query_points = []
        for idx, col_name in enumerate(self.bin_inner_param.bin_names):
            max_value = self.max_values[idx]
            min_value = self.min_values[idx]
            sps = np.linspace(min_value, max_value, split_num)
            if not need_first:
                sps = sps[1:]
            split_point_array = [SplitPointNode(point, min_value, max_value, allow_error_rank=error_rank)
                                 for point in sps]
            query_points.append((col_name, split_point_array))
        query_points_table = session.parallelize(query_points,
                                                 include_key=True,
                                                 partition=partitions)
        return query_points_table

    def query_values(self, summary_table, query_points):
        """Rank the query points locally, aggregate, and return global ranks.

        Args:
            summary_table: table of per-column summaries; joined by key with
                ``query_points``.
            query_points: table of per-column SplitPointNode lists.

        Returns:
            The aggregated rank table with every value converted to an
            integer numpy array.
        """
        local_ranks = summary_table.join(query_points, self._query_table)
        self.aggregator.send_model(local_ranks, suffix=(self.suffix, 'rank'))
        global_rank = self.aggregator.get_aggregated_model(suffix=(self.suffix, 'rank'))
        global_rank = global_rank.mapValues(lambda x: np.array(x, dtype=int))
        return global_rank

    @staticmethod
    def _query_table(summary, query_points):
        """Return the ranks of ``query_points`` in ``summary``, in input order.

        The summary is queried with the values sorted ascending; the results
        are then permuted back to the order of ``query_points``.
        """
        queries = [x.value for x in query_points]
        # argsort of argsort gives, for each input position, its index in the
        # sorted sequence -- used below to undo the sort.
        original_idx = np.argsort(np.argsort(queries))
        queries = np.sort(queries)
        ranks = summary.query_value_list(queries)
        ranks = np.array(ranks)[original_idx]
        return np.array(ranks, dtype=int)
|