#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#  Copyright 2019 The FATE Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import copy
import re

from federatedml.param.base_param import BaseParam
from federatedml.util import consts


class StatisticsParam(BaseParam):
    """
    Define statistics params

    Parameters
    ----------
    statistics: list, string, default "summary"
        Specify the statistic types to be computed.
        "summary" represents the list stored in ``BASIC_STAT``:
        [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
        consts.MEDIAN, consts.MIN, consts.MAX,
        consts.MISSING_RATIO, consts.MISSING_COUNT,
        consts.SKEWNESS, consts.KURTOSIS,
        consts.COEFFICIENT_OF_VARIATION].
        Any other value (a single name or a list of names) is appended to
        that list by ``check``. Each name must be in ``LEGAL_STAT`` or be a
        percentile string of the form "<n>%" with integer n in [0, 100].

    column_names: list of string, default None
        Specify columns to be used for statistic computation by column names in header.
        None is replaced by an empty list in ``check``.

    column_indexes: list of int or -1, default -1
        Specify columns to be used for statistic computation by column order in header
        -1 indicates to compute statistics over all columns

    abnormal_list: list, default None
        None is replaced by an empty list in ``check``.
        NOTE(review): presumably the values to be treated as abnormal when
        computing statistics - confirm against the statistics component.

    quantile_error: float, default consts.DEFAULT_RELATIVE_ERROR
        Validated with ``check_decimal_float``.
        NOTE(review): presumably the error tolerance of quantile
        computation - confirm against the statistics component.

    bias: bool, default: True
        If False, the calculations of skewness and kurtosis are corrected for statistical bias.

    need_run: bool, default True
        Indicate whether to run this modules
    """

    LEGAL_STAT = [consts.COUNT, consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                  consts.MEDIAN, consts.MIN, consts.MAX, consts.VARIANCE,
                  consts.COEFFICIENT_OF_VARIATION, consts.MISSING_COUNT,
                  consts.MISSING_RATIO,
                  consts.SKEWNESS, consts.KURTOSIS]
    BASIC_STAT = [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                  consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_RATIO,
                  consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
                  consts.COEFFICIENT_OF_VARIATION]
    # Both anchors must wrap the whole alternation. Written as
    # "^(100)|([1-9]?[0-9])%$" the "|" splits the pattern into "^(100)" and
    # "([1-9]?[0-9])%$", which accepts "100", "100abc" and "1000%".
    LEGAL_QUANTILE = re.compile(r"^(100|[1-9]?[0-9])%$")

    def __init__(self, statistics="summary", column_names=None,
                 column_indexes=-1, need_run=True, abnormal_list=None,
                 quantile_error=consts.DEFAULT_RELATIVE_ERROR, bias=True):
        """Store the raw parameter values; validation happens in ``check``."""
        super().__init__()
        self.statistics = statistics
        self.column_names = column_names
        self.column_indexes = column_indexes
        self.abnormal_list = abnormal_list
        self.need_run = need_run
        self.quantile_error = quantile_error
        self.bias = bias

    @staticmethod
    def find_stat_name_match(stat_name):
        """
        Tell whether ``stat_name`` is an accepted statistic name.

        Parameters
        ----------
        stat_name: object
            Candidate name; accepted when it is in ``LEGAL_STAT`` or is a
            string matching ``LEGAL_QUANTILE`` (e.g. "95%").

        Returns
        -------
        bool
            True when the name is accepted, False otherwise.
        """
        if stat_name in StatisticsParam.LEGAL_STAT:
            return True
        # The regex can only be applied to strings; any other type is simply
        # not a percentile name (instead of raising TypeError from re).
        if not isinstance(stat_name, str):
            return False
        return StatisticsParam.LEGAL_QUANTILE.match(stat_name) is not None

    def check(self):
        """
        Validate and normalise all parameters in place.

        ``statistics`` is rewritten to a list that starts with a copy of
        ``BASIC_STAT`` followed by any additional requested names; ``None``
        values of ``column_names``, ``column_indexes`` and ``abnormal_list``
        are replaced by empty lists.

        Returns
        -------
        bool
            True when every parameter is valid.

        Raises
        ------
        ValueError
            If a statistic name is illegal, or ``column_names``,
            ``column_indexes`` or ``abnormal_list`` has an unsupported type
            or value. The ``BaseParam`` helpers called here perform the
            remaining type checks.
        """
        BaseParam.check_boolean(self.need_run, "Statistics's param need_run")

        # Work on a copy so the class-level BASIC_STAT is never mutated.
        statistics = copy.copy(self.BASIC_STAT)
        if isinstance(self.statistics, list):
            requested = self.statistics
        elif self.statistics in [consts.SUMMARY]:
            requested = []
        else:
            requested = [self.statistics]
        for stat_name in requested:
            if stat_name not in statistics:
                statistics.append(stat_name)
        self.statistics = statistics

        for stat_name in self.statistics:
            if not StatisticsParam.find_stat_name_match(stat_name):
                raise ValueError(f"Illegal statistics name provided: {stat_name}.")

        self.column_names = [] if self.column_names is None else self.column_names
        self.column_indexes = [] if self.column_indexes is None else self.column_indexes
        self.abnormal_list = [] if self.abnormal_list is None else self.abnormal_list

        model_param_descr = "Statistics's param column_names"
        if not isinstance(self.column_names, list):
            raise ValueError("column_names should be list of string.")
        for col_name in self.column_names:
            BaseParam.check_string(col_name, model_param_descr)

        model_param_descr = "Statistics's param column_indexes"
        if not isinstance(self.column_indexes, list) and self.column_indexes != -1:
            raise ValueError("column_indexes should be list of int or -1.")
        # -1 is the "all columns" marker and carries no per-index checks.
        if self.column_indexes != -1:
            for col_index in self.column_indexes:
                if not isinstance(col_index, int):
                    raise ValueError(f"{model_param_descr} should be int or list of int")
                if col_index < -consts.FLOAT_ZERO:
                    raise ValueError(f"{model_param_descr} should be non-negative int value(s)")

        if not isinstance(self.abnormal_list, list):
            raise ValueError("abnormal_list should be list of int or string.")

        self.check_decimal_float(self.quantile_error, "Statistics's param quantile_error ")
        self.check_boolean(self.bias, "Statistics's param bias ")
        return True