123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #
- # Copyright 2019 The FATE Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import re
- from pipeline.param.base_param import BaseParam
- from pipeline.param import consts
class StatisticsParam(BaseParam):
    """
    Define statistics params

    Parameters
    ----------
    statistics: list, string, default "summary"
        Specify the statistic types to be computed. Each entry must be one of
        the names in ``LEGAL_STAT`` or a whole-number percentile string such
        as "25%" or "100%" (see ``LEGAL_QUANTILE``).
        "summary" represents list: [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                    consts.MEDIAN, consts.MIN, consts.MAX,
                    consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
                    consts.COEFFICIENT_OF_VARIATION]
        "describe" represents list: [consts.COUNT, consts.MEAN,
                    consts.STANDARD_DEVIATION, consts.MIN, consts.MAX]

    column_names: list of string, default []
        Specify columns to be used for statistic computation by column names in header

    column_indexes: list of int, default -1
        Specify columns to be used for statistic computation by column order in header
        -1 indicates to compute statistics over all columns

    need_run: bool, default True
        Indicate whether to run this module

    abnormal_list: list, default []
        Must be a list. Presumably the values to be treated as abnormal
        during computation -- TODO confirm against the consuming component.

    quantile_error: float, default consts.DEFAULT_RELATIVE_ERROR
        Validated with ``check_decimal_float``. Presumably the error tolerated
        when estimating quantiles -- TODO confirm against the consuming component.

    bias: bool, default: True
        If False, the calculations of skewness and kurtosis are corrected for statistical bias.
    """

    LEGAL_STAT = [consts.COUNT, consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                  consts.MEDIAN, consts.MIN, consts.MAX, consts.VARIANCE,
                  consts.COEFFICIENT_OF_VARIATION, consts.MISSING_COUNT,
                  consts.SKEWNESS, consts.KURTOSIS]
    # Whole-number percentiles "0%" .. "100%". The alternation is grouped so
    # that both anchors and the trailing "%" apply to every alternative; the
    # ungrouped form also accepted strings such as "100", "1000" or "100abc".
    LEGAL_QUANTILE = re.compile(r"^(100|[1-9]?[0-9])%$")

    def __init__(self, statistics="summary", column_names=None,
                 column_indexes=-1, need_run=True, abnormal_list=None,
                 quantile_error=consts.DEFAULT_RELATIVE_ERROR, bias=True):
        super().__init__()
        self.statistics = statistics
        # None defaults are replaced by fresh lists so instances never share
        # a mutable default.
        self.column_names = [] if column_names is None else column_names
        self.column_indexes = [] if column_indexes is None else column_indexes
        self.abnormal_list = [] if abnormal_list is None else abnormal_list
        self.need_run = need_run
        self.quantile_error = quantile_error
        self.bias = bias

    @staticmethod
    def extend_statistics(statistic_name):
        """
        Expand a shorthand statistics name into the list of statistics it represents.

        Parameters
        ----------
        statistic_name: string
            "summary" or "describe"

        Returns
        -------
        list of statistic names for a known shorthand, otherwise None
        """
        if statistic_name == "summary":
            return [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
                    consts.MEDIAN, consts.MIN, consts.MAX,
                    consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
                    consts.COEFFICIENT_OF_VARIATION]
        if statistic_name == "describe":
            return [consts.COUNT, consts.MEAN, consts.STANDARD_DEVIATION, consts.MIN, consts.MAX]
        return None

    @staticmethod
    def find_stat_name_match(stat_name):
        """
        Tell whether ``stat_name`` is a legal statistic name.

        A name is legal when it is listed in ``LEGAL_STAT`` or is a string
        matching ``LEGAL_QUANTILE``.

        Returns
        -------
        bool
        """
        if stat_name in StatisticsParam.LEGAL_STAT:
            return True
        # Only strings can be percentile names; without this guard a
        # non-string value would raise TypeError inside the regex engine
        # instead of being reported as an illegal name by the caller.
        if not isinstance(stat_name, str):
            return False
        return StatisticsParam.LEGAL_QUANTILE.match(stat_name) is not None

    def check(self):
        """
        Validate all parameters.

        A non-list ``statistics`` value is normalized in place: the shorthands
        consts.DESCRIBE / consts.SUMMARY are expanded through
        ``extend_statistics`` and any other value is wrapped in a
        single-element list.

        Returns
        -------
        True when every check passes

        Raises
        ------
        ValueError
            If a statistic name is illegal, or column_names, column_indexes
            or abnormal_list has the wrong type or an invalid element.
            The BaseParam check helpers called here may raise as well.
        """
        model_param_descr = "Statistics's param need_run"
        BaseParam.check_boolean(self.need_run, model_param_descr)

        if not isinstance(self.statistics, list):
            if self.statistics in [consts.DESCRIBE, consts.SUMMARY]:
                self.statistics = StatisticsParam.extend_statistics(self.statistics)
            else:
                self.statistics = [self.statistics]

        for stat_name in self.statistics:
            if not StatisticsParam.find_stat_name_match(stat_name):
                raise ValueError(f"Illegal statistics name provided: {stat_name}.")

        model_param_descr = "Statistics's param column_names"
        if not isinstance(self.column_names, list):
            raise ValueError("column_names should be list of string.")
        for col_name in self.column_names:
            BaseParam.check_string(col_name, model_param_descr)

        model_param_descr = "Statistics's param column_indexes"
        # -1 is the sentinel meaning "all columns"; anything else must be a list.
        if not isinstance(self.column_indexes, list) and self.column_indexes != -1:
            raise ValueError("column_indexes should be list of int or -1.")
        if self.column_indexes != -1:
            for col_index in self.column_indexes:
                if not isinstance(col_index, int):
                    raise ValueError(f"{model_param_descr} should be int or list of int")
                # Rejects negative indexes; the threshold is consts.FLOAT_ZERO
                # rather than a literal 0, kept as in the original.
                if col_index < -consts.FLOAT_ZERO:
                    raise ValueError(f"{model_param_descr} should be non-negative int value(s)")

        if not isinstance(self.abnormal_list, list):
            raise ValueError("abnormal_list should be list of int or string.")

        self.check_decimal_float(self.quantile_error, "Statistics's param quantile_error ")
        self.check_boolean(self.bias, "Statistics's param bias ")
        return True
|