statistics_param.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. import re
  19. from federatedml.param.base_param import BaseParam
  20. from federatedml.util import consts
  21. import copy
  22. class StatisticsParam(BaseParam):
  23. """
  24. Define statistics params
  25. Parameters
  26. ----------
  27. statistics: list, string, default "summary"
  28. Specify the statistic types to be computed.
  29. "summary" represents list: [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
  30. consts.MEDIAN, consts.MIN, consts.MAX,
  31. consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS]
  32. column_names: list of string, default []
  33. Specify columns to be used for statistic computation by column names in header
  34. column_indexes: list of int, default -1
  35. Specify columns to be used for statistic computation by column order in header
  36. -1 indicates to compute statistics over all columns
  37. bias: bool, default: True
  38. If False, the calculations of skewness and kurtosis are corrected for statistical bias.
  39. need_run: bool, default True
  40. Indicate whether to run this modules
  41. """
  42. LEGAL_STAT = [consts.COUNT, consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
  43. consts.MEDIAN, consts.MIN, consts.MAX, consts.VARIANCE,
  44. consts.COEFFICIENT_OF_VARIATION, consts.MISSING_COUNT,
  45. consts.MISSING_RATIO,
  46. consts.SKEWNESS, consts.KURTOSIS]
  47. BASIC_STAT = [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
  48. consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_RATIO,
  49. consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
  50. consts.COEFFICIENT_OF_VARIATION]
  51. LEGAL_QUANTILE = re.compile("^(100)|([1-9]?[0-9])%$")
  52. def __init__(self, statistics="summary", column_names=None,
  53. column_indexes=-1, need_run=True, abnormal_list=None,
  54. quantile_error=consts.DEFAULT_RELATIVE_ERROR, bias=True):
  55. super().__init__()
  56. self.statistics = statistics
  57. self.column_names = column_names
  58. self.column_indexes = column_indexes
  59. self.abnormal_list = abnormal_list
  60. self.need_run = need_run
  61. self.quantile_error = quantile_error
  62. self.bias = bias
  63. # @staticmethod
  64. # def extend_statistics(statistic_name):
  65. # basic_metrics = [consts.SUM, consts.MEAN, consts.STANDARD_DEVIATION,
  66. # consts.MEDIAN, consts.MIN, consts.MAX, consts.MISSING_RATIO,
  67. # consts.MISSING_COUNT, consts.SKEWNESS, consts.KURTOSIS,
  68. # consts.COEFFICIENT_OF_VARIATION]
  69. # if statistic_name == "summary":
  70. # return basic_metrics
  71. #
  72. # if statistic_name == "describe":
  73. # return [consts.COUNT, consts.MEAN, consts.STANDARD_DEVIATION,
  74. # consts.MIN, consts.MAX]
  75. @staticmethod
  76. def find_stat_name_match(stat_name):
  77. if stat_name in StatisticsParam.LEGAL_STAT or StatisticsParam.LEGAL_QUANTILE.match(stat_name):
  78. return True
  79. return False
  80. # match_result = [legal_name == stat_name for legal_name in StatisticsParam.LEGAL_STAT]
  81. # match_result.append(0 if LEGAL_QUANTILE.match(stat_name) is None else True)
  82. # match_found = sum(match_result) > 0
  83. # return match_found
  84. def check(self):
  85. model_param_descr = "Statistics's param statistics"
  86. BaseParam.check_boolean(self.need_run, model_param_descr)
  87. statistics = copy.copy(self.BASIC_STAT)
  88. if not isinstance(self.statistics, list):
  89. if self.statistics in [consts.SUMMARY]:
  90. self.statistics = statistics
  91. else:
  92. if self.statistics not in statistics:
  93. statistics.append(self.statistics)
  94. self.statistics = statistics
  95. else:
  96. for s in self.statistics:
  97. if s not in statistics:
  98. statistics.append(s)
  99. self.statistics = statistics
  100. for stat_name in self.statistics:
  101. match_found = StatisticsParam.find_stat_name_match(stat_name)
  102. if not match_found:
  103. raise ValueError(f"Illegal statistics name provided: {stat_name}.")
  104. self.column_names = [] if self.column_names is None else self.column_names
  105. self.column_indexes = [] if self.column_indexes is None else self.column_indexes
  106. self.abnormal_list = [] if self.abnormal_list is None else self.abnormal_list
  107. model_param_descr = "Statistics's param column_names"
  108. if not isinstance(self.column_names, list):
  109. raise ValueError(f"column_names should be list of string.")
  110. for col_name in self.column_names:
  111. BaseParam.check_string(col_name, model_param_descr)
  112. model_param_descr = "Statistics's param column_indexes"
  113. if not isinstance(self.column_indexes, list) and self.column_indexes != -1:
  114. raise ValueError(f"column_indexes should be list of int or -1.")
  115. if self.column_indexes != -1:
  116. for col_index in self.column_indexes:
  117. if not isinstance(col_index, int):
  118. raise ValueError(f"{model_param_descr} should be int or list of int")
  119. if col_index < -consts.FLOAT_ZERO:
  120. raise ValueError(f"{model_param_descr} should be non-negative int value(s)")
  121. if not isinstance(self.abnormal_list, list):
  122. raise ValueError(f"abnormal_list should be list of int or string.")
  123. self.check_decimal_float(self.quantile_error, "Statistics's param quantile_error ")
  124. self.check_boolean(self.bias, "Statistics's param bias ")
  125. return True