dataio_param.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. from federatedml.param.base_param import BaseParam
  19. class DataIOParam(BaseParam):
  20. """
  21. Define dataio parameters that used in federated ml.
  22. Parameters
  23. ----------
  24. input_format : {'dense', 'sparse', 'tag'}
  25. please have a look at this tutorial at "DataIO" section of federatedml/util/README.md.
  26. Formally,
  27. dense input format data should be set to "dense",
  28. svm-light input format data should be set to "sparse",
  29. tag or tag:value input format data should be set to "tag".
  30. delimitor : str
  31. the delimitor of data input, default: ','
  32. data_type : {'float64', 'float', 'int', 'int64', 'str', 'long'}
  33. the data type of data input
  34. exclusive_data_type : dict
  35. the key of dict is col_name, the value is data_type, use to specified special data type
  36. of some features.
  37. tag_with_value: bool
  38. use if input_format is 'tag', if tag_with_value is True,
  39. input column data format should be tag[delimitor]value, otherwise is tag only
  40. tag_value_delimitor: str
  41. use if input_format is 'tag' and 'tag_with_value' is True,
  42. delimitor of tag[delimitor]value column value.
  43. missing_fill : bool
  44. need to fill missing value or not, accepted only True/False, default: False
  45. default_value : None or object or list
  46. the value to replace missing value.
  47. if None, it will use default value define in federatedml/feature/imputer.py,
  48. if single object, will fill missing value with this object,
  49. if list, it's length should be the sample of input data' feature dimension,
  50. means that if some column happens to have missing values, it will replace it
  51. the value by element in the identical position of this list.
  52. missing_fill_method : {None, 'min', 'max', 'mean', 'designated'}
  53. the method to replace missing value
  54. missing_impute: None or list
  55. element of list can be any type, or auto generated if value is None, define which values to be consider as missing
  56. outlier_replace: bool
  57. need to replace outlier value or not, accepted only True/False, default: True
  58. outlier_replace_method : {None, 'min', 'max', 'mean', 'designated'}
  59. the method to replace missing value
  60. outlier_impute: None or list
  61. element of list can be any type, which values should be regard as missing value, default: None
  62. outlier_replace_value : None or object or list
  63. the value to replace outlier.
  64. if None, it will use default value define in federatedml/feature/imputer.py,
  65. if single object, will replace outlier with this object,
  66. if list, it's length should be the sample of input data' feature dimension,
  67. means that if some column happens to have outliers, it will replace it
  68. the value by element in the identical position of this list.
  69. with_label : bool
  70. True if input data consist of label, False otherwise. default: 'false'
  71. label_name : str
  72. column_name of the column where label locates, only use in dense-inputformat. default: 'y'
  73. label_type : {'int', 'int64', 'float', 'float64', 'long', 'str'}
  74. use when with_label is True.
  75. output_format : {'dense', 'sparse'}
  76. output format
  77. """
  78. def __init__(self, input_format="dense", delimitor=',', data_type='float64',
  79. exclusive_data_type=None,
  80. tag_with_value=False, tag_value_delimitor=":",
  81. missing_fill=False, default_value=0, missing_fill_method=None,
  82. missing_impute=None, outlier_replace=False, outlier_replace_method=None,
  83. outlier_impute=None, outlier_replace_value=0,
  84. with_label=False, label_name='y',
  85. label_type='int', output_format='dense', need_run=True):
  86. self.input_format = input_format
  87. self.delimitor = delimitor
  88. self.data_type = data_type
  89. self.exclusive_data_type = exclusive_data_type
  90. self.tag_with_value = tag_with_value
  91. self.tag_value_delimitor = tag_value_delimitor
  92. self.missing_fill = missing_fill
  93. self.default_value = default_value
  94. self.missing_fill_method = missing_fill_method
  95. self.missing_impute = missing_impute
  96. self.outlier_replace = outlier_replace
  97. self.outlier_replace_method = outlier_replace_method
  98. self.outlier_impute = outlier_impute
  99. self.outlier_replace_value = outlier_replace_value
  100. self.with_label = with_label
  101. self.label_name = label_name
  102. self.label_type = label_type
  103. self.output_format = output_format
  104. self.need_run = need_run
  105. def check(self):
  106. descr = "dataio param's"
  107. self.input_format = self.check_and_change_lower(self.input_format,
  108. ["dense", "sparse", "tag"],
  109. descr)
  110. self.output_format = self.check_and_change_lower(self.output_format,
  111. ["dense", "sparse"],
  112. descr)
  113. self.data_type = self.check_and_change_lower(self.data_type,
  114. ["int", "int64", "float", "float64", "str", "long"],
  115. descr)
  116. if type(self.missing_fill).__name__ != 'bool':
  117. raise ValueError("dataio param's missing_fill {} not supported".format(self.missing_fill))
  118. if self.missing_fill_method is not None:
  119. self.missing_fill_method = self.check_and_change_lower(self.missing_fill_method,
  120. ['min', 'max', 'mean', 'designated'],
  121. descr)
  122. if self.outlier_replace_method is not None:
  123. self.outlier_replace_method = self.check_and_change_lower(self.outlier_replace_method,
  124. ['min', 'max', 'mean', 'designated'],
  125. descr)
  126. if type(self.with_label).__name__ != 'bool':
  127. raise ValueError("dataio param's with_label {} not supported".format(self.with_label))
  128. if self.with_label:
  129. if not isinstance(self.label_name, str):
  130. raise ValueError("dataio param's label_name {} should be str".format(self.label_name))
  131. self.label_type = self.check_and_change_lower(self.label_type,
  132. ["int", "int64", "float", "float64", "str", "long"],
  133. descr)
  134. if self.exclusive_data_type is not None and not isinstance(self.exclusive_data_type, dict):
  135. raise ValueError("exclusive_data_type is should be None or a dict")
  136. return True