data_transform_param.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. from federatedml.param.base_param import BaseParam
  19. class DataTransformParam(BaseParam):
  20. """
  21. Define data transform parameters that used in federated ml.
  22. Parameters
  23. ----------
  24. input_format : {'dense', 'sparse', 'tag'}
  25. please have a look at this tutorial at "DataTransform" section of federatedml/util/README.md.
  26. Formally,
  27. dense input format data should be set to "dense",
  28. svm-light input format data should be set to "sparse",
  29. tag or tag:value input format data should be set to "tag".
  30. Note: in fate's version >= 1.9.0, this params can be used in uploading/binding data's meta
  31. delimitor : str
  32. the delimitor of data input, default: ','
  33. data_type : int
  34. {'float64','float','int','int64','str','long'}
  35. the data type of data input
  36. exclusive_data_type : dict
  37. the key of dict is col_name, the value is data_type, use to specified special data type
  38. of some features.
  39. tag_with_value: bool
  40. use if input_format is 'tag', if tag_with_value is True,
  41. input column data format should be tag[delimitor]value, otherwise is tag only
  42. tag_value_delimitor: str
  43. use if input_format is 'tag' and 'tag_with_value' is True,
  44. delimitor of tag[delimitor]value column value.
  45. missing_fill : bool
  46. need to fill missing value or not, accepted only True/False, default: False
  47. default_value : None or object or list
  48. the value to replace missing value.
  49. if None, it will use default value define in federatedml/feature/imputer.py,
  50. if single object, will fill missing value with this object,
  51. if list, it's length should be the sample of input data' feature dimension,
  52. means that if some column happens to have missing values, it will replace it
  53. the value by element in the identical position of this list.
  54. missing_fill_method: None or str
  55. the method to replace missing value, should be one of [None, 'min', 'max', 'mean', 'designated']
  56. missing_impute: None or list
  57. element of list can be any type, or auto generated if value is None, define which values to be consider as missing
  58. outlier_replace: bool
  59. need to replace outlier value or not, accepted only True/False, default: True
  60. outlier_replace_method: None or str
  61. the method to replace missing value, should be one of [None, 'min', 'max', 'mean', 'designated']
  62. outlier_impute: None or list
  63. element of list can be any type, which values should be regard as missing value
  64. outlier_replace_value: None or object or list
  65. the value to replace outlier.
  66. if None, it will use default value define in federatedml/feature/imputer.py,
  67. if single object, will replace outlier with this object,
  68. if list, it's length should be the sample of input data' feature dimension,
  69. means that if some column happens to have outliers, it will replace it
  70. the value by element in the identical position of this list.
  71. with_label : bool
  72. True if input data consist of label, False otherwise. default: 'false'
  73. Note: in fate's version >= 1.9.0, this params can be used in uploading/binding data's meta
  74. label_name : str
  75. column_name of the column where label locates, only use in dense-inputformat. default: 'y'
  76. label_type : {'int','int64','float','float64','long','str'}
  77. use when with_label is True
  78. output_format : {'dense', 'sparse'}
  79. output format
  80. with_match_id: bool
  81. True if dataset has match_id, default: False
  82. Note: in fate's version >= 1.9.0, this params can be used in uploading/binding data's meta
  83. match_id_name: str
  84. Valid if input_format is "dense", and multiple columns are considered as match_ids,
  85. the name of match_id to be used in current job
  86. Note: in fate's version >= 1.9.0, this params can be used in uploading/binding data's meta
  87. match_id_index: int
  88. Valid if input_format is "tag" or "sparse", and multiple columns are considered as match_ids,
  89. the index of match_id, default: 0
  90. This param works only when data meta has been set with uploading/binding.
  91. """
  92. def __init__(self, input_format="dense", delimitor=',', data_type='float64',
  93. exclusive_data_type=None,
  94. tag_with_value=False, tag_value_delimitor=":",
  95. missing_fill=False, default_value=0, missing_fill_method=None,
  96. missing_impute=None, outlier_replace=False, outlier_replace_method=None,
  97. outlier_impute=None, outlier_replace_value=0,
  98. with_label=False, label_name='y',
  99. label_type='int', output_format='dense', need_run=True,
  100. with_match_id=False, match_id_name='', match_id_index=0):
  101. self.input_format = input_format
  102. self.delimitor = delimitor
  103. self.data_type = data_type
  104. self.exclusive_data_type = exclusive_data_type
  105. self.tag_with_value = tag_with_value
  106. self.tag_value_delimitor = tag_value_delimitor
  107. self.missing_fill = missing_fill
  108. self.default_value = default_value
  109. self.missing_fill_method = missing_fill_method
  110. self.missing_impute = missing_impute
  111. self.outlier_replace = outlier_replace
  112. self.outlier_replace_method = outlier_replace_method
  113. self.outlier_impute = outlier_impute
  114. self.outlier_replace_value = outlier_replace_value
  115. self.with_label = with_label
  116. self.label_name = label_name
  117. self.label_type = label_type
  118. self.output_format = output_format
  119. self.need_run = need_run
  120. self.with_match_id = with_match_id
  121. self.match_id_name = match_id_name
  122. self.match_id_index = match_id_index
  123. def check(self):
  124. descr = "data_transform param's"
  125. self.input_format = self.check_and_change_lower(self.input_format,
  126. ["dense", "sparse", "tag"],
  127. descr)
  128. self.output_format = self.check_and_change_lower(self.output_format,
  129. ["dense", "sparse"],
  130. descr)
  131. self.data_type = self.check_and_change_lower(self.data_type,
  132. ["int", "int64", "float", "float64", "str", "long"],
  133. descr)
  134. if type(self.missing_fill).__name__ != 'bool':
  135. raise ValueError("data_transform param's missing_fill {} not supported".format(self.missing_fill))
  136. if self.missing_fill_method is not None:
  137. self.missing_fill_method = self.check_and_change_lower(self.missing_fill_method,
  138. ['min', 'max', 'mean', 'designated'],
  139. descr)
  140. if self.outlier_replace_method is not None:
  141. self.outlier_replace_method = self.check_and_change_lower(self.outlier_replace_method,
  142. ['min', 'max', 'mean', 'designated'],
  143. descr)
  144. if type(self.with_label).__name__ != 'bool':
  145. raise ValueError("data_transform param's with_label {} not supported".format(self.with_label))
  146. if self.with_label:
  147. if not isinstance(self.label_name, str):
  148. raise ValueError("data transform param's label_name {} should be str".format(self.label_name))
  149. self.label_type = self.check_and_change_lower(self.label_type,
  150. ["int", "int64", "float", "float64", "str", "long"],
  151. descr)
  152. if self.exclusive_data_type is not None and not isinstance(self.exclusive_data_type, dict):
  153. raise ValueError("exclusive_data_type is should be None or a dict")
  154. if not isinstance(self.with_match_id, bool):
  155. raise ValueError("with_match_id should be boolean variable, but {} find".format(self.with_match_id))
  156. if not isinstance(self.match_id_index, int) or self.match_id_index < 0:
  157. raise ValueError("match_id_index should be non negative integer")
  158. if self.match_id_name is not None and not isinstance(self.match_id_name, str):
  159. raise ValueError("match_id_name should be str")
  160. return True