imputer_test.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. import numpy as np
  2. import random
  3. import time
  4. import unittest
  5. from fate_arch.session import computing_session as session
  6. from federatedml.feature.imputer import Imputer
  7. class TestImputer(unittest.TestCase):
  8. def setUp(self):
  9. session.init("test_imputer_" + str(random.random()))
  10. str_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
  11. self.test_data = [
  12. ["0.254879", "na", "0.209656", "10000", "-0.441366", "-10000", "-0.485934", "na", "-0.287570", "-0.733474"],
  13. ["-1.142928", "", "-1.166747", "-0.923578", "0.628230", "-1.021418", "-1.111867", "-0.959523", "-0.096672",
  14. "-0.121683"],
  15. ["-1.451067", "-1.406518", "none", "-1.092337", "none", "-1.168557", "-1.305831", "-1.745063", "-0.499499",
  16. "-0.302893"],
  17. ["-0.879933", "null", "-0.877527", "-0.780484", "-1.037534", "-0.483880", "-0.555498", "-0.768581",
  18. "0.433960", "-0.200928"],
  19. ["0.426758", "0.723479", "0.316885", "0.287273", "1.000835", "0.962702", "1.077099", "1.053586", "2.996525",
  20. "0.961696"],
  21. ["0.963102", "1.467675", "0.829202", "0.772457", "-0.038076", "-0.468613", "-0.307946", "-0.015321",
  22. "-0.641864", "-0.247477"],
  23. ["-0.662496", "0.212149", "-0.620475", "-0.632995", "-0.327392", "-0.385278", "-0.077665", "-0.730362",
  24. "0.217178", "-0.061280"],
  25. ["-0.453343", "-2.147457", "-0.473631", "-0.483572", "0.558093", "-0.740244", "-0.896170", "-0.617229",
  26. "-0.308601", "-0.666975"],
  27. ["-0.606584", "-0.971725", "-0.678558", "-0.591332", "-0.963013", "-1.302401", "-1.212855", "-1.321154",
  28. "-1.591501", "-1.230554"],
  29. ["-0.583805", "-0.193332", "-0.633283", "-0.560041", "-0.349310", "-0.519504", "-0.610669", "-0.929526",
  30. "-0.196974", "-0.151608"]
  31. ]
  32. self.test_instance = []
  33. for td in self.test_data:
  34. self.test_instance.append(td)
  35. self.table_instance = self.data_to_table(self.test_instance)
  36. self.table_instance.schema['header'] = ["fid" + str(i) for i in range(len(self.test_data[0]))]
  37. self.table_instance.schema['anonymous_header'] = [
  38. "guest_9999_x" + str(i) for i in range(len(self.test_data[0]))]
  39. def print_table(self, table):
  40. for v in (list(table.collect())):
  41. print(v[1].features)
  42. def data_to_table(self, data, partition=10):
  43. data_table = session.parallelize(data, include_key=False, partition=partition)
  44. return data_table
  45. def table_to_list(self, table_instance):
  46. res_list = []
  47. for k, v in list(table_instance.collect()):
  48. res_list.append(list(v))
  49. return res_list
  50. def fit_test_data(self, data, fit_values, imputer_value):
  51. for j in range(len(data)):
  52. for i in range(len(data[j])):
  53. if data[j][i] in imputer_value:
  54. data[j][i] = str(fit_values[i])
  55. return data
  56. def fit_test_data_float(self, data, fit_values, imputer_value):
  57. for j in range(len(data)):
  58. for i in range(len(data[j])):
  59. if data[j][i] in imputer_value:
  60. data[j][i] = float(fit_values[i])
  61. data[j][i] = float(data[j][i])
  62. return data
  63. def test_fit_min(self):
  64. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  65. imputer = Imputer(missing_value_list=imputer_value)
  66. process_data, cols_transform_value = imputer.fit(self.table_instance, "min", output_format='str')
  67. cols_transform_value_ground_true = [-1.451067, -2.147457, -1.166747, -1.092337, -1.037534, -1.302401, -1.305831,
  68. -1.745063, -1.591501, -1.230554]
  69. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  70. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  71. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  72. def test_fit_max(self):
  73. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  74. imputer = Imputer(missing_value_list=imputer_value)
  75. process_data, cols_transform_value = imputer.fit(self.table_instance, "max", output_format='str')
  76. cols_transform_value_ground_true = [0.963102, 1.467675, 0.829202, 0.772457, 1.000835, 0.962702, 1.077099,
  77. 1.053586, 2.996525, 0.961696]
  78. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  79. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  80. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  81. def test_fit_mean(self):
  82. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  83. imputer = Imputer(missing_value_list=imputer_value)
  84. process_data, cols_transform_value = imputer.fit(self.table_instance, "mean", output_format='str')
  85. cols_transform_value_ground_true = [-0.413542, -0.330818, -0.343831, -0.444957, -0.107726, -0.569688, -0.548734,
  86. -0.670353, 0.002498, -0.275518]
  87. # imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  88. # test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  89. # process_data_list = self.table_to_list(process_data)
  90. # process_data_list = [[round(float(i), 6) for i in v] for v in process_data_list]
  91. # process_data_list = [[str(i) for i in v] for v in process_data_list]
  92. cols_transform_value = [round(v, 6) for v in cols_transform_value]
  93. # self.assertListEqual(process_data_list, test_data_fit)
  94. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  95. def test_fit_replace_value(self):
  96. imputer_value = ['NA', 'naaa']
  97. imputer = Imputer(imputer_value)
  98. process_data, cols_transform_value = imputer.fit(self.table_instance, replace_method="designated",
  99. replace_value='111111', output_format='str')
  100. cols_transform_value_ground_true = ['111111' for _ in range(10)]
  101. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  102. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  103. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  104. def test_fit_none_replace_method(self):
  105. imputer_value = ['NA', 'naaa']
  106. imputer = Imputer(imputer_value)
  107. process_data, cols_transform_value = imputer.fit(self.table_instance, output_format='str')
  108. cols_transform_value_ground_true = [0 for _ in range(10)]
  109. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  110. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  111. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  112. def test_fit_max_float(self):
  113. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  114. imputer = Imputer(missing_value_list=imputer_value)
  115. process_data, cols_transform_value = imputer.fit(self.table_instance, "max", output_format='float')
  116. cols_transform_value_ground_true = [0.963102, 1.467675, 0.829202, 0.772457, 1.000835, 0.962702, 1.077099,
  117. 1.053586, 2.996525, 0.961696]
  118. test_data_fit = self.fit_test_data_float(self.test_data, cols_transform_value_ground_true, imputer_value)
  119. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  120. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  121. def test_transform(self):
  122. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  123. imputer = Imputer(missing_value_list=imputer_value)
  124. cols_transform_value_ground_true = [0.963102, 1.467675, 0.829202, 0.772457, 1.000835, 0.962702, 1.077099,
  125. 1.053586, 2.996525, 0.961696]
  126. process_data = imputer.transform(self.table_instance, cols_transform_value_ground_true)
  127. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  128. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  129. def test_transform_float(self):
  130. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  131. imputer = Imputer(missing_value_list=imputer_value)
  132. cols_transform_value_ground_true = [0.963102, 1.467675, 0.829202, 0.772457, 1.000835, 0.962702, 1.077099,
  133. 1.053586, 2.996525, 0.961696]
  134. process_data = imputer.transform(self.table_instance, cols_transform_value_ground_true, output_format="float")
  135. test_data_fit = self.fit_test_data_float(self.test_data, cols_transform_value_ground_true, imputer_value)
  136. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  137. def test_fit_median(self):
  138. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  139. imputer = Imputer(missing_value_list=imputer_value)
  140. process_data, cols_transform_value = imputer.fit(self.table_instance, "median", output_format='str')
  141. cols_transform_value_ground_true = [-0.606584, -0.193332, -0.620475, -0.591332, -0.327392, -0.519504, -0.610669,
  142. -0.768581, -0.28757, -0.247477]
  143. test_data_fit = self.fit_test_data(self.test_data, cols_transform_value_ground_true, imputer_value)
  144. self.assertListEqual(self.table_to_list(process_data), test_data_fit)
  145. self.assertListEqual(cols_transform_value, cols_transform_value_ground_true)
  146. def test_get_impute_rate(self):
  147. imputer_value = ['', 'none', 'na', 'null', "10000", "-10000"]
  148. imputer = Imputer(missing_value_list=imputer_value)
  149. _, _ = imputer.fit(self.table_instance, "median", output_format='str')
  150. cols_impute_rate_ground_true = [0, 0.3, 0.1, 0.1, 0.1, 0.1, 0, 0.1, 0, 0]
  151. cols_fit_impute_rate = imputer.get_impute_rate(mode="fit")
  152. self.assertListEqual(cols_fit_impute_rate, cols_impute_rate_ground_true)
  153. cols_transform_value_ground_true = [-0.606584, -0.193332, -0.620475, -0.591332, -0.327392, -0.519504, -0.610669,
  154. -0.768581, -0.28757, -0.247477]
  155. _ = imputer.transform(self.table_instance, cols_transform_value_ground_true)
  156. cols_transform_impute_rate = imputer.get_impute_rate(mode="fit")
  157. self.assertListEqual(cols_transform_impute_rate, cols_impute_rate_ground_true)
  158. def tearDown(self):
  159. session.stop()
  160. if __name__ == "__main__":
  161. unittest.main()