#
# Copyright 2019 The FATE Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import functools
import math

import numpy as np

from federatedml.feature.binning.quantile_binning import QuantileBinning
from federatedml.feature.binning.quantile_summaries import QuantileSummaries
from federatedml.feature.instance import Instance
from federatedml.feature.sparse_vector import SparseVector
from federatedml.param.feature_binning_param import FeatureBinningParam
from federatedml.statistic import data_overview
# from federatedml.statistic.feature_statistic import feature_statistic
from federatedml.util import LOGGER
from federatedml.util import consts


class SummaryStatistics(object):
    def __init__(self, length, abnormal_list=None, stat_order=2, bias=True):
        self.abnormal_list = abnormal_list
        self.sum = np.zeros(length)
        self.sum_square = np.zeros(length)
        self.max_value = -np.inf * np.ones(length)
        self.min_value = np.inf * np.ones(length)
        self.count = np.zeros(length)
        self.length = length
        self.stat_order = stat_order
        self.bias = bias
        # one running expectation E(x^m) per requested moment order m >= 3
        for m in range(3, stat_order + 1):
            setattr(self, f"exp_sum_{m}", np.zeros(length))
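        # For stat_order=4 this creates self.exp_sum_3 and self.exp_sum_4,
        # the running values of E(x^3) and E(x^4) per column; add_rows below
        # keeps them up to date with a running-mean update.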

    def add_rows(self, rows):
        """
        Update the running statistics with one row of values.

        When computing E(x^n), the running update is:

        .. math::

            S_i = \\frac{i - 1}{i} S_{i-1} + \\frac{1}{i} x_i^n

        where i is the current count and S_i is the running expectation of x^n.
        """
        # if self.abnormal_list is None:
        if not self.abnormal_list:
            rows = np.array(rows, dtype=float)
            self.count += 1
            self.sum += rows
            self.sum_square += rows ** 2
            self.max_value = np.max([self.max_value, rows], axis=0)
            self.min_value = np.min([self.min_value, rows], axis=0)
            for m in range(3, self.stat_order + 1):
                exp_sum_m = getattr(self, f"exp_sum_{m}")
                # exp_sum_m += rows ** m
                exp_sum_m = (self.count - 1) / self.count * exp_sum_m + rows ** m / self.count
                setattr(self, f"exp_sum_{m}", exp_sum_m)
        else:
            filter_rows = []
            filter_idx = []
            for idx, value in enumerate(rows):
                if value in self.abnormal_list or (isinstance(value, float) and np.isnan(value)):
                    continue
                try:
                    value = float(value)
                except ValueError as e:
                    raise ValueError(f"In add_rows, a value should either be numeric or be listed "
                                     f"in the abnormal list. Error info: {e}")
                filter_rows.append(value)
                filter_idx.append(idx)
            if not filter_idx:
                return
            filter_rows = np.array(filter_rows, dtype=float)
            filter_idx = np.array(filter_idx)
            self.count[filter_idx] += 1
            self.sum[filter_idx] += filter_rows
            self.sum_square[filter_idx] += filter_rows ** 2
            self.max_value[filter_idx] = np.max([self.max_value[filter_idx], filter_rows], axis=0)
            self.min_value[filter_idx] = np.min([self.min_value[filter_idx], filter_rows], axis=0)
            for m in range(3, self.stat_order + 1):
                exp_sum_m = getattr(self, f"exp_sum_{m}")
                # exp_sum_m[filter_idx] += filter_rows ** m
                exp_sum_m[filter_idx] = (self.count[filter_idx] - 1) / self.count[filter_idx] * \
                    exp_sum_m[filter_idx] + filter_rows ** m / self.count[filter_idx]
                setattr(self, f"exp_sum_{m}", exp_sum_m)
  93. """
  94. for idx, value in enumerate(rows):
  95. if value in self.abnormal_list:
  96. continue
  97. try:
  98. value = float(value)
  99. except ValueError as e:
  100. raise ValueError(f"In add func, value should be either a numeric input or be listed in "
  101. f"abnormal list. Error info: {e}")
  102. self.count[idx] += 1
  103. self.sum[idx] += value
  104. self.sum_square[idx] += value ** 2
  105. self.max_value[idx] = np.max([self.max_value[idx], value])
  106. self.min_value[idx] = np.min([self.min_value[idx], value])
  107. for m in range(3, self.stat_order + 1):
  108. exp_sum_m = getattr(self, f"exp_sum_{m}")
  109. exp_sum_m[idx] = (self.count[idx] - 1) / self.count[idx] * \
  110. exp_sum_m[idx] + rows[idx] ** m / self.count[idx]
  111. setattr(self, f"exp_sum_{m}", exp_sum_m)
  112. """

    def merge(self, other):
        if self.stat_order != other.stat_order:
            raise AssertionError("Two merging summaries should have the same stat order.")
        self.sum += other.sum
        self.sum_square += other.sum_square
        self.max_value = np.max([self.max_value, other.max_value], axis=0)
        self.min_value = np.min([self.min_value, other.min_value], axis=0)
        for m in range(3, self.stat_order + 1):
            sum_m_1 = getattr(self, f"exp_sum_{m}")
            sum_m_2 = getattr(other, f"exp_sum_{m}")
            exp_sum = (sum_m_1 * self.count + sum_m_2 * other.count) / (self.count + other.count)
            setattr(self, f"exp_sum_{m}", exp_sum)
        self.count += other.count
        return self
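
    # Merging combines two running expectations as a count-weighted average,
    # e.g. E_ab(x^m) = (E_a(x^m) * n_a + E_b(x^m) * n_b) / (n_a + n_b), which
    # is why the counts are only accumulated after the loop above.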
  127. """
  128. def summary(self):
  129. for m in range(3, self.stat_order + 1):
  130. exp_sum_m = getattr(self, f"exp_sum_{m}")
  131. for idx, cnt in enumerate(self.count):
  132. if np.abs(cnt) < consts.FLOAT_ZERO:
  133. continue
  134. exp_sum_m[idx] /= cnt
  135. setattr(self, f"exp_sum_{m}", exp_sum_m)
  136. """

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def max(self):
        return self.max_value

    @property
    def min(self):
        return self.min_value

    @property
    def variance(self):
        mean = self.mean
        variance = self.sum_square / self.count - mean ** 2
        variance = np.array([x if math.fabs(x) >= consts.FLOAT_ZERO else 0.0 for x in variance])
        return variance

    @property
    def coefficient_of_variance(self):
        mean = np.array([consts.FLOAT_ZERO if math.fabs(x) < consts.FLOAT_ZERO else x
                         for x in self.mean])
        return np.fabs(self.stddev / mean)

    @property
    def stddev(self):
        return np.sqrt(self.variance)

    @property
    def moment_3(self):
        """
        In mathematics, a moment is a specific quantitative measure of the shape of a function.
        The k-th central moment of a data sample is:

        .. math::

            m_k = \\frac{1}{n} \\sum_{i = 1}^n (x_i - \\bar{x})^k

        The 3rd central moment is often used to calculate the coefficient of skewness.
        """
        if self.stat_order < 3:
            raise ValueError("The third-order expectation sum has not been computed.")
        exp_sum_2 = self.sum_square / self.count
        exp_sum_3 = getattr(self, "exp_sum_3")
        mu = self.mean
        return exp_sum_3 - 3 * mu * exp_sum_2 + 2 * mu ** 3

    @property
    def moment_4(self):
        """
        In mathematics, a moment is a specific quantitative measure of the shape of a function.
        The k-th central moment of a data sample is:

        .. math::

            m_k = \\frac{1}{n} \\sum_{i = 1}^n (x_i - \\bar{x})^k

        The 4th central moment is often used to calculate the coefficient of kurtosis.
        """
        if self.stat_order < 4:
            raise ValueError("The fourth-order expectation sum has not been computed.")
        exp_sum_2 = self.sum_square / self.count
        exp_sum_3 = getattr(self, "exp_sum_3")
        exp_sum_4 = getattr(self, "exp_sum_4")
        mu = self.mean
        return exp_sum_4 - 4 * mu * exp_sum_3 + 6 * mu ** 2 * exp_sum_2 - 3 * mu ** 4
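
    # The two properties above expand the central moments in terms of the raw
    # moments E(x^k) that are tracked incrementally, e.g. for the 3rd moment:
    #   m_3 = E(x^3) - 3 * mu * E(x^2) + 2 * mu^3
    # For the values 1, 2, 3: E(x^3) = 12, E(x^2) = 14/3, mu = 2, so
    #   m_3 = 12 - 28 + 16 = 0, as expected for a symmetric sample.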

    @property
    def skewness(self):
        """
        The sample skewness is computed as the Fisher-Pearson coefficient
        of skewness, i.e.

        .. math::

            g_1 = \\frac{m_3}{m_2^{3/2}}

        where

        .. math::

            m_i = \\frac{1}{N} \\sum_{n=1}^N (x[n] - \\bar{x})^i

        If bias is False, return the adjusted Fisher-Pearson standardized
        moment coefficient, i.e.

        .. math::

            G_1 = \\frac{k_3}{k_2^{3/2}} = \\frac{\\sqrt{N(N-1)}}{N-2} \\frac{m_3}{m_2^{3/2}}
        """
        m2 = self.variance
        m3 = self.moment_3
        n = self.count
        zero = (m2 == 0)
        with np.errstate(divide='ignore', invalid='ignore'):
            vals = np.where(zero, 0, m3 / m2 ** 1.5)
        if not self.bias:
            can_correct = (n > 2) & (m2 > 0)
            if can_correct.any():
                # keep n, m2 and m3 aligned on the correctable entries
                n = np.extract(can_correct, n)
                m2 = np.extract(can_correct, m2)
                m3 = np.extract(can_correct, m3)
                nval = np.sqrt((n - 1.0) * n) / (n - 2.0) * m3 / m2 ** 1.5
                np.place(vals, can_correct, nval)
        return vals

    @property
    def kurtosis(self):
        """
        Return the sample excess kurtosis, i.e.

        .. math::

            g = \\frac{m_4}{m_2^2} - 3

        If bias is False, the calculations are corrected for statistical bias.
        """
        m2 = self.variance
        m4 = self.moment_4
        n = self.count
        zero = (m2 == 0)
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.where(zero, 0, m4 / m2 ** 2.0)
        if not self.bias:
            can_correct = (n > 3) & (m2 > 0)
            if can_correct.any():
                # keep n, m2 and m4 aligned on the correctable entries
                n = np.extract(can_correct, n)
                m2 = np.extract(can_correct, m2)
                m4 = np.extract(can_correct, m4)
                nval = 1.0 / (n - 2) / (n - 3) * ((n ** 2 - 1.0) * m4 / m2 ** 2.0 - 3 * (n - 1) ** 2.0)
                np.place(result, can_correct, nval + 3.0)
        return result - 3
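
# A minimal usage sketch for SummaryStatistics (the numbers below are
# illustrative, not from FATE's test suite):
#
#     stats = SummaryStatistics(length=2, stat_order=4, bias=False)
#     for row in ([1.0, 10.0], [2.0, 20.0], [3.0, 30.0]):
#         stats.add_rows(row)
#     stats.mean      # array([ 2., 20.])
#     stats.variance  # array([ 0.667, 66.667]) -- population variance, rounded
#     stats.skewness  # array([0., 0.]) for these symmetric samples
#
# Partial summaries computed on different partitions can be combined with
# merge(), which is how _static_sums() reduces per-partition results below.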


class MissingStatistic(object):
    def __init__(self, missing_val=None):
        super(MissingStatistic, self).__init__()
        self.missing_val = missing_val
        self.feature_summary = {}
        self.count_summary = {}
        self.missing_feature = []
        self.all_feature_list = []
        self.tag_id_mapping, self.id_tag_mapping = {}, {}
        self.dense_missing_val = missing_val

    @staticmethod
    def is_sparse(tb):
        return isinstance(tb.take(1)[0][1].features, SparseVector)

    @staticmethod
    def check_table_content(tb):
        if not tb.count() > 0:
            raise ValueError('input table must contain at least 1 sample')
        first_ = tb.take(1)[0][1]
        if isinstance(first_, Instance):
            return True
        else:
            raise ValueError('unknown input format')

    def fit(self, tb):
        LOGGER.debug('start to compute feature missing ratio')
        if not self.check_table_content(tb):
            raise ValueError('contents of input table must be instances of class "Instance"')
        header = tb.schema['header']
        self.all_feature_list = header
        self.tag_id_mapping = {v: k for k, v in enumerate(header)}
        self.id_tag_mapping = {k: v for k, v in enumerate(header)}
        feature_count_rs = self.count_feature_ratio(tb, self.tag_id_mapping, not self.is_sparse(tb),
                                                    missing_val=self.missing_val)
        total_count = tb.count()
        for idx, count_val in enumerate(feature_count_rs):
            self.feature_summary[self.id_tag_mapping[idx]] = 1 - (count_val / total_count)
            self.count_summary[self.id_tag_mapping[idx]] = int(total_count - count_val)
            # a feature with zero present values is entirely missing
            if (count_val / total_count) == 0:
                self.missing_feature.append(self.id_tag_mapping[idx])
        return self.feature_summary

    @staticmethod
    def count_feature_ratio(tb, tag_id_mapping, dense_input, missing_val=None):
        func = functools.partial(MissingStatistic.map_partitions_count,
                                 tag_id_mapping=tag_id_mapping,
                                 dense_input=dense_input,
                                 missing_val=missing_val)
        rs = tb.applyPartitions(func)
        return rs.reduce(MissingStatistic.reduce_count_rs)

    @staticmethod
    def map_partitions_count(iterable, tag_id_mapping, dense_input=True, missing_val=None):
        count_arr = np.zeros(len(tag_id_mapping))
        for k, v in iterable:
            # in dense input, a missing feature is set as np.nan
            if dense_input:
                feature = v.features  # a numpy array
                arr = np.array(list(feature))
                if missing_val is None:
                    idx_arr = np.argwhere(~np.isnan(arr)).flatten()
                else:
                    idx_arr = np.argwhere(~(arr == missing_val)).flatten()
            # in sparse input, missing features have no key in the dict
            else:
                feature = v.features.sparse_vec  # a dict
                idx_arr = np.array(list(feature.keys()))
            if len(idx_arr) != 0:
                count_arr[idx_arr] += 1
        return count_arr

    @staticmethod
    def reduce_count_rs(arr1, arr2):
        return arr1 + arr2
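
# A hedged usage sketch for MissingStatistic (assumes `tb` is a FATE Table of
# Instance rows; the column names and values are illustrative):
#
#     stat = MissingStatistic(missing_val=None)   # None means count np.nan
#     ratios = stat.fit(tb)                       # {"x1": 0.0, "x2": 0.25, ...}
#     stat.count_summary                          # absolute missing counts
#     stat.missing_feature                        # columns that are 100% missing
#
# Each partition counts the present values per feature, and the per-partition
# arrays are summed with reduce_count_rs, so the scan stays single-pass.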


class MultivariateStatisticalSummary(object):
    """
    Compute per-column statistics (mean, variance, quantiles, missing ratio,
    label histogram, etc.) over a table of data instances.
    """

    def __init__(self, data_instances, cols_index=-1, abnormal_list=None,
                 error=consts.DEFAULT_RELATIVE_ERROR, stat_order=2, bias=True):
        self.finish_fit_statics = False  # Use for static data
        # self.finish_fit_summaries = False   # Use for quantile data
        self.binning_obj: QuantileBinning = None
        self.summary_statistics = None
        self.header = None
        # self.quantile_summary_dict = {}
        self.cols_dict = {}
        # self.medians = None
        self.data_instances = data_instances
        self.cols_index = None
        if not isinstance(abnormal_list, list):
            abnormal_list = [abnormal_list]
        self.abnormal_list = abnormal_list
        self.__init_cols(data_instances, cols_index, stat_order, bias)
        self.label_summary = None
        self.error = error
        self.missing_static_obj: MissingStatistic = None

    def __init_cols(self, data_instances, cols_index, stat_order, bias):
        header = data_overview.get_header(data_instances)
        self.header = header
        if cols_index == -1:
            self.cols_index = [i for i in range(len(header))]
        else:
            self.cols_index = cols_index
        LOGGER.debug(f"col_index: {cols_index}, self.col_index: {self.cols_index}")
        self.cols_dict = {header[indices]: indices for indices in self.cols_index}
        self.summary_statistics = SummaryStatistics(length=len(self.cols_index),
                                                    abnormal_list=self.abnormal_list,
                                                    stat_order=stat_order,
                                                    bias=bias)

    def _static_sums(self):
        """
        Compute sum, sum_square, max_value and min_value per column,
        so that variance is available.
        """
        is_sparse = data_overview.is_sparse_data(self.data_instances)
        partition_cal = functools.partial(self.static_in_partition,
                                          cols_index=self.cols_index,
                                          summary_statistics=copy.deepcopy(self.summary_statistics),
                                          is_sparse=is_sparse)
        self.summary_statistics = self.data_instances.applyPartitions(partition_cal) \
            .reduce(lambda x, y: self.copy_merge(x, y))
        # self.summary_statistics = summary_statistic_dict.reduce(self.aggregate_statics)
        self.finish_fit_statics = True

    def _static_quantile_summaries(self):
        """
        Build quantile summaries so that a specific quantile point can be queried.
        """
        if self.binning_obj is not None:
            return self.binning_obj
        bin_param = FeatureBinningParam(bin_num=2, bin_indexes=self.cols_index,
                                        error=self.error)
        self.binning_obj = QuantileBinning(bin_param, abnormal_list=self.abnormal_list)
        self.binning_obj.fit_split_points(self.data_instances)
        return self.binning_obj

    @staticmethod
    def copy_merge(s1, s2):
        new_s1 = copy.deepcopy(s1)
        return new_s1.merge(s2)

    @staticmethod
    def static_in_partition(data_instances, cols_index, summary_statistics, is_sparse):
        """
        Compute sums, sum_square, max and min values in one traversal.

        Parameters
        ----------
        data_instances : Table
            The input data

        cols_index : indices
            Specify which column(s) to apply statistics to.

        summary_statistics : SummaryStatistics

        Returns
        -------
        The updated SummaryStatistics object
        """
        cols_index_set = set(cols_index)
        for k, instances in data_instances:
            if not is_sparse:
                if isinstance(instances, Instance):
                    features = instances.features
                else:
                    features = instances
                # try:
                #     features = np.array(instances, dtype=float)
                # except ValueError as e:
                #     raise ValueError(f"Static Module accept numeric input only. Error info: {e}")
                # LOGGER.debug(f"In statics, features: {features}")
                # row_values = [x for idx, x in enumerate(features) if idx in cols_index]
                row_values = [x for idx, x in enumerate(features) if idx in cols_index_set]
                # row_values = features[cols_index]
            else:
                sparse_data = instances.features.get_sparse_vector()
                row_values = np.array([sparse_data.get(x, 0) for x in cols_index])
            summary_statistics.add_rows(row_values)
        # summary_statistics.summary()
        return summary_statistics
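
    # Each partition updates its own deep-copied SummaryStatistics, and the
    # per-partition results are folded together with copy_merge/merge, so the
    # full pass over the data happens exactly once.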

    @staticmethod
    def static_summaries_in_partition(data_instances, cols_dict, abnormal_list, error):
        """
        Build a quantile summary for each selected column in one traversal.

        Parameters
        ----------
        data_instances : Table
            The input data

        cols_dict : dict
            Specify which column(s) to apply statistics to.

        abnormal_list : list
            Specify which values are not permitted.

        Returns
        -------
        Dict of QuantileSummaries objects, keyed by column name
        """
        summary_dict = {}
        for col_name in cols_dict:
            summary_dict[col_name] = QuantileSummaries(abnormal_list=abnormal_list, error=error)
        for k, instances in data_instances:
            if isinstance(instances, Instance):
                features = instances.features
            else:
                features = instances
            for col_name, col_index in cols_dict.items():
                value = features[col_index]
                summary_obj = summary_dict[col_name]
                summary_obj.insert(value)
        return summary_dict

    @staticmethod
    def aggregate_statics(s_dict1, s_dict2):
        if s_dict1 is None and s_dict2 is None:
            return None
        if s_dict1 is None:
            return s_dict2
        if s_dict2 is None:
            return s_dict1
        new_dict = {}
        for col_name, static_1 in s_dict1.items():
            static_1.merge(s_dict2[col_name])
            new_dict[col_name] = static_1
        return new_dict

    def get_median(self):
        if self.binning_obj is None:
            self._static_quantile_summaries()
        medians = self.binning_obj.query_quantile_point(query_points=0.5)
        return medians

    @property
    def median(self):
        median_dict = self.get_median()
        return np.array([median_dict[self.header[idx]] for idx in self.cols_index])

    def get_quantile_point(self, quantile):
        """
        Return the specific quantile point value

        Parameters
        ----------
        quantile : float, 0 <= quantile <= 1
            The quantile point to query, e.g. 0.5 for the median.

        Returns
        -------
        A dict of result quantile points, e.g.
            quantile_point = {"x1": 3, "x2": 5, ...}
        """
        if self.binning_obj is None:
            self._static_quantile_summaries()
        quantile_points = self.binning_obj.query_quantile_point(quantile)
        return quantile_points
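
    # For example (illustrative values), get_quantile_point(0.25) returns the
    # first quartile per column. The result comes from a quantile sketch built
    # with relative error `self.error`, so it is approximate by design.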

    def get_mean(self):
        """
        Return the mean value(s) of the given columns

        Returns
        -------
        A dict of column means, keyed by column name.
        """
        return self.get_statics("mean")

    def get_variance(self):
        return self.get_statics("variance")

    def get_std_variance(self):
        return self.get_statics("stddev")

    def get_max(self):
        return self.get_statics("max_value")

    def get_min(self):
        return self.get_statics("min_value")

    def get_statics(self, data_type):
        """
        Return the specific statistic value(s) of the given columns

        Parameters
        ----------
        data_type : str, e.g. "mean", "variance", "stddev", "max_value" or "min_value"
            Specify which statistic to return.

        Returns
        -------
        A dict of results, keyed by column name.
        """
        if not self.finish_fit_statics:
            self._static_sums()
        if hasattr(self.summary_statistics, data_type):
            result_row = getattr(self.summary_statistics, data_type)
        elif hasattr(self, data_type):
            result_row = getattr(self, data_type)
        else:
            raise ValueError(f"Statistic data type: {data_type} cannot be recognized")
        # LOGGER.debug(f"col_index: {self.cols_index}, result_row: {result_row},"
        #              f"header: {self.header}, data_type: {data_type}")
        result = {}
        result_row = result_row.tolist()
        for col_idx, header_idx in enumerate(self.cols_index):
            result[self.header[header_idx]] = result_row[col_idx]
        return result
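
    # get_statics first looks the name up on SummaryStatistics (mean, variance,
    # stddev, skewness, kurtosis, ...) and then on this class (missing_ratio,
    # missing_count, median), so any property on either object can be queried
    # by name, e.g. get_statics("kurtosis").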

    def get_missing_ratio(self):
        return self.get_statics("missing_ratio")

    @property
    def missing_ratio(self):
        self.missing_static_obj = MissingStatistic()
        all_missing_ratio = self.missing_static_obj.fit(self.data_instances)
        return np.array([all_missing_ratio[self.header[idx]] for idx in self.cols_index])

    @property
    def missing_count(self):
        # missing_ratio = self.missing_ratio
        # missing_count = missing_ratio * self.data_instances.count()
        # return missing_count.astype(int)
        if self.missing_static_obj is None:
            self.missing_static_obj = MissingStatistic()
            self.missing_static_obj.fit(self.data_instances)
        all_missing_count = self.missing_static_obj.count_summary
        return np.array([all_missing_count[self.header[idx]] for idx in self.cols_index])

    @staticmethod
    def get_label_static_dict(data_instances):
        result_dict = {}
        for instance in data_instances:
            label_key = instance[1].label
            if label_key not in result_dict:
                result_dict[label_key] = 1
            else:
                result_dict[label_key] += 1
        return result_dict

    @staticmethod
    def merge_result_dict(dict_a, dict_b):
        for k, v in dict_b.items():
            if k in dict_a:
                dict_a[k] += v
            else:
                dict_a[k] = v
        return dict_a

    def get_label_histogram(self):
        label_histogram = self.data_instances.applyPartitions(self.get_label_static_dict) \
            .reduce(self.merge_result_dict)
        return label_histogram
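
# A hedged end-to-end sketch (assumes a running FATE session and an uploaded
# Table of Instance rows named `data_instances`; all names are illustrative):
#
#     summary = MultivariateStatisticalSummary(data_instances, cols_index=-1,
#                                              stat_order=4, bias=False)
#     summary.get_mean()                 # {"x1": ..., "x2": ...}
#     summary.get_statics("kurtosis")    # needs stat_order >= 4
#     summary.get_quantile_point(0.9)    # approximate 90th percentile
#     summary.get_missing_ratio()
#     summary.get_label_histogram()      # {label: count, ...}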