# classification_metric.py

import copy
import sys

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score

ROUND_NUM = 6

def neg_pos_count(labels: np.ndarray, pos_label: int):
    pos_num = ((labels == pos_label) + 0).sum()
    neg_num = len(labels) - pos_num
    return pos_num, neg_num


def sort_score_and_label(labels: np.ndarray, pred_scores: np.ndarray):
    # sort samples by predicted score in descending order
    labels = np.array(labels)
    pred_scores = np.array(pred_scores)
    sort_idx = np.flip(pred_scores.argsort())
    sorted_labels = labels[sort_idx]
    sorted_scores = pred_scores[sort_idx]
    return sorted_labels, sorted_scores
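
# Hedged usage sketch for the helpers above (illustrative values only):
#   sorted_labels, sorted_scores = sort_score_and_label([1, 0, 1], [0.2, 0.9, 0.7])
#   # sorted_scores -> [0.9, 0.7, 0.2]; sorted_labels follow the same reordering
#   pos_num, neg_num = neg_pos_count(sorted_labels, pos_label=1)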


class ConfusionMatrix(object):

    @staticmethod
    def compute(sorted_labels: list, sorted_pred_scores: list, score_thresholds: list, ret: list, pos_label=1):
        for ret_type in ret:
            assert ret_type in ['tp', 'tn', 'fp', 'fn']

        sorted_labels = np.array(sorted_labels)
        sorted_scores = np.array(sorted_pred_scores)
        # binarize in a single step so that pos_label == 0 is handled correctly
        # (assigning 0 first and then 1 would clobber all labels in that case)
        sorted_labels = (sorted_labels == pos_label).astype(int)
        score_thresholds = np.array([score_thresholds]).transpose()
        # broadcasting yields one row of predicted labels per threshold
        pred_labels = (sorted_scores > score_thresholds) + 0

        ret_dict = {}
        if 'tp' in ret or 'tn' in ret:
            match_arr = (pred_labels + sorted_labels)
            if 'tp' in ret:
                tp_num = (match_arr == 2).sum(axis=-1)
                ret_dict['tp'] = tp_num
            if 'tn' in ret:
                tn_num = (match_arr == 0).sum(axis=-1)
                ret_dict['tn'] = tn_num
        if 'fp' in ret or 'fn' in ret:
            match_arr = (sorted_labels - pred_labels)
            if 'fp' in ret:
                fp_num = (match_arr == -1).sum(axis=-1)
                ret_dict['fp'] = fp_num
            if 'fn' in ret:
                fn_num = (match_arr == 1).sum(axis=-1)
                ret_dict['fn'] = fn_num

        return ret_dict
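
# Hedged usage sketch (illustrative values, not part of the original file):
#   sorted_labels, sorted_scores = sort_score_and_label([1, 0, 1, 0], [0.8, 0.4, 0.7, 0.1])
#   cm = ConfusionMatrix.compute(sorted_labels, sorted_scores, [0.5, 0.3], ret=['tp', 'fp'])
#   # cm['tp'] and cm['fp'] each hold one count per threshold (two here)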


class ThresholdCutter(object):

    @staticmethod
    def cut_by_step(sorted_scores, steps=0.01):
        assert isinstance(steps, float) and (0 < steps < 1)
        thresholds = list(set(sorted_scores))
        # pass the requested step size through (it was previously hard-coded to 0.01)
        thresholds, cuts = ThresholdCutter.__filt_threshold(thresholds, steps)
        score_threshold = thresholds
        return score_threshold, cuts

    @staticmethod
    def fixed_interval_threshold(steps=0.01):
        intervals = np.arange(0, 100) * steps
        return intervals

    @staticmethod
    def cut_by_index(sorted_scores):
        cuts = np.array([c / 100 for c in range(100)])
        data_size = len(sorted_scores)
        indexes = [int(data_size * cut) for cut in cuts]
        score_threshold = [sorted_scores[idx] for idx in indexes]
        return score_threshold, cuts

    @staticmethod
    def __filt_threshold(thresholds, step):
        cuts = list(map(float, np.arange(0, 1, step)))
        size = len(list(thresholds))
        thresholds.sort(reverse=True)
        index_list = [int(size * cut) for cut in cuts]
        new_thresholds = [thresholds[idx] for idx in index_list]
        return new_thresholds, cuts

    @staticmethod
    def cut_by_quantile(scores, quantile_list=None, interpolation='nearest', remove_duplicate=True):
        if quantile_list is None:  # default is 20 intervals
            quantile_list = [round(i * 0.05, 3) for i in range(20)] + [1.0]
        # NumPy >= 1.22 renamed the 'interpolation' keyword to 'method'
        quantile_val = np.quantile(scores, quantile_list, method=interpolation)
        if remove_duplicate:
            quantile_val = sorted(list(set(quantile_val)))
        else:
            quantile_val = sorted(list(quantile_val))

        if len(quantile_val) == 1:
            quantile_val = [np.min(scores), np.max(scores)]
        return quantile_val
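
# Hedged usage sketch (illustrative values):
#   points = ThresholdCutter.cut_by_quantile([0.1, 0.4, 0.4, 0.7, 0.9])
#   # points are deduplicated quantile values over 21 cut points (20 intervals)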


class KS(object):

    @staticmethod
    def compute(labels, pred_scores, pos_label=1, fixed_interval_threshold=True):
        sorted_labels, sorted_scores = sort_score_and_label(labels, pred_scores)
        threshold, cuts = ThresholdCutter.cut_by_index(sorted_scores)
        confusion_mat = ConfusionMatrix.compute(sorted_labels, sorted_scores, threshold, ret=['tp', 'fp'],
                                                pos_label=pos_label)
        pos_num, neg_num = neg_pos_count(sorted_labels, pos_label=pos_label)

        assert pos_num > 0 and neg_num > 0, "error when computing KS metric, pos sample number and neg sample " \
                                            "number must be larger than 0"

        tpr_arr = confusion_mat['tp'] / pos_num
        fpr_arr = confusion_mat['fp'] / neg_num
        tpr = np.append(tpr_arr, np.array([1.0]))
        fpr = np.append(fpr_arr, np.array([1.0]))
        cuts = np.append(cuts, np.array([1.0]))
        # the KS statistic is the maximum gap between the TPR and FPR curves
        ks_curve = tpr[:-1] - fpr[:-1]
        ks_val = np.max(ks_curve)

        return ks_val, fpr, tpr, threshold, cuts
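
# Hedged usage sketch (illustrative values):
#   labels = [1, 0, 1, 1, 0, 0]
#   scores = [0.9, 0.2, 0.8, 0.6, 0.4, 0.3]
#   ks_val, fpr, tpr, thresholds, cuts = KS.compute(labels, scores)
#   # ks_val = max(tpr - fpr) over the scanned thresholds, a value in [0, 1]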


class BiClassMetric(object):

    def __init__(self, cut_method='step', remove_duplicate=False, pos_label=1):
        assert cut_method in ['step', 'quantile']
        self.cut_method = cut_method
        self.remove_duplicate = remove_duplicate  # available when cut_method is quantile
        self.pos_label = pos_label

    def prepare_confusion_mat(self, labels, scores, add_to_end=True):
        sorted_labels, sorted_scores = sort_score_and_label(labels, scores)

        score_threshold, cuts = None, None
        if self.cut_method == 'step':
            score_threshold, cuts = ThresholdCutter.cut_by_step(sorted_scores, steps=0.01)
            if add_to_end:
                # sentinel threshold below the minimum score, so every sample is predicted positive
                score_threshold.append(min(score_threshold) - 0.001)
                cuts.append(1)
        elif self.cut_method == 'quantile':
            score_threshold = ThresholdCutter.cut_by_quantile(sorted_scores, remove_duplicate=self.remove_duplicate)
            score_threshold = list(np.flip(score_threshold))

        confusion_mat = ConfusionMatrix.compute(sorted_labels, sorted_scores, score_threshold,
                                                ret=['tp', 'fp', 'fn', 'tn'], pos_label=self.pos_label)
        return confusion_mat, score_threshold, cuts

    def compute(self, labels, scores):
        confusion_mat, score_threshold, cuts = self.prepare_confusion_mat(labels, scores)
        metric_scores = self.compute_metric_from_confusion_mat(confusion_mat)
        return list(metric_scores), score_threshold, cuts

    def compute_metric_from_confusion_mat(self, *args):
        raise NotImplementedError()


class Lift(BiClassMetric):
    """
    Compute lift
    """

    @staticmethod
    def _lift_helper(val):
        tp, fp, fn, tn, labels_num = val[0], val[1], val[2], val[3], val[4]

        lift_x_type, lift_y_type = [], []
        for label_type in ['1', '0']:
            # when treating label 0 as positive, swap the confusion-matrix cells
            if label_type == '0':
                tp, tn = tn, tp
                fp, fn = fn, fp

            if labels_num == 0:
                lift_x = 1
                denominator = 1
            else:
                lift_x = (tp + fp) / labels_num
                denominator = (tp + fn) / labels_num

            if tp + fp == 0:
                numerator = 1
            else:
                numerator = tp / (tp + fp)

            if denominator == 0:
                lift_y = sys.float_info.max
            else:
                lift_y = numerator / denominator

            lift_x_type.insert(0, lift_x)
            lift_y_type.insert(0, lift_y)

        return lift_x_type, lift_y_type

    def compute(self, labels, pred_scores, pos_label=1):
        confusion_mat, score_threshold, cuts = self.prepare_confusion_mat(labels, pred_scores, add_to_end=False)
        lifts_y, lifts_x = self.compute_metric_from_confusion_mat(confusion_mat, len(labels))
        return lifts_y, lifts_x, list(score_threshold)

    def compute_metric_from_confusion_mat(self, confusion_mat, labels_len):
        labels_nums = np.zeros(len(confusion_mat['tp'])) + labels_len
        rs = map(self._lift_helper, zip(confusion_mat['tp'], confusion_mat['fp'],
                                        confusion_mat['fn'], confusion_mat['tn'], labels_nums))
        rs = list(rs)
        lifts_x, lifts_y = [i[0] for i in rs], [i[1] for i in rs]
        return lifts_y, lifts_x
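
# Hedged usage sketch (illustrative values):
#   lifts_y, lifts_x, thresholds = Lift().compute([1, 0, 1, 0], [0.8, 0.3, 0.6, 0.2])
#   # each entry pairs the [label-0, label-1] values at one threshold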


class Gain(BiClassMetric):
    """
    Compute Gain
    """

    @staticmethod
    def _gain_helper(val):
        tp, fp, fn, tn, num_label = val[0], val[1], val[2], val[3], val[4]

        gain_x_type, gain_y_type = [], []
        for pos_label in ['1', '0']:
            # when treating label 0 as positive, swap the confusion-matrix cells
            if pos_label == '0':
                tp, tn = tn, tp
                fp, fn = fn, fp

            if num_label == 0:
                gain_x = 1
            else:
                gain_x = float((tp + fp) / num_label)

            num_positives = tp + fn
            if num_positives == 0:
                gain_y = 1
            else:
                gain_y = float(tp / num_positives)

            gain_x_type.insert(0, gain_x)
            gain_y_type.insert(0, gain_y)

        return gain_x_type, gain_y_type

    def compute(self, labels, pred_scores, pos_label=1):
        confusion_mat, score_threshold, cuts = self.prepare_confusion_mat(labels, pred_scores, add_to_end=False)
        gain_y, gain_x = self.compute_metric_from_confusion_mat(confusion_mat, len(labels))
        return gain_y, gain_x, list(score_threshold)

    def compute_metric_from_confusion_mat(self, confusion_mat, labels_len):
        labels_nums = np.zeros(len(confusion_mat['tp'])) + labels_len
        rs = map(self._gain_helper, zip(confusion_mat['tp'], confusion_mat['fp'],
                                        confusion_mat['fn'], confusion_mat['tn'], labels_nums))
        rs = list(rs)
        gain_x, gain_y = [i[0] for i in rs], [i[1] for i in rs]
        return gain_y, gain_x
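
# Hedged usage sketch (illustrative values):
#   gain_y, gain_x, thresholds = Gain().compute([1, 0, 1, 0], [0.8, 0.3, 0.6, 0.2])
#   # gain_y is the fraction of positives captured at each depth gain_x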


class BiClassPrecision(BiClassMetric):
    """
    Compute binary classification precision
    """

    def compute_metric_from_confusion_mat(self, confusion_mat, formatted=True, impute_val=1.0):
        numerator = confusion_mat['tp']
        denominator = (confusion_mat['tp'] + confusion_mat['fp'])
        zero_indexes = (denominator == 0)
        denominator[zero_indexes] = 1
        precision_scores = numerator / denominator
        precision_scores[zero_indexes] = impute_val  # impute_val is for prettifying when drawing pr curves

        if formatted:
            score_formatted = [[0, i] for i in precision_scores]
            return score_formatted
        else:
            return precision_scores


class MultiClassPrecision(object):
    """
    Compute multi-classification precision
    """

    def compute(self, labels, pred_scores):
        # pred_scores are predicted class labels here, not probabilities
        all_labels = sorted(set(labels).union(set(pred_scores)))
        return precision_score(labels, pred_scores, average=None), all_labels


class BiClassRecall(BiClassMetric):
    """
    Compute binary classification recall
    """

    def compute_metric_from_confusion_mat(self, confusion_mat, formatted=True):
        recall_scores = confusion_mat['tp'] / (confusion_mat['tp'] + confusion_mat['fn'])

        if formatted:
            score_formatted = [[0, i] for i in recall_scores]
            return score_formatted
        else:
            return recall_scores


class MultiClassRecall(object):
    """
    Compute multi-classification recall
    """

    def compute(self, labels, pred_scores):
        # pred_scores are predicted class labels here, not probabilities
        all_labels = sorted(set(labels).union(set(pred_scores)))
        return recall_score(labels, pred_scores, average=None), all_labels


class BiClassAccuracy(BiClassMetric):
    """
    Compute binary classification accuracy
    """

    def compute(self, labels, scores, normalize=True):
        confusion_mat, score_threshold, cuts = self.prepare_confusion_mat(labels, scores)
        metric_scores = self.compute_metric_from_confusion_mat(confusion_mat, normalize=normalize)
        return list(metric_scores), score_threshold[: len(metric_scores)], cuts[: len(metric_scores)]

    def compute_metric_from_confusion_mat(self, confusion_mat, normalize=True):
        rs = (confusion_mat['tp'] + confusion_mat['tn']) / \
             (confusion_mat['tp'] + confusion_mat['tn'] + confusion_mat['fn'] + confusion_mat['fp']) if normalize \
            else (confusion_mat['tp'] + confusion_mat['tn'])
        # drop the entry produced by the sentinel threshold appended in prepare_confusion_mat
        return rs[:-1]


class MultiClassAccuracy(object):
    """
    Compute multi-classification accuracy
    """

    def compute(self, labels, pred_scores, normalize=True):
        # pred_scores are predicted class labels here, not probabilities
        return accuracy_score(labels, pred_scores, normalize=normalize)


class FScore(object):
    """
    Compute F score from bi-class confusion mat
    """

    @staticmethod
    def compute(labels, pred_scores, beta=1, pos_label=1):
        sorted_labels, sorted_scores = sort_score_and_label(labels, pred_scores)
        _, cuts = ThresholdCutter.cut_by_step(sorted_scores, steps=0.01)
        fixed_interval_threshold = ThresholdCutter.fixed_interval_threshold()
        confusion_mat = ConfusionMatrix.compute(sorted_labels, sorted_scores,
                                                fixed_interval_threshold,
                                                ret=['tp', 'fp', 'fn', 'tn'], pos_label=pos_label)
        precision_computer = BiClassPrecision()
        recall_computer = BiClassRecall()
        p_score = precision_computer.compute_metric_from_confusion_mat(confusion_mat, formatted=False)
        r_score = recall_computer.compute_metric_from_confusion_mat(confusion_mat, formatted=False)

        beta_2 = beta * beta
        denominator = (beta_2 * p_score + r_score)
        denominator[denominator == 0] = 1e-6  # in case denominator is 0
        numerator = (1 + beta_2) * (p_score * r_score)
        f_score = numerator / denominator

        return f_score, fixed_interval_threshold, cuts
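
# Hedged usage sketch (illustrative values):
#   f_scores, thresholds, cuts = FScore.compute([1, 0, 1, 0], [0.8, 0.3, 0.6, 0.2], beta=1)
#   # with beta=1 this is the F1 curve over the fixed 0.00-0.99 threshold grid,
#   # F_beta = (1 + beta^2) * p * r / (beta^2 * p + r)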


class PSI(object):

    def compute(self, train_scores: list, validate_scores: list, train_labels=None, validate_labels=None,
                debug=False, str_intervals=False, round_num=3, pos_label=1):
        """
        train/validate scores: predicted scores on the train/validate set
        train/validate labels: true labels; if both are given, also count the
            positive-sample percentage in every interval
        debug: print debug messages
        str_intervals: return intervals as strings
        round_num: number of decimal places used when formatting interval bounds
        pos_label: the positive label
        """

        train_scores = np.array(train_scores)
        validate_scores = np.array(validate_scores)
        quantile_points = ThresholdCutter().cut_by_quantile(train_scores)

        train_count = self.quantile_binning_and_count(train_scores, quantile_points)
        validate_count = self.quantile_binning_and_count(validate_scores, quantile_points)

        train_pos_perc, validate_pos_perc = None, None
        if train_labels is not None and validate_labels is not None:
            assert len(train_labels) == len(train_scores) and len(validate_labels) == len(validate_scores)
            train_labels, validate_labels = np.array(train_labels), np.array(validate_labels)
            train_pos_count = self.quantile_binning_and_count(train_scores[train_labels == pos_label],
                                                              quantile_points)
            validate_pos_count = self.quantile_binning_and_count(validate_scores[validate_labels == pos_label],
                                                                 quantile_points)

            train_pos_perc = np.array(train_pos_count['count']) / np.array(train_count['count'])
            validate_pos_perc = np.array(validate_pos_count['count']) / np.array(validate_count['count'])

            # handle special cases
            train_pos_perc[train_pos_perc == np.inf] = -1
            validate_pos_perc[validate_pos_perc == np.inf] = -1
            train_pos_perc[np.isnan(train_pos_perc)] = 0
            validate_pos_perc[np.isnan(validate_pos_perc)] = 0

        if debug:
            print(train_count)
            print(validate_count)

        assert (train_count['interval'] == validate_count['interval']), 'train count interval is not equal to ' \
                                                                        'validate count interval'

        expected_interval = np.array(train_count['count'])
        actual_interval = np.array(validate_count['count'])

        # np.float was removed in NumPy >= 1.24; use the builtin float instead
        expected_interval = expected_interval.astype(float)
        actual_interval = actual_interval.astype(float)

        psi_scores, total_psi, expected_interval, actual_interval, expected_percentage, actual_percentage \
            = self.psi_score(expected_interval, actual_interval, len(train_scores), len(validate_scores))

        intervals = train_count['interval'] if not str_intervals else PSI.intervals_to_str(train_count['interval'],
                                                                                           round_num=round_num)

        if train_labels is None and validate_labels is None:
            return psi_scores, total_psi, expected_interval, expected_percentage, actual_interval, \
                   actual_percentage, intervals
        else:
            return psi_scores, total_psi, expected_interval, expected_percentage, actual_interval, \
                   actual_percentage, train_pos_perc, validate_pos_perc, intervals

    @staticmethod
    def quantile_binning_and_count(scores, quantile_points):
        """
        left edge and right edge of the last interval are both closed
        """
        assert len(quantile_points) >= 2

        left_bounds = copy.deepcopy(quantile_points[:-1])
        right_bounds = copy.deepcopy(quantile_points[1:])
        last_interval_left = left_bounds.pop()
        last_interval_right = right_bounds.pop()

        bin_result_1, bin_result_2 = None, None
        if len(left_bounds) != 0 and len(right_bounds) != 0:
            bin_result_1 = pd.cut(scores, pd.IntervalIndex.from_arrays(left_bounds, right_bounds, closed='left'))
        bin_result_2 = pd.cut(scores, pd.IntervalIndex.from_arrays([last_interval_left], [last_interval_right],
                                                                   closed='both'))

        count1 = None if bin_result_1 is None else bin_result_1.value_counts().reset_index()
        count2 = bin_result_2.value_counts().reset_index()

        # if all predicted scores are identical, count1 is None and only one interval exists;
        # select columns positionally because pandas renamed the value column of
        # value_counts().reset_index() ('count' in pandas >= 2.0, 0 before)
        if count1 is not None:
            final_interval = list(count1.iloc[:, 0]) + list(count2.iloc[:, 0])
            final_count = list(count1.iloc[:, 1]) + list(count2.iloc[:, 1])
        else:
            final_interval = list(count2.iloc[:, 0])
            final_count = list(count2.iloc[:, 1])

        rs = {'interval': final_interval, 'count': final_count}
        return rs

    @staticmethod
    def interval_psi_score(val):
        expected, actual = val[0], val[1]
        return (actual - expected) * np.log(actual / expected)

    @staticmethod
    def intervals_to_str(intervals, round_num=3):
        str_intervals = []
        for interval in intervals:
            left_bound, right_bound = '[', ']'
            if interval.closed == 'left':
                right_bound = ')'
            elif interval.closed == 'right':
                left_bound = '('
            str_intervals.append("{}{}, {}{}".format(left_bound, round(interval.left, round_num),
                                                     round(interval.right, round_num), right_bound))
        return str_intervals

    @staticmethod
    def psi_score(expected_interval: np.ndarray, actual_interval: np.ndarray, expect_total_num, actual_total_num,
                  debug=False):
        expected_interval[expected_interval == 0] = 1e-6  # in case no overlap samples
        actual_interval[actual_interval == 0] = 1e-6  # in case no overlap samples

        expected_percentage = expected_interval / expect_total_num
        actual_percentage = actual_interval / actual_total_num

        if debug:
            print(expected_interval)
            print(actual_interval)
            print(expected_percentage)
            print(actual_percentage)

        psi_scores = list(map(PSI.interval_psi_score, zip(expected_percentage, actual_percentage)))
        psi_scores = np.array(psi_scores)
        total_psi = psi_scores.sum()
        return psi_scores, total_psi, expected_interval, actual_interval, expected_percentage, actual_percentage
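
# Hedged usage sketch (illustrative values). Per bin,
# PSI_i = (actual_pct_i - expected_pct_i) * ln(actual_pct_i / expected_pct_i),
# and total PSI is the sum over bins:
#   train = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#   valid = [0.15, 0.25, 0.3, 0.45, 0.5, 0.65, 0.7, 0.85, 0.9]
#   psi_scores, total_psi, *rest = PSI().compute(train, valid)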


class KSTest(object):

    @staticmethod
    def compute(train_scores, validate_scores):
        """
        train/validate scores: predicted scores on the train/validate set;
        returns the p-value of a two-sample Kolmogorov-Smirnov test
        """
        return stats.ks_2samp(train_scores, validate_scores).pvalue
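
# Hedged usage sketch (illustrative values):
#   p_value = KSTest.compute([0.1, 0.4, 0.8], [0.2, 0.5, 0.7])
#   # a small p-value suggests the two score distributions differ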


class AveragePrecisionScore(object):

    @staticmethod
    def compute(train_scores, validate_scores, train_labels, validate_labels):
        """
        train/validate scores: predicted scores on the train/validate set
        train/validate labels: true labels;
        returns the absolute difference between the two average-precision scores
        """
        train_mAP = average_precision_score(train_labels, train_scores)
        validate_mAP = average_precision_score(validate_labels, validate_scores)
        return abs(train_mAP - validate_mAP)


class Distribution(object):

    @staticmethod
    def compute(train_scores: list, validate_scores: list):
        """
        train/validate scores: lists of (id, score) pairs predicted on the
        train/validate set; returns the fraction of shared ids whose scores differ
        """
        train_scores = np.array(train_scores)
        validate_scores = np.array(validate_scores)
        validate_scores = dict(validate_scores)
        count = 0
        for key, value in train_scores:
            if key in validate_scores.keys() and value != validate_scores.get(key):
                count += 1
        return count / len(train_scores)
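

if __name__ == "__main__":
    # Minimal smoke test with synthetic data; the values below are illustrative
    # only and are not part of the original module.
    rng = np.random.RandomState(42)
    labels = rng.randint(0, 2, size=200)
    scores = np.clip(labels * 0.3 + rng.rand(200) * 0.7, 0.0, 1.0)

    ks_val, fpr, tpr, thresholds, cuts = KS.compute(labels, scores)
    print("KS:", round(float(ks_val), ROUND_NUM))

    f_scores, f_thresholds, f_cuts = FScore.compute(labels, scores)
    print("max F1:", round(float(np.max(f_scores)), ROUND_NUM))

    psi_scores, total_psi, *_ = PSI().compute(scores[:100], scores[100:])
    print("total PSI:", round(float(total_psi), ROUND_NUM))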