# optimizer.py
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. import numpy as np
  18. from federatedml.linear_model.linear_model_weight import LinearModelWeights
  19. from federatedml.util import LOGGER, consts, paillier_check, ipcl_operator
class _Optimizer(object):
    """Base class for the gradient-descent optimizers used by FATE linear models.

    Responsibilities visible in this class:
      * learning-rate decay scheduling (``decay_learning_rate``),
      * adding the L2 regular term to gradients and applying L1 (proximal
        soft-thresholding) or L2 weight updates,
      * an optional proximal term scaled by ``mu`` that pulls the new weights
        toward ``prev_round_weights`` when one is supplied.

    Subclasses override :meth:`apply_gradients` to turn a raw gradient into
    the step that ``regularization_update`` subtracts from the weights.
    """

    def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu=0):
        # learning_rate: base step size before decay is applied
        # alpha: L1/L2 regularization strength
        # penalty: consts.L1_PENALTY, consts.L2_PENALTY, or any other value for "no penalty"
        # decay / decay_sqrt: decay coefficient and whether decay uses a sqrt schedule
        # mu: coefficient of the optional proximal term (0 disables it)
        self.learning_rate = learning_rate
        self.iters = 0
        self.alpha = alpha
        self.penalty = penalty
        self.decay = decay
        self.decay_sqrt = decay_sqrt
        self.mu = mu

    def decay_learning_rate(self):
        """Return the learning rate decayed by the current iteration count.

        lr / sqrt(1 + decay * iters) when ``decay_sqrt`` is truthy,
        otherwise lr / (1 + decay * iters).
        """
        if self.decay_sqrt:
            lr = self.learning_rate / np.sqrt(1 + self.decay * self.iters)
        else:
            lr = self.learning_rate / (1 + self.decay * self.iters)
        return lr

    @property
    def shrinkage_val(self):
        """Soft-threshold width used by the L1 proximal update.

        NOTE(review): this divides by sqrt(iters) and so raises/returns inf
        when ``iters`` is 0 — callers appear to invoke ``set_iters`` with a
        positive value before any L1 update; confirm at the call sites.
        """
        this_step_size = self.learning_rate / np.sqrt(self.iters)
        return self.alpha * this_step_size

    def set_iters(self, iters):
        # Record the current iteration count; it feeds both
        # decay_learning_rate() and shrinkage_val.
        self.iters = iters

    def apply_gradients(self, grad):
        # Abstract hook: subclasses convert a raw gradient into the update step.
        raise NotImplementedError("Should not call here")

    def _l1_updator(self, model_weights: LinearModelWeights, gradient):
        """Proximal (soft-thresholding) update for the L1 penalty.

        The intercept term (last element, when ``fit_intercept``) is updated
        by plain gradient descent and is NOT shrunk.
        """
        coef_ = model_weights.coef_
        if model_weights.fit_intercept:
            gradient_without_intercept = gradient[: -1]
        else:
            gradient_without_intercept = gradient
        # Soft-threshold: sign(w - g) * max(0, |w - g| - shrinkage_val)
        new_weights = np.sign(coef_ - gradient_without_intercept) * np.maximum(0, np.abs(
            coef_ - gradient_without_intercept) - self.shrinkage_val)
        if model_weights.fit_intercept:
            new_weights = np.append(new_weights, model_weights.intercept_)
            new_weights[-1] -= gradient[-1]
        new_param = LinearModelWeights(new_weights, model_weights.fit_intercept, model_weights.raise_overflow_error)
        # LOGGER.debug("In _l1_updator, original weight: {}, new_weights: {}".format(
        #     model_weights.unboxed, new_weights
        # ))
        return new_param

    def _l2_updator(self, lr_weights: LinearModelWeights, gradient):
        """
        For l2 regularization, the regular term has been added in gradients.
        """
        new_weights = lr_weights.unboxed - gradient
        new_param = LinearModelWeights(new_weights, lr_weights.fit_intercept, lr_weights.raise_overflow_error)
        return new_param

    def add_regular_to_grad(self, grad, lr_weights):
        """Return ``grad`` with the L2 regular term (alpha * coef) added.

        The IPCL-encrypted weight representation is handled through
        ``ipcl_operator``; for L1 or no penalty, ``grad`` is returned
        unchanged (L1 is applied later in ``_l1_updator``). The intercept
        slot is never regularized.
        """
        if self.penalty == consts.L2_PENALTY:
            if paillier_check.is_single_ipcl_encrypted_number(lr_weights.unboxed):
                grad_ct = ipcl_operator.merge_encrypted_number_array(grad)
                grad_ct = np.array(grad_ct)
                if lr_weights.fit_intercept:
                    # Zero alpha for the intercept slot so it is not penalized.
                    alpha = np.append(np.ones(len(grad) - 1) * self.alpha, 0.0)
                    new_grad = grad_ct + lr_weights.unboxed.item(0) * alpha
                else:
                    new_grad = grad_ct + self.alpha * lr_weights.coef_
            else:
                if lr_weights.fit_intercept:
                    # NOTE(review): if ``grad`` is a numpy array this slice is a
                    # view, so += also mutates ``grad`` in place — confirm
                    # callers do not reuse the original gradient afterwards.
                    gradient_without_intercept = grad[: -1]
                    gradient_without_intercept += self.alpha * lr_weights.coef_
                    new_grad = np.append(gradient_without_intercept, grad[-1])
                else:
                    new_grad = grad + self.alpha * lr_weights.coef_
        else:
            new_grad = grad
        return new_grad

    def regularization_update(self, model_weights: LinearModelWeights, grad,
                              prev_round_weights: LinearModelWeights = None):
        """Apply the penalty-specific weight update, then the optional proximal step.

        With L1 the proximal soft-threshold update is used; with L2 the
        (already regularized) gradient is subtracted; otherwise a plain
        gradient step is taken. When ``prev_round_weights`` is given, the
        coefficients (not the intercept) are additionally pulled toward the
        previous round's coefficients, scaled by ``mu``.
        """
        # LOGGER.debug(f"In regularization_update, input model_weights: {model_weights.unboxed}")
        if self.penalty == consts.L1_PENALTY:
            model_weights = self._l1_updator(model_weights, grad)
        elif self.penalty == consts.L2_PENALTY:
            model_weights = self._l2_updator(model_weights, grad)
        else:
            new_vars = model_weights.unboxed - grad
            model_weights = LinearModelWeights(new_vars,
                                               model_weights.fit_intercept,
                                               model_weights.raise_overflow_error)
        if prev_round_weights is not None:  # additional proximal term
            coef_ = model_weights.unboxed
            if model_weights.fit_intercept:
                coef_without_intercept = coef_[: -1]
            else:
                coef_without_intercept = coef_
            # In-place subtraction of the proximal pull toward the previous round.
            coef_without_intercept -= self.mu * (model_weights.coef_ - prev_round_weights.coef_)
            if model_weights.fit_intercept:
                new_coef_ = np.append(coef_without_intercept, coef_[-1])
            else:
                new_coef_ = coef_without_intercept
            model_weights = LinearModelWeights(new_coef_,
                                               model_weights.fit_intercept,
                                               model_weights.raise_overflow_error)
        return model_weights

    def __l1_loss_norm(self, model_weights: LinearModelWeights):
        # L1 penalty contribution to the loss: alpha * sum(|coef|).
        coef_ = model_weights.coef_
        loss_norm = np.sum(self.alpha * np.abs(coef_))
        return loss_norm

    def __l2_loss_norm(self, model_weights: LinearModelWeights):
        # L2 penalty contribution to the loss: 0.5 * alpha * ||coef||^2.
        coef_ = model_weights.coef_
        loss_norm = 0.5 * self.alpha * np.dot(coef_, coef_)
        return loss_norm

    def __add_proximal(self, model_weights, prev_round_weights):
        # Proximal-term contribution: 0.5 * mu * ||coef - prev_coef||^2.
        prev_round_coef_ = prev_round_weights.coef_
        coef_ = model_weights.coef_
        diff = coef_ - prev_round_coef_
        loss_norm = self.mu * 0.5 * np.dot(diff, diff)
        return loss_norm

    def loss_norm(self, model_weights: LinearModelWeights, prev_round_weights: LinearModelWeights = None):
        """Return the regularization part of the loss (plus proximal term), or
        None when there is no penalty and no previous-round weights."""
        proximal_term = None
        if prev_round_weights is not None:
            proximal_term = self.__add_proximal(model_weights, prev_round_weights)
        if self.penalty == consts.L1_PENALTY:
            loss_norm_value = self.__l1_loss_norm(model_weights)
        elif self.penalty == consts.L2_PENALTY:
            loss_norm_value = self.__l2_loss_norm(model_weights)
        else:
            loss_norm_value = None
        # additional proximal term
        if loss_norm_value is None:
            loss_norm_value = proximal_term
        elif proximal_term is not None:
            loss_norm_value += proximal_term
        return loss_norm_value

    def hess_vector_norm(self, delta_s: LinearModelWeights):
        """Return the penalty's Hessian-vector product for ``delta_s``.

        Zero for L1 and no-penalty; alpha * delta_s for L2 (the L2 Hessian is
        alpha * I).
        """
        if self.penalty == consts.L1_PENALTY:
            return LinearModelWeights(np.zeros_like(delta_s.unboxed),
                                      fit_intercept=delta_s.fit_intercept,
                                      raise_overflow_error=delta_s.raise_overflow_error)
        elif self.penalty == consts.L2_PENALTY:
            return LinearModelWeights(self.alpha * np.array(delta_s.unboxed),
                                      fit_intercept=delta_s.fit_intercept,
                                      raise_overflow_error=delta_s.raise_overflow_error)
        else:
            return LinearModelWeights(np.zeros_like(delta_s.unboxed),
                                      fit_intercept=delta_s.fit_intercept,
                                      raise_overflow_error=delta_s.raise_overflow_error)

    def update_model(self, model_weights: LinearModelWeights, grad, prev_round_weights: LinearModelWeights = None,
                     has_applied=True):
        """Produce updated weights from ``grad``.

        When ``has_applied`` is False, the regular term is added to the
        gradient and the subclass' ``apply_gradients`` transform is applied
        first; otherwise ``grad`` is taken as the final step already.
        """
        if not has_applied:
            grad = self.add_regular_to_grad(grad, model_weights)
            delta_grad = self.apply_gradients(grad)
        else:
            delta_grad = grad
        model_weights = self.regularization_update(model_weights, delta_grad, prev_round_weights)
        return model_weights
  165. class _SgdOptimizer(_Optimizer):
  166. def apply_gradients(self, grad):
  167. learning_rate = self.decay_learning_rate()
  168. delta_grad = learning_rate * grad
  169. # LOGGER.debug("In sgd optimizer, learning_rate: {}, delta_grad: {}".format(learning_rate, delta_grad))
  170. return delta_grad
  171. class _RMSPropOptimizer(_Optimizer):
  172. def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu):
  173. super().__init__(learning_rate, alpha, penalty, decay, decay_sqrt)
  174. self.rho = 0.99
  175. self.opt_m = None
  176. def apply_gradients(self, grad):
  177. learning_rate = self.decay_learning_rate()
  178. if self.opt_m is None:
  179. self.opt_m = np.zeros_like(grad)
  180. self.opt_m = self.rho * self.opt_m + (1 - self.rho) * np.square(grad)
  181. self.opt_m = np.array(self.opt_m, dtype=np.float64)
  182. delta_grad = learning_rate * grad / np.sqrt(self.opt_m + 1e-6)
  183. return delta_grad
  184. class _AdaGradOptimizer(_Optimizer):
  185. def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu):
  186. super().__init__(learning_rate, alpha, penalty, decay, decay_sqrt)
  187. self.opt_m = None
  188. def apply_gradients(self, grad):
  189. learning_rate = self.decay_learning_rate()
  190. if self.opt_m is None:
  191. self.opt_m = np.zeros_like(grad)
  192. self.opt_m = self.opt_m + np.square(grad)
  193. self.opt_m = np.array(self.opt_m, dtype=np.float64)
  194. delta_grad = learning_rate * grad / (np.sqrt(self.opt_m) + 1e-7)
  195. return delta_grad
  196. class _NesterovMomentumSGDOpimizer(_Optimizer):
  197. def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu):
  198. super().__init__(learning_rate, alpha, penalty, decay, decay_sqrt)
  199. self.nesterov_momentum_coeff = 0.9
  200. self.opt_m = None
  201. def apply_gradients(self, grad):
  202. learning_rate = self.decay_learning_rate()
  203. if self.opt_m is None:
  204. self.opt_m = np.zeros_like(grad)
  205. v = self.nesterov_momentum_coeff * self.opt_m - learning_rate * grad
  206. delta_grad = self.nesterov_momentum_coeff * self.opt_m - (1 + self.nesterov_momentum_coeff) * v
  207. self.opt_m = v
  208. # LOGGER.debug('In nesterov_momentum, opt_m: {}, v: {}, delta_grad: {}'.format(
  209. # self.opt_m, v, delta_grad
  210. # ))
  211. return delta_grad
  212. class _AdamOptimizer(_Optimizer):
  213. def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu):
  214. super().__init__(learning_rate, alpha, penalty, decay, decay_sqrt)
  215. self.opt_beta1 = 0.9
  216. self.opt_beta2 = 0.999
  217. self.opt_beta1_decay = 1.0
  218. self.opt_beta2_decay = 1.0
  219. self.opt_m = None
  220. self.opt_v = None
  221. def apply_gradients(self, grad):
  222. learning_rate = self.decay_learning_rate()
  223. if self.opt_m is None:
  224. self.opt_m = np.zeros_like(grad)
  225. if self.opt_v is None:
  226. self.opt_v = np.zeros_like(grad)
  227. self.opt_beta1_decay = self.opt_beta1_decay * self.opt_beta1
  228. self.opt_beta2_decay = self.opt_beta2_decay * self.opt_beta2
  229. self.opt_m = self.opt_beta1 * self.opt_m + (1 - self.opt_beta1) * grad
  230. self.opt_v = self.opt_beta2 * self.opt_v + (1 - self.opt_beta2) * np.square(grad)
  231. opt_m_hat = self.opt_m / (1 - self.opt_beta1_decay)
  232. opt_v_hat = self.opt_v / (1 - self.opt_beta2_decay)
  233. opt_v_hat = np.array(opt_v_hat, dtype=np.float64)
  234. delta_grad = learning_rate * opt_m_hat / (np.sqrt(opt_v_hat) + 1e-8)
  235. return delta_grad
  236. class _StochasticQuansiNewtonOptimizer(_Optimizer):
  237. def __init__(self, learning_rate, alpha, penalty, decay, decay_sqrt, mu):
  238. super().__init__(learning_rate, alpha, penalty, decay, decay_sqrt)
  239. self.__opt_hess = None
  240. def apply_gradients(self, grad):
  241. learning_rate = self.decay_learning_rate()
  242. # LOGGER.debug("__opt_hess is: {}".format(self.__opt_hess))
  243. if self.__opt_hess is None:
  244. delta_grad = learning_rate * grad
  245. else:
  246. delta_grad = learning_rate * self.__opt_hess.dot(grad)
  247. # LOGGER.debug("In sqn updater, grad: {}, delta_grad: {}".format(grad, delta_grad))
  248. return delta_grad
  249. def set_hess_matrix(self, hess_matrix):
  250. self.__opt_hess = hess_matrix
  251. def optimizer_factory(param):
  252. try:
  253. optimizer_type = param.optimizer
  254. learning_rate = param.learning_rate
  255. alpha = param.alpha
  256. penalty = param.penalty
  257. decay = param.decay
  258. decay_sqrt = param.decay_sqrt
  259. if hasattr(param, 'mu'):
  260. mu = param.mu
  261. else:
  262. mu = 0.0
  263. init_params = [learning_rate, alpha, penalty, decay, decay_sqrt, mu]
  264. except AttributeError:
  265. raise AttributeError("Optimizer parameters has not been totally set")
  266. LOGGER.debug("in optimizer_factory, optimizer_type: {}, learning_rate: {}, alpha: {}, penalty: {},"
  267. "decay: {}, decay_sqrt: {}".format(optimizer_type, *init_params))
  268. if optimizer_type == 'sgd':
  269. return _SgdOptimizer(*init_params)
  270. elif optimizer_type == 'nesterov_momentum_sgd':
  271. return _NesterovMomentumSGDOpimizer(*init_params)
  272. elif optimizer_type == 'rmsprop':
  273. return _RMSPropOptimizer(*init_params)
  274. elif optimizer_type == 'adam':
  275. return _AdamOptimizer(*init_params)
  276. elif optimizer_type == 'adagrad':
  277. return _AdaGradOptimizer(*init_params)
  278. elif optimizer_type == 'sqn':
  279. return _StochasticQuansiNewtonOptimizer(*init_params)
  280. else:
  281. raise NotImplementedError("Optimize method cannot be recognized: {}".format(optimizer_type))