import copy
import gc
import logging
import time
from collections import Counter

import numpy as np
import torch
import torch._utils
import torch.nn as nn
import torch.nn.functional as F

import model
import utils
from communication import ONLINE, TARGET, BOTH, LOCAL, GLOBAL, DAPU, NONE, EMA, \
    DYNAMIC_DAPU, DYNAMIC_EMA_ONLINE, SELECTIVE_EMA
from easyfl.client.base import BaseClient
from easyfl.distributed.distributed import CPU

logger = logging.getLogger(__name__)

L2 = "l2"


class FedSSLClient(BaseClient):
    def __init__(self, cid, conf, train_data, test_data, device, sleep_time=0):
        super(FedSSLClient, self).__init__(cid, conf, train_data, test_data, device, sleep_time)
        self._local_model = None
        self.DAPU_predictor = LOCAL
        self.encoder_distance = 1
        self.encoder_distances = []
        self.previous_trained_round = -1
        self.weight_scaler = None

    def decompression(self):
        if self.model is None:
            # Initialization at beginning of the task
            self.model = self.compressed_model

        self.update_model()

    def update_model(self):
        if self.conf.model in [model.MoCo, model.MoCoV2]:
            self.model.encoder_q = self.compressed_model.encoder_q
            # self.model.encoder_k = copy.deepcopy(self._local_model.encoder_k)
        elif self.conf.model == model.SimCLR:
            self.model.online_encoder = self.compressed_model.online_encoder
        elif self.conf.model in [model.SimSiam, model.SimSiamNoSG]:
            if self._local_model is None:
                self.model.online_encoder = self.compressed_model.online_encoder
                self.model.online_predictor = self.compressed_model.online_predictor
                return

            if self.conf.update_encoder == ONLINE:
                online_encoder = self.compressed_model.online_encoder
            else:
                raise ValueError(f"Encoder: aggregate {self.conf.aggregate_encoder}, "
                                 f"update {self.conf.update_encoder} is not supported")

            if self.conf.update_predictor == GLOBAL:
                predictor = self.compressed_model.online_predictor
            else:
                raise ValueError(f"Predictor: {self.conf.update_predictor} is not supported")

            self.model.online_encoder = copy.deepcopy(online_encoder)
            self.model.online_predictor = copy.deepcopy(predictor)
        elif self.conf.model in [model.Symmetric, model.SymmetricNoSG]:
            self.model.online_encoder = self.compressed_model.online_encoder
        elif self.conf.model in [model.BYOL, model.BYOLNoSG, model.BYOLNoPredictor]:
            if self._local_model is None:
                logger.info("Use aggregated encoder and predictor")
                self.model.online_encoder = self.compressed_model.online_encoder
                self.model.target_encoder = self.compressed_model.online_encoder
                self.model.online_predictor = self.compressed_model.online_predictor
                return

            def ema_online():
                self._calculate_weight_scaler()
                logger.info(f"Encoder: update online with EMA of global encoder @ round {self.conf.round_id}")
                weight = self.encoder_distance
                weight = min(1, self.weight_scaler * weight)
                weight = 1 - weight
                self.compressed_model = self.compressed_model.cpu()
                online_encoder = self.compressed_model.online_encoder
                target_encoder = self._local_model.target_encoder
                ema_updater = model.EMA(weight)
                model.update_moving_average(ema_updater, online_encoder, self._local_model.online_encoder)
                return online_encoder, target_encoder

            def ema_predictor():
                logger.info("Predictor: use dynamic DAPU")
                distance = self.encoder_distance
                distance = min(1, distance * self.weight_scaler)
                if distance > 0.5:
                    weight = distance
                    ema_updater = model.EMA(weight)
                    predictor = self._local_model.online_predictor
                    model.update_moving_average(ema_updater, predictor, self.compressed_model.online_predictor)
                else:
                    weight = 1 - distance
                    ema_updater = model.EMA(weight)
                    predictor = self.compressed_model.online_predictor
                    model.update_moving_average(ema_updater, predictor, self._local_model.online_predictor)
                return predictor
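            # A worked sketch of the divergence-aware EMA rule implemented by ema_online()
            # and ema_predictor() above, assuming model.EMA(beta) follows the BYOL
            # convention ma = beta * ma + (1 - beta) * new:
            #     mu    = min(1, weight_scaler * encoder_distance)  # divergence-aware weight
            #     w_new = mu * w_local + (1 - mu) * w_global        # EMA with beta = 1 - mu
            # With hypothetical values weight_scaler = 0.5 and encoder_distance = 0.6,
            # mu = 0.3: the updated encoder keeps 30% of the local weights and takes
            # 70% from the global aggregate.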
            if self.conf.aggregate_encoder == ONLINE and self.conf.update_encoder == ONLINE:
                logger.info("Encoder: aggregate online, update online")
                online_encoder = self.compressed_model.online_encoder
                target_encoder = self._local_model.target_encoder
            elif self.conf.aggregate_encoder == TARGET and self.conf.update_encoder == ONLINE:
                logger.info("Encoder: aggregate target, update online")
                online_encoder = self.compressed_model.target_encoder
                target_encoder = self._local_model.target_encoder
            elif self.conf.aggregate_encoder == TARGET and self.conf.update_encoder == TARGET:
                logger.info("Encoder: aggregate target, update target")
                online_encoder = self._local_model.online_encoder
                target_encoder = self.compressed_model.target_encoder
            elif self.conf.aggregate_encoder == ONLINE and self.conf.update_encoder == TARGET:
                logger.info("Encoder: aggregate online, update target")
                online_encoder = self._local_model.online_encoder
                target_encoder = self.compressed_model.online_encoder
            elif self.conf.aggregate_encoder == ONLINE and self.conf.update_encoder == BOTH:
                logger.info("Encoder: aggregate online, update both")
                online_encoder = self.compressed_model.online_encoder
                target_encoder = self.compressed_model.online_encoder
            elif self.conf.aggregate_encoder == TARGET and self.conf.update_encoder == BOTH:
                logger.info("Encoder: aggregate target, update both")
                online_encoder = self.compressed_model.target_encoder
                target_encoder = self.compressed_model.target_encoder
            elif self.conf.update_encoder == NONE:
                logger.info("Encoder: use local online and target encoders")
                online_encoder = self._local_model.online_encoder
                target_encoder = self._local_model.target_encoder
            elif self.conf.update_encoder == EMA:
                logger.info(f"Encoder: use EMA, weight {self.conf.encoder_weight}")
                online_encoder = self._local_model.online_encoder
                ema_updater = model.EMA(self.conf.encoder_weight)
                model.update_moving_average(ema_updater, online_encoder, self.compressed_model.online_encoder)
                target_encoder = self._local_model.target_encoder
            elif self.conf.update_encoder == DYNAMIC_EMA_ONLINE:
                # Use FedEMA to update the online encoder
                online_encoder, target_encoder = ema_online()
            elif self.conf.update_encoder == SELECTIVE_EMA:
                # Use FedEMA to update the online encoder.
                # Under random selection, only update with EMA when the client
                # was selected in the previous round.
                if self.previous_trained_round + 1 == self.conf.round_id:
                    online_encoder, target_encoder = ema_online()
                else:
                    logger.info(f"Encoder: update online and target @ round {self.conf.round_id}")
                    online_encoder = self.compressed_model.online_encoder
                    target_encoder = self.compressed_model.online_encoder
            else:
                raise ValueError(f"Encoder: aggregate {self.conf.aggregate_encoder}, "
                                 f"update {self.conf.update_encoder} is not supported")
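            # Predictor selection below. For DAPU, _DAPU_predictor_usage() compares the
            # measured encoder divergence against conf.dapu_threshold: with a hypothetical
            # threshold of 0.4, a divergence of 0.25 would select the global predictor,
            # while 0.6 would keep the local one.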
            if self.conf.update_predictor == GLOBAL:
                logger.info("Predictor: use global predictor")
                predictor = self.compressed_model.online_predictor
            elif self.conf.update_predictor == LOCAL:
                logger.info("Predictor: use local predictor")
                predictor = self._local_model.online_predictor
            elif self.conf.update_predictor == DAPU:
                # Divergence-aware predictor update (DAPU)
                logger.info(f"Predictor: use DAPU, mu {self.conf.dapu_threshold}")
                if self.DAPU_predictor == GLOBAL:
                    predictor = self.compressed_model.online_predictor
                elif self.DAPU_predictor == LOCAL:
                    predictor = self._local_model.online_predictor
                else:
                    raise ValueError("Predictor: DAPU predictor can either use local or global predictor")
            elif self.conf.update_predictor == DYNAMIC_DAPU:
                # Use FedEMA to update the predictor
                predictor = ema_predictor()
            elif self.conf.update_predictor == SELECTIVE_EMA:
                # Under random selection, only update with EMA when the client
                # was selected in the previous round.
                if self.previous_trained_round + 1 == self.conf.round_id:
                    predictor = ema_predictor()
                else:
                    logger.info("Predictor: use global predictor")
                    predictor = self.compressed_model.online_predictor
            elif self.conf.update_predictor == EMA:
                logger.info(f"Predictor: use EMA, weight {self.conf.predictor_weight}")
                predictor = self._local_model.online_predictor
                ema_updater = model.EMA(self.conf.predictor_weight)
                model.update_moving_average(ema_updater, predictor, self.compressed_model.online_predictor)
            else:
                raise ValueError(f"Predictor: {self.conf.update_predictor} is not supported")

            self.model.online_encoder = copy.deepcopy(online_encoder)
            self.model.target_encoder = copy.deepcopy(target_encoder)
            self.model.online_predictor = copy.deepcopy(predictor)

    def train(self, conf, device=CPU):
        start_time = time.time()
        loss_fn, optimizer = self.pretrain_setup(conf, device)
        if conf.model in [model.MoCo, model.MoCoV2]:
            self.model.reset_key_encoder()
        self.train_loss = []
        self.model.to(device)
        # Snapshot the model without its last module so encoder divergence can be
        # measured against the post-training weights.
        old_model = copy.deepcopy(nn.Sequential(*list(self.model.children())[:-1])).cpu()
        for _ in range(conf.local_epoch):
            batch_loss = []
            for (batched_x1, batched_x2), _ in self.train_loader:
                x1, x2 = batched_x1.to(device), batched_x2.to(device)
                optimizer.zero_grad()

                if conf.model in [model.MoCo, model.MoCoV2]:
                    loss = self.model(x1, x2, device)
                elif conf.model == model.SimCLR:
                    images = torch.cat((x1, x2), dim=0)
                    features = self.model(images)
                    logits, labels = self.info_nce_loss(features)
                    loss = loss_fn(logits, labels)
                else:
                    loss = self.model(x1, x2)

                loss.backward()
                optimizer.step()
                batch_loss.append(loss.item())

                if conf.model in [model.BYOL, model.BYOLNoSG, model.BYOLNoPredictor] and conf.momentum_update:
                    self.model.update_moving_average()
            current_epoch_loss = sum(batch_loss) / len(batch_loss)
            self.train_loss.append(float(current_epoch_loss))
        self.train_time = time.time() - start_time

        # Store the trained model locally.
        self._local_model = copy.deepcopy(self.model).cpu()
        self.previous_trained_round = conf.round_id

        if conf.update_predictor in [DAPU, DYNAMIC_DAPU, SELECTIVE_EMA] \
                or conf.update_encoder in [DYNAMIC_EMA_ONLINE, SELECTIVE_EMA]:
            new_model = copy.deepcopy(nn.Sequential(*list(self.model.children())[:-1])).cpu()
            self.encoder_distance = self._calculate_divergence(old_model, new_model)
            self.encoder_distances.append(self.encoder_distance.item())
            self.DAPU_predictor = self._DAPU_predictor_usage(self.encoder_distance)
            if self.conf.auto_scaler == 'y' and self.conf.random_selection:
                self._calculate_weight_scaler()
            if (conf.round_id + 1) % 100 == 0:
                logger.info(f"Client {self.cid}, encoder distances: {self.encoder_distances}")
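    # A worked example of the FedEMA auto-scaler bootstrap in _calculate_weight_scaler()
    # below, with hypothetical values: when conf.auto_scaler == 'y', the scaler is fixed
    # once, on the first round a divergence is available, as
    #     weight_scaler = auto_scaler_target / first_round_divergence
    # e.g. auto_scaler_target = 0.7 and a first-round divergence of 1.4 give 0.5, so
    # later rounds map a divergence d to the EMA weight min(1, 0.5 * d).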
    def _DAPU_predictor_usage(self, distance):
        if distance < self.conf.dapu_threshold:
            return GLOBAL
        else:
            return LOCAL

    def _calculate_divergence(self, old_model, new_model, typ=L2):
        size = 0
        total_distance = 0
        old_dict = old_model.state_dict()
        new_dict = new_model.state_dict()
        for name, _ in old_model.named_parameters():
            if 'conv' in name and 'weight' in name:
                total_distance += self._calculate_distance(old_dict[name].detach().clone().view(1, -1),
                                                           new_dict[name].detach().clone().view(1, -1),
                                                           typ)
                size += 1
        if size != 0:
            distance = total_distance / size
            logger.info(f"Model distance: {distance} = {total_distance}/{size}")
            return distance
        else:
            # Return a tensor so callers can safely call .item() on the result.
            return torch.tensor(0.0)

    def _calculate_distance(self, m1, m2, typ=L2):
        if typ == L2:
            return torch.dist(m1, m2, 2)
        raise ValueError(f"Distance type {typ} is not supported")

    def _calculate_weight_scaler(self):
        if not self.weight_scaler:
            if self.conf.auto_scaler == 'y':
                self.weight_scaler = self.conf.auto_scaler_target / self.encoder_distance
            else:
                self.weight_scaler = self.conf.weight_scaler
            logger.info(f"Client {self.cid}: weight scaler {self.weight_scaler}")

    def load_loader(self, conf):
        drop_last = conf.drop_last
        train_loader = self.train_data.loader(conf.batch_size,
                                              self.cid,
                                              shuffle=True,
                                              drop_last=drop_last,
                                              seed=conf.seed,
                                              transform=self._load_transform(conf))
        _print_label_count(self.cid, self.train_data.data[self.cid]['y'])
        return train_loader

    def load_optimizer(self, conf):
        lr = conf.optimizer.lr
        if conf.optimizer.lr_type == "cosine":
            lr = compute_lr(conf.round_id, conf.rounds, 0, conf.optimizer.lr)

        # moco_v1 should use the default learning rate
        if conf.model == model.MoCo:
            lr = conf.optimizer.lr

        params = self.model.parameters()
        if conf.model in [model.BYOL, model.BYOLNoSG, model.BYOLNoPredictor]:
            params = [
                {'params': self.model.online_encoder.parameters()},
                {'params': self.model.online_predictor.parameters()},
            ]

        if conf.optimizer.type == "Adam":
            optimizer = torch.optim.Adam(params, lr=lr)
        else:
            optimizer = torch.optim.SGD(params,
                                        lr=lr,
                                        momentum=conf.optimizer.momentum,
                                        weight_decay=conf.optimizer.weight_decay)
        return optimizer

    def _load_transform(self, conf):
        transformation = utils.get_transformation(conf.model)
        return transformation(conf.image_size, conf.gaussian)

    def post_upload(self):
        if self.conf.model in [model.BYOL, model.BYOLNoSG, model.BYOLNoPredictor]:
            del self.model
            del self.compressed_model
            self.model = None
            self.compressed_model = None
            assert self.model is None
            assert self.compressed_model is None
            gc.collect()
            torch.cuda.empty_cache()

    def info_nce_loss(self, features, n_views=2, temperature=0.07):
        labels = torch.cat([torch.arange(self.conf.batch_size) for _ in range(n_views)], dim=0)
        labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
        labels = labels.to(self.device)

        features = F.normalize(features, dim=1)

        similarity_matrix = torch.matmul(features, features.T)

        # discard the main diagonal from both the labels and the similarity matrix
        mask = torch.eye(labels.shape[0], dtype=torch.bool).to(self.device)
        labels = labels[~mask].view(labels.shape[0], -1)
        similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)

        # select and combine multiple positives
        positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)

        # select only the negatives
        negatives = similarity_matrix[~labels.bool()].view(similarity_matrix.shape[0], -1)

        logits = torch.cat([positives, negatives], dim=1)
        labels = torch.zeros(logits.shape[0], dtype=torch.long).to(self.device)

        logits = logits / temperature
        return logits, labels
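    # Shape walkthrough for info_nce_loss() above, assuming batch_size B and n_views = 2
    # (features has shape (2B, d)): the similarity matrix is (2B, 2B); dropping the main
    # diagonal leaves (2B, 2B - 1) entries per row, where each row holds exactly one
    # positive (the other view of the same image) and 2B - 2 negatives. After the final
    # torch.cat, the positive sits in column 0 of logits, so the all-zero labels make
    # cross-entropy score each sample against its positive pair.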
def compute_lr(current_round, rounds=800, eta_min=0, eta_max=0.3):
    """Compute the learning rate for a round as cosine decay."""
    pi = np.pi
    eta_t = eta_min + 0.5 * (eta_max - eta_min) * (np.cos(pi * current_round / rounds) + 1)
    return eta_t


def _print_label_count(cid, labels):
    logger.info(f"client {cid}: {Counter(labels)}")
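
# A minimal, hypothetical sanity check for the cosine schedule above; not part of the
# training pipeline. With eta_min = 0 and eta_max = 0.3, the schedule starts at 0.3,
# halves at the midpoint, and decays to ~0 at the final round.
if __name__ == "__main__":
    assert abs(compute_lr(0, rounds=800, eta_min=0, eta_max=0.3) - 0.3) < 1e-6
    assert abs(compute_lr(400, rounds=800, eta_min=0, eta_max=0.3) - 0.15) < 1e-6
    assert abs(compute_lr(800, rounds=800, eta_min=0, eta_max=0.3) - 0.0) < 1e-6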