# config.yaml
---
  # The unique identifier for each federated learning task.
  task_id: ""
  # Provide dataset and federated learning simulation related configuration.
  data:
    # The root directory where datasets are stored.
    root: "./data/"
    # The name of the dataset, support: femnist, shakespeare, cifar10, and cifar100.
    dataset: femnist
    # The data distribution of each client, support: iid, niid (for femnist and
    # shakespeare), and dir and class (for cifar datasets).
    # `iid` means independent and identically distributed data.
    # `niid` means non-independent and identically distributed data for FEMNIST
    # and Shakespeare.
    # `dir` means using Dirichlet process to simulate non-iid data, for CIFAR-10
    # and CIFAR-100 datasets.
    # `class` means partitioning the dataset by label classes, for datasets like
    # CIFAR-10, CIFAR-100.
    split_type: "iid"
    # The minimal number of samples in each client. It is applicable for LEAF
    # datasets and dir simulation of CIFAR-10 and CIFAR-100.
    min_size: 10
    # The fraction of data sampled for LEAF datasets, e.g., 0.1 means that only
    # 10% of the total dataset size is used.
    data_amount: 0.05
    # The fraction of the number of clients used when the split_type is 'iid'.
    iid_fraction: 0.1
    # Whether to partition users of the dataset into train-test groups. Only
    # applicable to femnist and shakespeare datasets.
    # true means partitioning users of the dataset into train-test groups.
    # false means partitioning each user's samples into train-test groups.
    user: false
    # The fraction of data for training; the rest is for testing.
    train_test_split: 0.9
    # The number of classes in each client. Only applicable when the split_type
    # is 'class'.
    class_per_client: 1
    # The targeted number of clients to construct. For non-LEAF datasets, this is
    # the number of clients the data is split into; for LEAF datasets, it is only
    # used when the split type is `class`.
    num_of_clients: 100
    # The parameter for Dirichlet distribution simulation, applicable only when
    # split_type is `dir` for CIFAR datasets.
    alpha: 0.5
    # The targeted distribution of quantities to simulate data quantity
    # heterogeneity.
    # The values should sum up to 1, e.g., [0.1, 0.2, 0.7].
    # The `num_of_clients` should be divisible by `len(weights)`.
    # null (None) means clients are simulated with the same data quantity.
    weights: null
  # The name of the model for training, support: lenet, rnn, resnet, resnet18,
  # resnet50, vgg9.
  model: lenet
  # How to conduct testing, options: test_in_client or test_in_server.
  # `test_in_client` means that each client has a test set to run testing.
  # `test_in_server` means that the server has a test set to run testing for the
  # global model. Use this mode for cifar datasets.
  test_mode: "test_in_client"
  # The way to measure testing performance (accuracy) when test mode is
  # `test_in_client`, support: average or weighted (means weighted average).
  test_method: "average"
  server:
    track: false  # Whether to track server metrics using the tracking service.
    rounds: 10  # Total number of training rounds.
    clients_per_round: 5  # The number of clients to train in each round.
    test_every: 1  # The frequency of testing: conduct testing every N rounds.
    save_model_every: 10  # The frequency of saving the model: save it every N rounds.
    # The path to save the model. Default path is the root directory of the library.
    save_model_path: ""
    batch_size: 32  # The batch size of test_in_server.
    test_all: false  # Whether to test all clients or only selected clients.
    random_selection: true  # Whether to select clients to train randomly.
    # The strategy to aggregate client uploaded models, options: FedAvg, equal.
    # FedAvg aggregates models using weighted average, where the weights are the
    # data sizes of clients.
    # equal aggregates models by simple averaging.
    # NOTE(review): the key is spelled `aggregation_stragtegy` (sic). It is kept
    # unchanged because whatever reads this file presumably looks up this exact
    # spelling - confirm against the consumer before renaming.
    aggregation_stragtegy: "FedAvg"
    # The content of aggregation, options: all, parameters.
    # all means aggregating models using state_dict, including both model
    # parameters and persistent buffers like BatchNorm stats.
    # parameters means aggregating only model parameters.
    aggregation_content: "all"
  client:
    # Whether to track metrics using the tracking service.
    # NOTE(review): the original comment said "server metrics"; presumably this
    # flag controls client metrics - confirm.
    track: false
    batch_size: 32  # The batch size of training in client.
    test_batch_size: 5  # The batch size of testing in client.
    local_epoch: 10  # The number of epochs to train in each round.
    optimizer:
      type: "Adam"  # The name of the optimizer, options: Adam, SGD.
      lr: 0.001
      momentum: 0.9
      weight_decay: 0
    # NOTE(review): `seed` and `local_test` are assumed to be client-level keys
    # (siblings of `optimizer`), not optimizer settings - confirm.
    seed: 0
    # Whether to test the trained models in clients before uploading them to the
    # server.
    local_test: false
  gpu: 0  # The total number of GPUs used in training. 0 means CPU.
  # The distributed training configurations. It is only applicable when gpu > 1.
  distributed:
    backend: "nccl"  # The distributed backend.
    init_method: ""
    world_size: 0
    rank: 0
    local_rank: 0
  # The configurations for logging and tracking.
  tracking:
    # The path of the local database, sqlite3.
    # NOTE(review): the original comment said "local dataset"; "database" is
    # assumed from the key name - confirm.
    database: ""
    log_file: ""
    log_level: "INFO"  # The level of logging.
    metric_file: ""
    save_every: 1
  # The configuration for system heterogeneity simulation.
  resource_heterogeneous:
    simulate: false  # Whether to simulate system heterogeneity in federated learning.
    # The type of heterogeneity to simulate, support: iso, dir, real.
    # NOTE(review): the original comment breaks off at "iso means that"; the
    # meaning of iso/dir/real still needs to be documented here.
    hetero_type: "real"
    # The level of heterogeneity (0-5); 0 means no heterogeneity among clients.
    level: 3
    # The number of groups with different sleep time. 1 means all clients are the
    # same.
    sleep_group_num: 1000
    total_time: 1000  # The total sleep time of all clients, unit: second.
    fraction: 1  # The fraction of clients attending heterogeneous simulation.
    # The grouping strategy to handle system heterogeneity, support: random,
    # greedy, slowest.
    grouping_strategy: "greedy"
    # The estimated default training time for each training round, unit: second.
    initial_default_time: 5
    default_time_momentum: 0.2  # The default momentum for default time update.
  seed: 0  # The random seed.