import numpy as np
import lightgbm as lgb

from ..component_converter import ComponentConverterBase
from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta
from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam, \
    DecisionTreeModelParam, NodeParam
from federatedml.util import consts
from federatedml.util import LOGGER

"""
We only keep the variables necessary for lightgbm to run its predict function
on the converted model.
"""

FAKE_FEATURE_INFO_STR = '[0:1] '
END_OF_TREE = 'end of trees'
END_OF_PARA = 'end of parameters'
SPLIT = '\n\n'

HEADER_TEMPLATE = """tree
version=v3
num_class={}
num_tree_per_iteration={}
label_index={}
max_feature_idx={}
objective={}
feature_names={}
feature_infos={}
"""

TREE_TEMPLATE = """Tree={}
num_leaves={}
num_cat={}
split_feature={}
threshold={}
decision_type={}
left_child={}
right_child={}
leaf_value={}
internal_value={}
shrinkage={}
"""

PARA_TEMPLATE = """parameters:
[boosting: gbdt]
[objective: {}]
[num_iterations: {}]
[learning_rate: {}]
[max_depth: {}]
[max_bin: {}]
[use_missing: {}]
[zero_as_missing: {}]
[num_class: {}]
[lambda_l1: {}]
[lambda_l2: {}]
[min_data_in_leaf: {}]
[min_gain_to_split: {}]
"""

LGB_OBJECTIVE = {
    consts.BINARY: "binary sigmoid:1",
    consts.REGRESSION: "regression",
    consts.MULTY: 'multiclass num_class:{}'
}

PARA_OBJECTIVE = {
    consts.BINARY: "binary",
    consts.REGRESSION: "regression",
    consts.MULTY: 'multiclass'
}

def get_decision_type(node: NodeParam, use_missing, zero_as_missing):
    # decision_type is a 4-bit flag:
    #   bits 3-2: which value counts as missing (np.nan, 0, or none)
    #   bit 1:    default direction (left or right?)
    #   bit 0:    categorical feature or not?
    default_type = 0  # 0000: no missing value, default right, not a cat feature
    if not use_missing:
        return default_type

    if node.missing_dir == -1:
        default_type = default_type | 2  # 0010: default left

    if zero_as_missing:
        default_type = default_type | 4  # 0100: 0 is the missing value
    else:
        default_type = default_type | 8  # 1000: np.nan is the missing value

    return default_type

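# Illustrative check of the flag layout (a sketch, not used by the converter):
# with use_missing=True and zero_as_missing=False, a node whose missing_dir
# is -1 gets 2 | 8 = 10, i.e. "default left, np.nan is the missing value".
def _example_get_decision_type():
    node = NodeParam(missing_dir=-1)
    assert get_decision_type(node, use_missing=True, zero_as_missing=False) == 10
    assert get_decision_type(node, use_missing=False, zero_as_missing=False) == 0
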
def get_lgb_objective(task_type, num_classes, ret_dict, need_multi_format=True):
    if task_type == consts.CLASSIFICATION:
        if num_classes == 1:
            objective = ret_dict[consts.BINARY]
        else:
            objective = ret_dict[consts.MULTY].format(num_classes) if need_multi_format else ret_dict[consts.MULTY]
    else:
        objective = ret_dict[consts.REGRESSION]
    return objective

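# Quick illustration (a sketch, not used by the converter): a 3-class task maps
# to the multiclass objective with the class count filled in, while regression
# ignores num_classes.
def _example_get_lgb_objective():
    assert get_lgb_objective(consts.CLASSIFICATION, 3, LGB_OBJECTIVE) == 'multiclass num_class:3'
    assert get_lgb_objective(consts.REGRESSION, 1, LGB_OBJECTIVE) == 'regression'
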
def list_to_str(l_):
    return str(l_).replace('[', '').replace(']', '').replace(',', '')

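# Example: list_to_str([1, 2, 3]) == '1 2 3' and list_to_str([]) == ''.
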
def parse_header(param: BoostingTreeModelParam, meta: BoostingTreeModelMeta):
    # generate the header of the lgb str model file;
    # for binary and regression tasks, num_class is 1 in lgb
    num_classes = len(param.classes_) if len(param.classes_) > 2 else 1
    objective = get_lgb_objective(meta.task_type, num_classes, LGB_OBJECTIVE, need_multi_format=True)
    num_tree_per_iteration = param.tree_dim
    label_index = 0  # by default
    max_feature_idx = len(param.feature_name_fid_mapping) - 1

    feature_names = ''
    for name in [param.feature_name_fid_mapping[i] for i in range(max_feature_idx + 1)]:
        if ' ' in name:  # spaces are not allowed in lgb feature names
            name = name.replace(' ', '-')
        feature_names += name + ' '
    feature_names = feature_names[:-1]

    feature_info = FAKE_FEATURE_INFO_STR * (max_feature_idx + 1)  # fake feature infos are enough for prediction
    feature_info = feature_info[:-1]

    result_str = HEADER_TEMPLATE.format(num_classes, num_tree_per_iteration, label_index, max_feature_idx,
                                        objective, feature_names, feature_info)
    return result_str

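# For illustration, a binary model with two features "x0" and "x1" yields a
# header like the following (a sketch; exact values depend on the model):
#
#   tree
#   version=v3
#   num_class=1
#   num_tree_per_iteration=1
#   label_index=0
#   max_feature_idx=1
#   objective=binary sigmoid:1
#   feature_names=x0 x1
#   feature_infos=[0:1] [0:1]
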
def internal_count_computer(cur_id, tree_node, leaf_count, internal_count):
    # post-order traversal: the sample count of an internal node is
    # the sum of the counts of its two children
    if cur_id in leaf_count:
        return leaf_count[cur_id]

    left_count = internal_count_computer(tree_node[cur_id].left_nodeid, tree_node, leaf_count, internal_count)
    right_count = internal_count_computer(tree_node[cur_id].right_nodeid, tree_node, leaf_count, internal_count)
    internal_count[cur_id] = left_count + right_count
    return internal_count[cur_id]

def compute_internal_count(tree_param: DecisionTreeModelParam):
    # recover the count of every internal node from the leaf counts
    root = tree_param.tree_[0]
    internal_count = {}
    leaf_count = tree_param.leaf_count
    root_count = internal_count_computer(root.id, tree_param.tree_, leaf_count, internal_count)
    if root.id not in internal_count:  # the tree is a single leaf
        internal_count[root.id] = root_count
    return internal_count

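# Illustrative propagation on a depth-1 tree (a sketch with made-up ids and
# counts, not used by the converter): the root count is recovered as the sum
# of its two leaf counts.
def _example_compute_internal_count():
    tree = DecisionTreeModelParam()
    tree.tree_.add(id=0, left_nodeid=1, right_nodeid=2)
    tree.tree_.add(id=1, is_leaf=True)
    tree.tree_.add(id=2, is_leaf=True)
    tree.leaf_count[1] = 30
    tree.leaf_count[2] = 70
    assert compute_internal_count(tree) == {0: 100}
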
def update_leaf_count(param):
    # in homo sbt, a leaf sometimes covers no sample, so add 1 to every
    # zero leaf count; iterate over a copy because the loop mutates the map
    tmp = {}
    for i in param.leaf_count:
        tmp[i] = param.leaf_count[i]
    for i in tmp:
        if tmp[i] == 0:
            param.leaf_count[i] += 1

def parse_a_tree(
        param: DecisionTreeModelParam,
        tree_idx: int,
        use_missing=False,
        zero_as_missing=False,
        learning_rate=0.1,
        init_score=None):

    split_feature = []
    split_threshold = []
    decision_type = []
    internal_weight = []
    leaf_weight = []
    left, right = [], []
    leaf_idx = -1
    lgb_node_idx = 0
    sbt_lgb_node_map = {}
    is_leaf = []
    leaf_count = []
    internal_count, internal_count_dict = [], {}
    has_count_info = len(param.leaf_count) != 0

    # compute internal counts from the leaf counts
    if has_count_info:
        update_leaf_count(param)
        internal_count_dict = compute_internal_count(param)

    # mark leaf nodes and build the sbt-to-lgb node mapping
    for node in param.tree_:
        is_leaf.append(node.is_leaf)
        if not node.is_leaf:
            sbt_lgb_node_map[node.id] = lgb_node_idx
            lgb_node_idx += 1
    for cur_idx, node in enumerate(param.tree_):
        if not node.is_leaf:
            split_feature.append(node.fid)

            # a hetero model needs to decode its split points and missing dirs
            if param.split_maskdict and param.missing_dir_maskdict:
                node.bid = param.split_maskdict[node.id]
                node.missing_dir = param.missing_dir_maskdict[node.id]

            # extract the split point and weight
            split_threshold.append(node.bid)
            internal_weight.append(node.weight)

            # add the internal count
            if has_count_info:
                internal_count.append(internal_count_dict[node.id])

            if is_leaf[node.left_nodeid]:  # generate the lgb leaf idx
                left.append(leaf_idx)
                if has_count_info:
                    leaf_count.append(param.leaf_count[node.left_nodeid])
                leaf_idx -= 1
            else:
                left.append(sbt_lgb_node_map[node.left_nodeid])

            if is_leaf[node.right_nodeid]:  # generate the lgb leaf idx
                right.append(leaf_idx)
                if has_count_info:
                    leaf_count.append(param.leaf_count[node.right_nodeid])
                leaf_idx -= 1
            else:
                right.append(sbt_lgb_node_map[node.right_nodeid])

            # get the lgb decision type
            decision_type.append(get_decision_type(node, use_missing, zero_as_missing))
        else:
            # the leaf value is node.weight * learning_rate in lgb;
            # a regression model also needs to add the init score
            if init_score is not None:
                score = node.weight * learning_rate + init_score
            else:
                score = node.weight * learning_rate
            leaf_weight.append(score)

    leaves_num = len(leaf_weight)
    num_cat = 0

    # render the tree as an lgb str block
    result_str = TREE_TEMPLATE.format(tree_idx, leaves_num, num_cat, list_to_str(split_feature),
                                      list_to_str(split_threshold), list_to_str(decision_type),
                                      list_to_str(left), list_to_str(right), list_to_str(leaf_weight),
                                      list_to_str(internal_weight), learning_rate)
    if len(internal_count) != 0:
        result_str += 'internal_count={}\n'.format(list_to_str(internal_count))
    if len(leaf_count) != 0:
        result_str += 'leaf_count={}\n'.format(list_to_str(leaf_count))

    return result_str

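# For illustration, a one-split tree (feature 0, threshold 0.5, leaf weights
# 1 and -1, learning rate 0.1) renders roughly as (a sketch):
#
#   Tree=0
#   num_leaves=2
#   num_cat=0
#   split_feature=0
#   threshold=0.5
#   decision_type=0
#   left_child=-1
#   right_child=-2
#   leaf_value=0.1 -0.1
#   internal_value=0.0
#   shrinkage=0.1
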
def parse_feature_importance(param):
    feat_importance_str = "feature_importances:\n"
    mapping = param.feature_name_fid_mapping

    for impt in param.feature_importances:
        impt_val = impt.importance
        try:
            if impt.main == 'split':
                impt_val = int(impt_val)
        except AttributeError:
            LOGGER.warning("old version protobuf contains no field 'main'")
        feat_importance_str += '{}={}\n'.format(mapping[impt.fid], impt_val)

    return feat_importance_str

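# Shape of the rendered block, for illustration (names and values made up):
#
#   feature_importances:
#   x0=10
#   x1=3
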
def parse_parameter(param, meta):
    """
    We only keep the parameters offered by SBT.
    """
    tree_meta = meta.tree_meta
    num_classes = 1 if meta.task_type == consts.CLASSIFICATION and param.num_classes < 3 else param.num_classes
    objective = get_lgb_objective(meta.task_type, num_classes, PARA_OBJECTIVE, need_multi_format=False)
    rs = PARA_TEMPLATE.format(objective, meta.num_trees, meta.learning_rate, tree_meta.max_depth,
                              meta.quantile_meta.bin_num,
                              tree_meta.use_missing + 0,  # "+ 0" casts protobuf bools to 0/1
                              tree_meta.zero_as_missing + 0,
                              num_classes, tree_meta.criterion_meta.criterion_param[0],
                              tree_meta.criterion_meta.criterion_param[1],
                              tree_meta.min_leaf_node,
                              tree_meta.min_impurity_split)
    return rs

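# Filled-in sketch for a binary model (all values illustrative):
#
#   parameters:
#   [boosting: gbdt]
#   [objective: binary]
#   [num_iterations: 10]
#   [learning_rate: 0.1]
#   [max_depth: 3]
#   [max_bin: 32]
#   [use_missing: 0]
#   [zero_as_missing: 0]
#   [num_class: 1]
#   [lambda_l1: 0]
#   [lambda_l2: 0.1]
#   [min_data_in_leaf: 20]
#   [min_gain_to_split: 0]
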
def sbt_to_lgb(model_param: BoostingTreeModelParam,
               model_meta: BoostingTreeModelMeta,
               load_feature_importance=True):
    """
    Transform an SBT model into an lgb model string.
    """
    result = ''

    # parse the header
    header_str = parse_header(model_param, model_meta)
    use_missing = model_meta.tree_meta.use_missing
    zero_as_missing = model_meta.tree_meta.zero_as_missing
    learning_rate = model_meta.learning_rate
    tree_str_list = []

    # parse the trees
    for idx, param in enumerate(model_param.trees_):
        if idx == 0 and model_meta.task_type == consts.REGRESSION:  # a regression task has an init score
            init_score = model_param.init_score[0]
        else:
            init_score = 0
        tree_str_list.append(parse_a_tree(param, idx, use_missing, zero_as_missing, learning_rate, init_score))

    # add the header and tree strs to the result
    result += header_str + '\n'
    for s in tree_str_list:
        result += s
        result += SPLIT
    result += END_OF_TREE

    # handle feature importance
    if load_feature_importance:
        feat_importance_str = parse_feature_importance(model_param)
        result += SPLIT + feat_importance_str

    # parameters
    para_str = parse_parameter(model_param, model_meta)
    result += '\n' + para_str + '\n' + END_OF_PARA + '\n'
    result += '\npandas_categorical:[]\n'

    return result

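# End-to-end sketch (assumes `param_obj`/`meta_obj` are the deserialized
# HomoSecureBoost protobuf objects and `X` is a 2-D feature array in the
# training feature order; names are illustrative):
#
#   model_str = sbt_to_lgb(param_obj, meta_obj)
#   booster = lgb.Booster(model_str=model_str)
#   scores = booster.predict(X)
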
def save_lgb(model: lgb.Booster, path):
    model_str = model.model_to_string()
    with open(path, 'w') as f:
        f.write(model_str)


def load_lgb(path):
    with open(path, 'r') as f:
        model_str = f.read()
    lgb_model = lgb.Booster(model_str=model_str)
    return lgb_model

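# Round-trip sketch (hypothetical path): predictions from the restored booster
# should match the original up to the precision of the string serialization.
#
#   save_lgb(booster, '/tmp/homo_sbt_model.txt')
#   restored = load_lgb('/tmp/homo_sbt_model.txt')
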
class HomoSBTComponentConverter(ComponentConverterBase):

    @staticmethod
    def get_target_modules():
        return ['HomoSecureboost']

    def convert(self, model_dict):
        param_obj = model_dict["HomoSecureBoostingTreeGuestParam"]
        meta_obj = model_dict["HomoSecureBoostingTreeGuestMeta"]

        lgb_model_str = sbt_to_lgb(param_obj, meta_obj)
        lgb_model = lgb.Booster(model_str=lgb_model_str)
        return lgb_model
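
# Usage sketch (obtaining model_dict from a trained FATE job is outside the
# scope of this module; the keys follow the guest naming used in convert):
#
#   converter = HomoSBTComponentConverter()
#   booster = converter.convert(model_dict)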