gbdt.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. import numpy as np
  2. import lightgbm as lgb
  3. from ..component_converter import ComponentConverterBase
  4. from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta
  5. from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam, \
  6. DecisionTreeModelParam, NodeParam
  7. from federatedml.util import consts
  8. from federatedml.util import LOGGER
"""
We only keep the necessary variable to make sure that lightgbm can run predict function on the converted model
"""

# Fake per-feature "feature_infos" entry: SBT does not keep per-feature value
# ranges, so every feature gets the placeholder range [0:1]. The trailing
# space acts as the separator when the string is repeated.
FAKE_FEATURE_INFO_STR = '[0:1] '

# Sentinel lines expected by the LightGBM text-model parser.
END_OF_TREE = 'end of trees'
END_OF_PARA = 'end of parameters'

# Blank-line separator between sections of the lgb model text file.
SPLIT = '\n\n'

# Header section of a LightGBM v3 text model; filled by parse_header().
HEADER_TEMPLATE = """tree
version=v3
num_class={}
num_tree_per_iteration={}
label_index={}
max_feature_idx={}
objective={}
feature_names={}
feature_infos={}
"""

# One "Tree=N" section per boosting tree; filled by parse_a_tree().
TREE_TEMPLATE = """Tree={}
num_leaves={}
num_cat={}
split_feature={}
threshold={}
decision_type={}
left_child={}
right_child={}
leaf_value={}
internal_value={}
shrinkage={}
"""

# Trailing "parameters:" section; filled by parse_parameter().
PARA_TEMPLATE = """parameters:
[boosting: gbdt]
[objective: {}]
[num_iterations: {}]
[learning_rate: {}]
[max_depth: {}]
[max_bin: {}]
[use_missing: {}]
[zero_as_missing: {}]
[num_class: {}]
[lambda_l1: {}]
[lambda_l2: {}]
[min_data_in_leaf: {}]
[min_gain_to_split: {}]
"""

# Task type -> objective string used in the model header (lgb format).
LGB_OBJECTIVE = {
    consts.BINARY: "binary sigmoid:1",
    consts.REGRESSION: "regression",
    consts.MULTY: 'multiclass num_class:{}'
}

# Task type -> objective string used in the parameters section.
PARA_OBJECTIVE = {
    consts.BINARY: "binary",
    consts.REGRESSION: "regression",
    consts.MULTY: 'multiclass'
}
  63. def get_decision_type(node: NodeParam, use_missing, zero_as_missing):
  64. # 00 0 0
  65. # Nan,0 or None default left or right? cat feature or not?
  66. default_type = 0 # 0000 None, default right, not cat feat
  67. if not use_missing:
  68. return default_type
  69. if node.missing_dir == -1:
  70. default_type = default_type | 2 # 0010
  71. if zero_as_missing:
  72. default_type = default_type | 4 # 0100 0
  73. else:
  74. default_type = default_type | 8 # 1000 np.Nan
  75. return default_type
  76. def get_lgb_objective(task_type, num_classes, ret_dict, need_multi_format=True):
  77. if task_type == consts.CLASSIFICATION:
  78. if num_classes == 1:
  79. objective = ret_dict[consts.BINARY]
  80. else:
  81. objective = ret_dict[consts.MULTY].format(num_classes) if need_multi_format else ret_dict[consts.MULTY]
  82. else:
  83. objective = ret_dict[consts.REGRESSION]
  84. return objective
  85. def list_to_str(l_):
  86. return str(l_).replace('[', '').replace(']', '').replace(',', '')
  87. def parse_header(param: BoostingTreeModelParam, meta: BoostingTreeModelMeta):
  88. # generated header of lgb str model file
  89. # binary/regression num class is 1 in lgb
  90. num_classes = len(param.classes_) if len(param.classes_) > 2 else 1
  91. objective = get_lgb_objective(meta.task_type, num_classes, LGB_OBJECTIVE, need_multi_format=True)
  92. num_tree_per_iteration = param.tree_dim
  93. label_index = 0 # by default
  94. max_feature_idx = len(param.feature_name_fid_mapping) - 1
  95. feature_names = ''
  96. for name in [param.feature_name_fid_mapping[i] for i in range(max_feature_idx + 1)]:
  97. if ' ' in name: # space is not allowed
  98. name = name.replace(' ', '-')
  99. feature_names += name + ' '
  100. feature_names = feature_names[:-1]
  101. feature_info = FAKE_FEATURE_INFO_STR * (max_feature_idx + 1) # need to make fake feature info
  102. feature_info = feature_info[:-1]
  103. result_str = HEADER_TEMPLATE.format(num_classes, num_tree_per_iteration, label_index, max_feature_idx,
  104. objective, feature_names, feature_info)
  105. return result_str
  106. def internal_count_computer(cur_id, tree_node, leaf_count, internal_count):
  107. if cur_id in leaf_count:
  108. return leaf_count[cur_id]
  109. left_count = internal_count_computer(tree_node[cur_id].left_nodeid, tree_node, leaf_count, internal_count)
  110. right_count = internal_count_computer(tree_node[cur_id].right_nodeid, tree_node, leaf_count, internal_count)
  111. internal_count[cur_id] = left_count + right_count
  112. return internal_count[cur_id]
  113. def compute_internal_count(tree_param: DecisionTreeModelParam):
  114. root = tree_param.tree_[0]
  115. internal_count = {}
  116. leaf_count = tree_param.leaf_count
  117. root_count = internal_count_computer(root.id, tree_param.tree_, leaf_count, internal_count)
  118. if root.id not in internal_count:
  119. internal_count[root_count] = root_count
  120. return internal_count
  121. def update_leaf_count(param):
  122. # in homo sbt, sometimes a leaf covers no sample, so need to add 1 to leaf count
  123. tmp = {}
  124. for i in param.leaf_count:
  125. tmp[i] = param.leaf_count[i]
  126. for i in tmp:
  127. if tmp[i] == 0:
  128. param.leaf_count[i] += 1
def parse_a_tree(
        param: DecisionTreeModelParam,
        tree_idx: int,
        use_missing=False,
        zero_as_missing=False,
        learning_rate=0.1,
        init_score=None):
    """Convert one SBT decision tree into a LightGBM "Tree=N" text section.

    Internal nodes keep their lgb index order; leaves are encoded as
    negative indices (-1, -2, ...) in the child arrays, as lgb expects.
    Leaf values are node.weight * learning_rate (plus init_score when
    given, used for the first tree of regression models).

    NOTE(review): for hetero models this mutates node.bid/node.missing_dir
    on the protobuf nodes in place while decoding the mask dicts.
    """
    split_feature = []
    split_threshold = []
    decision_type = []
    internal_weight = []
    leaf_weight = []
    left, right = [], []
    leaf_idx = -1  # lgb leaf ids count down from -1
    lgb_node_idx = 0  # lgb internal-node ids count up from 0
    sbt_lgb_node_map = {}  # sbt node id -> lgb internal-node index
    is_leaf = []
    leaf_count = []
    internal_count, internal_count_dict = [], {}
    # leaf_count is only populated for homo models; counts are optional
    has_count_info = len(param.leaf_count) != 0
    # compute internal count
    if has_count_info:
        update_leaf_count(param)
        internal_count_dict = compute_internal_count(param)  # get internal count from leaf count
    # mark leaf nodes and get sbt-lgb node mapping
    for node in param.tree_:
        is_leaf.append(node.is_leaf)
        if not node.is_leaf:
            sbt_lgb_node_map[node.id] = lgb_node_idx
            lgb_node_idx += 1
    for cur_idx, node in enumerate(param.tree_):
        if not node.is_leaf:
            split_feature.append(node.fid)
            # if is hetero model need to decode split point and missing dir
            if param.split_maskdict and param.missing_dir_maskdict is not None:
                node.bid = param.split_maskdict[node.id]
                node.missing_dir = param.missing_dir_maskdict[node.id]
            # extract split point and weight
            split_threshold.append(node.bid)
            internal_weight.append(node.weight)
            # add internal count
            if has_count_info:
                internal_count.append(internal_count_dict[node.id])
            if is_leaf[node.left_nodeid]:  # generate lgb leaf idx
                left.append(leaf_idx)
                if has_count_info:
                    leaf_count.append(param.leaf_count[node.left_nodeid])
                leaf_idx -= 1
            else:
                left.append(sbt_lgb_node_map[node.left_nodeid])
            if is_leaf[node.right_nodeid]:  # generate lgb leaf idx
                right.append(leaf_idx)
                if has_count_info:
                    leaf_count.append(param.leaf_count[node.right_nodeid])
                leaf_idx -= 1
            else:
                right.append(sbt_lgb_node_map[node.right_nodeid])
            # get lgb decision type
            decision_type.append(get_decision_type(node, use_missing, zero_as_missing))
        else:
            # regression model need to add init score
            if init_score is not None:
                score = node.weight * learning_rate + init_score
            else:
                # leaf value is node.weight * learning_rate in lgb
                score = node.weight * learning_rate
            leaf_weight.append(score)
    leaves_num = len(leaf_weight)
    num_cat = 0  # SBT has no categorical splits
    # to string
    result_str = TREE_TEMPLATE.format(tree_idx, leaves_num, num_cat, list_to_str(split_feature),
                                      list_to_str(split_threshold), list_to_str(decision_type),
                                      list_to_str(left), list_to_str(right), list_to_str(leaf_weight),
                                      list_to_str(internal_weight), learning_rate)
    # count lines are optional in the lgb format; emit only when available
    if len(internal_count) != 0:
        result_str += 'internal_count={}\n'.format(list_to_str(internal_count))
    if len(leaf_count) != 0:
        result_str += 'leaf_count={}\n'.format(list_to_str(leaf_count))
    return result_str
  208. def parse_feature_importance(param):
  209. feat_importance_str = "feature_importances:\n"
  210. mapping = param.feature_name_fid_mapping
  211. for impt in param.feature_importances:
  212. impt_val = impt.importance
  213. try:
  214. if impt.main == 'split':
  215. impt_val = int(impt_val)
  216. except BaseException:
  217. LOGGER.warning("old version protobuf contains no filed 'main'")
  218. feat_importance_str += '{}={}\n'.format(mapping[impt.fid], impt_val)
  219. return feat_importance_str
  220. def parse_parameter(param, meta):
  221. """
  222. we only keep parameters offered by SBT
  223. """
  224. tree_meta = meta.tree_meta
  225. num_classes = 1 if meta.task_type == consts.CLASSIFICATION and param.num_classes < 3 else param.num_classes
  226. objective = get_lgb_objective(meta.task_type, num_classes, PARA_OBJECTIVE, need_multi_format=False)
  227. rs = PARA_TEMPLATE.format(objective, meta.num_trees, meta.learning_rate, tree_meta.max_depth,
  228. meta.quantile_meta.bin_num, meta.tree_meta.use_missing + 0,
  229. meta.tree_meta.zero_as_missing + 0,
  230. num_classes, tree_meta.criterion_meta.criterion_param[0],
  231. tree_meta.criterion_meta.criterion_param[1],
  232. tree_meta.min_leaf_node,
  233. tree_meta.min_impurity_split
  234. )
  235. return rs
  236. def sbt_to_lgb(model_param: BoostingTreeModelParam,
  237. model_meta: BoostingTreeModelMeta,
  238. load_feature_importance=True):
  239. """
  240. Transform sbt model to lgb model
  241. """
  242. result = ''
  243. # parse header
  244. header_str = parse_header(model_param, model_meta)
  245. use_missing = model_meta.tree_meta.use_missing
  246. zero_as_missing = model_meta.tree_meta.zero_as_missing
  247. learning_rate = model_meta.learning_rate
  248. tree_str_list = []
  249. # parse tree
  250. for idx, param in enumerate(model_param.trees_):
  251. if idx == 0 and model_meta.task_type == consts.REGRESSION: # regression task has init score
  252. init_score = model_param.init_score[0]
  253. else:
  254. init_score = 0
  255. tree_str_list.append(parse_a_tree(param, idx, use_missing, zero_as_missing, learning_rate, init_score))
  256. # add header and tree str to result
  257. result += header_str + '\n'
  258. for s in tree_str_list:
  259. result += s
  260. result += SPLIT
  261. result += END_OF_TREE
  262. # handle feature importance
  263. if load_feature_importance:
  264. feat_importance_str = parse_feature_importance(model_param)
  265. result += SPLIT + feat_importance_str
  266. # parameters
  267. para_str = parse_parameter(model_param, model_meta)
  268. result += '\n' + para_str + '\n' + END_OF_PARA + '\n'
  269. result += '\npandas_categorical:[]\n'
  270. return result
  271. def save_lgb(model: lgb.Booster, path):
  272. model_str = model.model_to_string()
  273. f = open(path, 'w')
  274. f.write(model_str)
  275. f.close()
  276. def load_lgb(path):
  277. f = open(path, 'r')
  278. model_str = f.read()
  279. f.close()
  280. lgb_model = lgb.Booster(model_str=model_str)
  281. return lgb_model
class HomoSBTComponentConverter(ComponentConverterBase):
    """Converts a trained HomoSecureboost model into a native lgb.Booster."""

    @staticmethod
    def get_target_modules():
        # component names this converter is registered for
        return ['HomoSecureboost']

    def convert(self, model_dict):
        """Build an lgb.Booster from the guest param/meta protobufs in *model_dict*."""
        param_obj = model_dict["HomoSecureBoostingTreeGuestParam"]
        meta_obj = model_dict["HomoSecureBoostingTreeGuestMeta"]
        # serialize to lgb's text-model format, then let lightgbm parse it back
        lgb_model_str = sbt_to_lgb(param_obj, meta_obj)
        lgb_model = lgb.Booster(model_str=lgb_model_str)
        return lgb_model