12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- import json
- import numpy as np
- import lightgbm as lgb
- from sklearn.pipeline import Pipeline
- from lightgbm.sklearn import _LGBMLabelEncoder
- from federatedml.protobuf.homo_model_convert.lightgbm.gbdt import sbt_to_lgb
- from federatedml.protobuf.generated.boosting_tree_model_param_pb2 import BoostingTreeModelParam
- from federatedml.protobuf.generated.boosting_tree_model_meta_pb2 import BoostingTreeModelMeta
- from google.protobuf import json_format
- from federatedml.util.anonymous_generator_util import Anonymous
- def _merge_sbt(guest_param, host_param, host_sitename, rename_host=True):
- # update feature name fid mapping
- guest_fid_map = guest_param['featureNameFidMapping']
- guest_fid_map = {int(k): v for k, v in guest_fid_map.items()}
- host_fid_map = sorted([(int(k), v) for k, v in host_param['featureNameFidMapping'].items()], key=lambda x: x[0])
- guest_feat_len = len(guest_fid_map)
- start = guest_feat_len
- host_new_fid = {}
- for k, v in host_fid_map:
- guest_fid_map[start] = v if not rename_host else v + '_' + host_sitename
- host_new_fid[k] = start
- start += 1
- guest_param['featureNameFidMapping'] = guest_fid_map
- # merging trees
- for tree_guest, tree_host in zip(guest_param['trees'], host_param['trees']):
- tree_guest['splitMaskdict'].update(tree_host['splitMaskdict'])
- tree_guest['missingDirMaskdict'].update(tree_host['missingDirMaskdict'])
- for node_g, node_h in zip(tree_guest['tree'], tree_host['tree']):
- if str(node_h['id']) in tree_host['splitMaskdict']:
- node_g['fid'] = int(host_new_fid[int(node_h['fid'])])
- node_g['sitename'] = host_sitename
- node_g['bid'] = 0
- return guest_param
def extract_host_name(host_param, idx):
    """Return a site name for a host, derived from its anonymous column names.

    The first key of ``host_param['anonymousNameMapping']`` is handed to the
    ``Anonymous`` helper to obtain a role and a party id, and the result is
    ``role + '_' + party_id``.

    Whenever that is not possible, the positional fallback ``'host_<idx>'``
    is returned. This covers a missing or empty mapping, a helper that
    raises, and a role or party id that comes back as ``None``. The function
    therefore always returns a string, which callers concatenate onto
    feature names.

    Args:
        host_param: host parameter dict, expected to carry the key
            'anonymousNameMapping'.
        idx: position of the host in the caller's host list; used only to
            build the fallback name.

    Returns:
        str: ``'<role>_<party_id>'`` or ``'host_<idx>'``.
    """
    fallback_name = 'host_{}'.format(idx)
    try:
        anonymous_dict = host_param['anonymousNameMapping']
        # Only the first key is inspected.
        first_key = next(iter(anonymous_dict), None)
        if first_key is None:
            # Empty mapping: nothing to derive a name from.
            return fallback_name

        anonymous_obj = Anonymous()
        role = anonymous_obj.get_role_from_anonymous_column(first_key)
        party_id = anonymous_obj.get_party_id_from_anonymous_column(first_key)
        if role is None or party_id is None:
            return fallback_name
        return role + '_' + party_id
    except Exception:
        # Deliberately best-effort: any failure while reading or parsing the
        # mapping degrades to the positional name instead of aborting.
        return fallback_name
def merge_sbt(guest_param: dict, guest_meta: dict, host_params: list, host_metas: list, output_format: str,
              target_name='y', host_rename=True):
    """Merge guest and host tree parameters and convert them to LightGBM.

    Every entry of ``host_params`` is folded into ``guest_param`` in order
    (``guest_param`` is modified in place by ``_merge_sbt``). The merged
    parameters and ``guest_meta`` are then parsed into their protobuf
    messages and passed to ``sbt_to_lgb``. When ``host_params`` is empty, the
    guest parameters are converted on their own.

    Args:
        guest_param: guest model parameters as a JSON-compatible dict.
        guest_meta: guest model meta as a JSON-compatible dict; 'taskType'
            is read when ``output_format`` is 'pmml'.
        host_params: list of host parameter dicts, one per host.
        host_metas: accepted for interface compatibility; not used here.
        output_format: 'lgb' or 'lightgbm' for the converted model as
            returned by ``sbt_to_lgb``; 'pmml' for a scikit-learn pipeline.
        target_name: accepted for interface compatibility; not used here.
        host_rename: forwarded to ``_merge_sbt`` as ``rename_host``.

    Returns:
        For 'lgb'/'lightgbm': the value returned by ``sbt_to_lgb``.
        For 'pmml': a ``Pipeline`` with a single step named "lgb" that holds
        an ``LGBMRegressor`` or ``LGBMClassifier`` built around the model.

    Raises:
        ValueError: if ``output_format`` is not one of the supported values.
            This is checked before any merging, so an unsupported format
            leaves ``guest_param`` untouched.
    """
    lgb_formats = ('lgb', 'lightgbm')
    pmml_formats = ('pmml',)
    if output_format not in lgb_formats + pmml_formats:
        raise ValueError('unknown output type {}'.format(output_format))

    # Start from the guest so that an empty host list still yields a model.
    result_param = guest_param
    for idx, host_param in enumerate(host_params):
        host_name = extract_host_name(host_param, idx)
        result_param = _merge_sbt(result_param, host_param, host_name, host_rename)

    pb_param = json_format.Parse(json.dumps(result_param), BoostingTreeModelParam())
    pb_meta = json_format.Parse(json.dumps(guest_meta), BoostingTreeModelMeta())
    # NOTE(review): the meaning of the third positional argument (False) is
    # defined by sbt_to_lgb and is not visible here; confirm against it.
    lgb_model = sbt_to_lgb(pb_param, pb_meta, False)

    if output_format in lgb_formats:
        return lgb_model

    # 'pmml': wrap the booster in a scikit-learn style estimator. The private
    # attributes are filled in by hand because the estimator is never
    # trained through fit().
    classes = list(map(int, pb_param.classes_))
    bst = lgb.Booster(model_str=lgb_model)
    new_clf = lgb.LGBMRegressor() if guest_meta['taskType'] == 'regression' else lgb.LGBMClassifier()
    new_clf._Booster = bst
    new_clf._n_features = len(bst.feature_name())
    new_clf._n_classes = len(np.unique(classes))
    new_clf._le = _LGBMLabelEncoder().fit(np.array(classes))
    new_clf.fitted_ = True
    new_clf._classes = new_clf._le.classes_
    test_pipeline = Pipeline([("lgb", new_clf)])
    return test_pipeline
|