data_transform.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright 2019 The FATE Authors. All Rights Reserved.
  5. #
  6. # Licensed under the Apache License, Version 2.0 (the "License");
  7. # you may not use this file except in compliance with the License.
  8. # You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. ################################################################################
  19. #
  20. #
  21. ################################################################################
  22. import copy
  23. import functools
  24. import numpy as np
  25. from federatedml.feature.instance import Instance
  26. from federatedml.feature.sparse_vector import SparseVector
  27. from federatedml.model_base import ModelBase
  28. from federatedml.protobuf.generated.data_transform_meta_pb2 import DataTransformMeta
  29. from federatedml.protobuf.generated.data_transform_meta_pb2 import DataTransformImputerMeta
  30. from federatedml.protobuf.generated.data_transform_meta_pb2 import DataTransformOutlierMeta
  31. from federatedml.protobuf.generated.data_transform_param_pb2 import DataTransformParam
  32. from federatedml.protobuf.generated.data_transform_param_pb2 import DataTransformImputerParam
  33. from federatedml.protobuf.generated.data_transform_param_pb2 import DataTransformOutlierParam
  34. from federatedml.statistic import data_overview
  35. from federatedml.util import abnormal_detection
  36. from federatedml.util import consts
  37. from federatedml.util import LOGGER
  38. from federatedml.util.io_check import assert_io_num_rows_equal
  39. from federatedml.util.data_format_preprocess import DataFormatPreProcess
  40. from federatedml.util.anonymous_generator_util import Anonymous
  41. # =============================================================================
  42. # DenseFeatureTransformer
  43. # =============================================================================
  44. class DenseFeatureTransformer(object):
  45. def __init__(self, data_transform_param):
  46. self.delimitor = data_transform_param.delimitor
  47. self.data_type = data_transform_param.data_type
  48. self.missing_fill = data_transform_param.missing_fill
  49. self.default_value = data_transform_param.default_value
  50. self.missing_fill_method = data_transform_param.missing_fill_method
  51. self.missing_impute = data_transform_param.missing_impute
  52. self.outlier_replace = data_transform_param.outlier_replace
  53. self.outlier_replace_method = data_transform_param.outlier_replace_method
  54. self.outlier_impute = data_transform_param.outlier_impute
  55. self.outlier_replace_value = data_transform_param.outlier_replace_value
  56. self.with_label = data_transform_param.with_label
  57. self.label_name = data_transform_param.label_name.lower() if self.with_label else None
  58. self.label_type = data_transform_param.label_type if self.with_label else None
  59. self.output_format = data_transform_param.output_format
  60. self.missing_impute_rate = None
  61. self.outlier_replace_rate = None
  62. self.header = None
  63. self.sid_name = None
  64. self.exclusive_data_type_fid_map = {}
  65. self.match_id_name = data_transform_param.match_id_name
  66. self.match_id_index = 0
  67. self.with_match_id = data_transform_param.with_match_id
  68. self.anonymous_generator = None
  69. self.anonymous_header = None
  70. if data_transform_param.exclusive_data_type:
  71. self.exclusive_data_type = dict([(k.lower(), v)
  72. for k, v in data_transform_param.exclusive_data_type.items()])
  73. else:
  74. self.exclusive_data_type = None
  75. def _update_param(self, schema):
  76. meta = schema["meta"]
  77. self.delimitor = meta.get("delimiter", ",")
  78. self.data_type = meta.get("data_type")
  79. self.with_label = meta.get("with_label", False)
  80. if self.with_label:
  81. self.label_type = meta.get("label_type", "int")
  82. self.label_name = meta.get("label_name", '')
  83. self.with_match_id = meta.get("with_match_id", False)
  84. if self.with_match_id:
  85. match_id_name = schema.get("match_id_name", [])
  86. if not self.match_id_name:
  87. if isinstance(match_id_name, list) and len(self.match_id_name) > 1:
  88. raise ValueError("Multiple Match ID exist, please specified the one to use")
  89. self.match_id_name = match_id_name[0] if isinstance(match_id_name, list) else match_id_name
  90. self.match_id_index = schema["original_index_info"]["match_id_index"][0]
  91. else:
  92. try:
  93. idx = match_id_name.index(self.match_id_name)
  94. except ValueError:
  95. raise ValueError(f"Can not find {self.match_id_name} in {match_id_name}")
  96. self.match_id_index = schema["original_index_info"]["match_id_index"][idx]
  97. schema["match_id_name"] = self.match_id_name
  98. header = schema["header"]
  99. exclusive_data_type = meta.get("exclusive_data_type", None)
  100. if exclusive_data_type:
  101. self.exclusive_data_type = dict([(k.lower(), v) for k, v in exclusive_data_type.items()])
  102. for idx, col_name in enumerate(header):
  103. if col_name in self.exclusive_data_type:
  104. self.exclusive_data_type_fid_map[idx] = self.exclusive_data_type[col_name]
  105. def extract_feature_value(self, value, header_index=None):
  106. if not header_index:
  107. return []
  108. value = value.split(self.delimitor, -1)
  109. if len(value) <= header_index[-1]:
  110. raise ValueError("Feature shape is smaller than header shape")
  111. feature_values = []
  112. for idx in header_index:
  113. feature_values.append(value[idx])
  114. return feature_values
  115. def read_data(self, input_data, mode="fit"):
  116. LOGGER.info("start to read dense data and change data to instance")
  117. abnormal_detection.empty_table_detection(input_data)
  118. schema = copy.deepcopy(input_data.schema)
  119. if not schema.get("meta"):
  120. LOGGER.warning("Data meta is supported to be set with data uploading or binding, "
  121. "please refer to data transform using guides.")
  122. meta = dict(input_format="dense",
  123. delimiter=self.delimitor,
  124. with_label=self.with_label,
  125. label_name=self.label_name,
  126. with_match_id=self.with_match_id,
  127. data_type=self.data_type,
  128. )
  129. if mode == "transform" and self.with_label \
  130. and self.label_name not in schema["header"].split(self.delimitor, -1):
  131. del meta["label_name"]
  132. del meta["with_label"]
  133. schema["meta"] = meta
  134. generated_header = DataFormatPreProcess.generate_header(input_data, schema)
  135. schema.update(generated_header)
  136. schema = self.anonymous_generator.generate_anonymous_header(schema)
  137. set_schema(input_data, schema)
  138. else:
  139. self._update_param(schema)
  140. header = schema["header"]
  141. anonymous_header = schema["anonymous_header"]
  142. training_header = self.header
  143. if mode == "transform":
  144. if (set(self.header) & set(header)) != set(self.header):
  145. raise ValueError(f"Transform Data's header is {header}, expect {self.header}")
  146. self.header = header
  147. if not self.anonymous_header:
  148. self.anonymous_header = anonymous_header
  149. else:
  150. self.header = header
  151. self.anonymous_header = anonymous_header
  152. header_index = schema["original_index_info"]["header_index"]
  153. extract_feature_func = functools.partial(self.extract_feature_value,
  154. header_index=header_index)
  155. input_data_features = input_data.mapValues(extract_feature_func)
  156. # input_data_features.schema = input_data.schema
  157. input_data_features.schema = schema
  158. input_data_labels = None
  159. input_data_match_id = None
  160. if "label_name" in schema:
  161. label_index = schema["original_index_info"]["label_index"]
  162. input_data_labels = input_data.mapValues(lambda value: value.split(self.delimitor, -1)[label_index])
  163. if self.with_match_id:
  164. input_data_match_id = input_data.mapValues(
  165. lambda value: value.split(self.delimitor, -1)[self.match_id_index])
  166. if mode == "fit":
  167. data_instance = self.fit(input_data, input_data_features, input_data_labels, input_data_match_id)
  168. set_schema(data_instance, schema)
  169. else:
  170. data_instance = self.transform(input_data_features, input_data_labels, input_data_match_id)
  171. data_instance = data_overview.header_alignment(data_instance, training_header, self.anonymous_header)
  172. self.header = training_header
  173. return data_instance
  174. def fit(self, input_data, input_data_features, input_data_labels, input_data_match_id):
  175. input_data_features = self.fill_missing_value(input_data_features, "fit")
  176. input_data_features = self.replace_outlier_value(input_data_features, "fit")
  177. data_instance = self.gen_data_instance(input_data_features, input_data_labels, input_data_match_id)
  178. return data_instance
  179. @assert_io_num_rows_equal
  180. def transform(self, input_data_features, input_data_labels, input_data_match_id):
  181. schema = input_data_features.schema
  182. input_data_features = self.fill_missing_value(input_data_features, "transform")
  183. input_data_features = self.replace_outlier_value(input_data_features, "transform")
  184. data_instance = self.gen_data_instance(input_data_features, input_data_labels, input_data_match_id)
  185. data_instance.schema = schema
  186. return data_instance
  187. def fill_missing_value(self, input_data_features, mode="fit"):
  188. if self.missing_fill:
  189. from federatedml.feature.imputer import Imputer
  190. imputer_processor = Imputer(self.missing_impute)
  191. if mode == "fit":
  192. input_data_features, self.default_value = imputer_processor.fit(input_data_features,
  193. replace_method=self.missing_fill_method,
  194. replace_value=self.default_value)
  195. if self.missing_impute is None:
  196. self.missing_impute = imputer_processor.get_missing_value_list()
  197. else:
  198. input_data_features = imputer_processor.transform(input_data_features,
  199. transform_value=self.default_value)
  200. if self.missing_impute is None:
  201. self.missing_impute = imputer_processor.get_missing_value_list()
  202. self.missing_impute_rate = imputer_processor.get_impute_rate(mode)
  203. return input_data_features
  204. def replace_outlier_value(self, input_data_features, mode="fit"):
  205. if self.outlier_replace:
  206. from federatedml.feature.imputer import Imputer
  207. imputer_processor = Imputer(self.outlier_impute)
  208. if mode == "fit":
  209. input_data_features, self.outlier_replace_value = \
  210. imputer_processor.fit(input_data_features,
  211. replace_method=self.outlier_replace_method,
  212. replace_value=self.outlier_replace_value)
  213. if self.outlier_impute is None:
  214. self.outlier_impute = imputer_processor.get_missing_value_list()
  215. else:
  216. input_data_features = imputer_processor.transform(input_data_features,
  217. transform_value=self.outlier_replace_value)
  218. self.outlier_replace_rate = imputer_processor.get_impute_rate(mode)
  219. return input_data_features
  220. def gen_data_instance(self, input_data_features, input_data_labels, input_data_match_id):
  221. if input_data_labels:
  222. data_instance = input_data_features.join(input_data_labels,
  223. lambda features, label: self.to_instance(features, label))
  224. else:
  225. data_instance = input_data_features.mapValues(lambda features: self.to_instance(features))
  226. if self.with_match_id:
  227. data_instance = data_instance.join(input_data_match_id, self.append_match_id)
  228. return data_instance
  229. def append_match_id(self, inst, match_id):
  230. inst.inst_id = match_id
  231. return inst
  232. def to_instance(self, features, label=None):
  233. if self.header is None and len(features) != 0:
  234. raise ValueError("features shape {} not equal to header shape 0".format(len(features)))
  235. elif self.header is not None and len(self.header) != len(features):
  236. raise ValueError("features shape {} not equal to header shape {}".format(len(features), len(self.header)))
  237. if label is not None:
  238. if self.label_type == 'int':
  239. label = int(label)
  240. elif self.label_type in ["float", "float64"]:
  241. label = float(label)
  242. format_features = DenseFeatureTransformer.gen_output_format(features, self.data_type,
  243. self.exclusive_data_type_fid_map,
  244. self.output_format,
  245. missing_impute=self.missing_impute)
  246. else:
  247. format_features = DenseFeatureTransformer.gen_output_format(features, self.data_type,
  248. self.exclusive_data_type_fid_map,
  249. self.output_format,
  250. missing_impute=self.missing_impute)
  251. return Instance(inst_id=None,
  252. features=format_features,
  253. label=label)
  254. @staticmethod
  255. def gen_output_format(features, data_type='float', exclusive_data_type_fid_map=None,
  256. output_format='dense', missing_impute=None):
  257. if output_format not in ["dense", "sparse"]:
  258. raise ValueError("output format {} is not define".format(output_format))
  259. missing_impute_dtype_set = {"int", "int64", "long", "float", "float64", "double"}
  260. missing_impute_value_set = {'', 'NULL', 'null', "NA"}
  261. type_mapping = dict()
  262. if output_format == "dense":
  263. # format_features = copy.deepcopy(features)
  264. format_features = [None] * len(features)
  265. for fid in range(len(features)):
  266. if exclusive_data_type_fid_map is not None and fid in exclusive_data_type_fid_map:
  267. dtype = exclusive_data_type_fid_map[fid]
  268. else:
  269. dtype = data_type
  270. if dtype in missing_impute_dtype_set:
  271. if (missing_impute is not None and features[fid] in missing_impute) or \
  272. (missing_impute is None and features[fid] in missing_impute_value_set):
  273. format_features[fid] = np.nan
  274. continue
  275. format_features[fid] = features[fid]
  276. if exclusive_data_type_fid_map:
  277. if dtype not in type_mapping:
  278. np_type = getattr(np, dtype)
  279. type_mapping[dtype] = np_type
  280. format_features[fid] = type_mapping[dtype](format_features[fid])
  281. if exclusive_data_type_fid_map:
  282. return np.asarray(format_features, dtype=object)
  283. else:
  284. return np.asarray(format_features, dtype=data_type)
  285. indices = []
  286. data = []
  287. column_shape = len(features)
  288. non_zero = 0
  289. for i in range(column_shape):
  290. if (missing_impute is not None and features[i] in missing_impute) or \
  291. (missing_impute is None and features[i] in missing_impute_value_set):
  292. indices.append(i)
  293. data.append(np.nan)
  294. non_zero += 1
  295. elif data_type in ['float', 'float64', "double"]:
  296. if np.fabs(float(features[i])) < consts.FLOAT_ZERO:
  297. continue
  298. indices.append(i)
  299. data.append(float(features[i]))
  300. non_zero += 1
  301. elif data_type in ['int', "int64", "long"]:
  302. if int(features[i]) == 0:
  303. continue
  304. indices.append(i)
  305. data.append(int(features[i]))
  306. else:
  307. indices.append(i)
  308. data.append(features[i])
  309. return SparseVector(indices, data, column_shape)
  310. def get_summary(self):
  311. if not self.missing_fill and not self.outlier_replace:
  312. return {}
  313. summary_buf = {}
  314. if self.missing_fill:
  315. missing_summary = dict()
  316. missing_summary["missing_value"] = list(self.missing_impute)
  317. missing_summary["missing_impute_value"] = dict(zip(self.header, self.default_value))
  318. missing_summary["missing_impute_rate"] = dict(zip(self.header, self.missing_impute_rate))
  319. summary_buf["missing_fill_info"] = missing_summary
  320. if self.outlier_replace:
  321. outlier_replace_summary = dict()
  322. outlier_replace_summary["outlier_value"] = list(self.outlier_impute)
  323. outlier_replace_summary["outlier_replace_value"] = dict(zip(self.header, self.outlier_replace_value))
  324. outlier_replace_summary["outlier_replace_rate"] = dict(zip(self.header, self.outlier_replace_rate))
  325. summary_buf["outlier_replace_rate"] = outlier_replace_summary
  326. return summary_buf
  327. def save_model(self):
  328. transform_meta, transform_param = save_data_transform_model(input_format="dense",
  329. delimitor=self.delimitor,
  330. data_type=self.data_type,
  331. exclusive_data_type=self.exclusive_data_type,
  332. with_label=self.with_label,
  333. label_type=self.label_type,
  334. output_format=self.output_format,
  335. header=self.header,
  336. sid_name=self.sid_name,
  337. label_name=self.label_name,
  338. with_match_id=self.with_match_id,
  339. model_name="DenseFeatureTransformer",
  340. anonymous_header=self.anonymous_header)
  341. missing_imputer_meta, missing_imputer_param = save_missing_imputer_model(self.missing_fill,
  342. self.missing_fill_method,
  343. self.missing_impute,
  344. self.default_value,
  345. self.missing_impute_rate,
  346. self.header,
  347. "Imputer")
  348. transform_meta.imputer_meta.CopyFrom(missing_imputer_meta)
  349. transform_param.imputer_param.CopyFrom(missing_imputer_param)
  350. outlier_meta, outlier_param = save_outlier_model(self.outlier_replace,
  351. self.outlier_replace_method,
  352. self.outlier_impute,
  353. self.outlier_replace_value,
  354. self.outlier_replace_rate,
  355. self.header,
  356. "Outlier")
  357. transform_meta.outlier_meta.CopyFrom(outlier_meta)
  358. transform_param.outlier_param.CopyFrom(outlier_param)
  359. return {"DataTransformMeta": transform_meta,
  360. "DataTransformParam": transform_param
  361. }
  362. def load_model(self, model_meta, model_param):
  363. self.delimitor, self.data_type, self.exclusive_data_type, _1, _2, self.with_label, \
  364. self.label_type, self.output_format, self.header, self.sid_name, self.label_name, self.with_match_id, self.anonymous_header = \
  365. load_data_transform_model("DenseFeatureTransformer", model_meta, model_param)
  366. self.missing_fill, self.missing_fill_method, \
  367. self.missing_impute, self.default_value = load_missing_imputer_model(self.header,
  368. "Imputer",
  369. model_meta.imputer_meta,
  370. model_param.imputer_param)
  371. self.outlier_replace, self.outlier_replace_method, \
  372. self.outlier_impute, self.outlier_replace_value = load_outlier_model(self.header,
  373. "Outlier",
  374. model_meta.outlier_meta,
  375. model_param.outlier_param)
  376. # =============================================================================
  377. # SparseFeatureTransformer: mainly for libsvm input format
  378. # =============================================================================
  379. class SparseFeatureTransformer(object):
  380. def __init__(self, data_transform_param):
  381. self.delimitor = data_transform_param.delimitor
  382. self.data_type = data_transform_param.data_type
  383. self.label_type = data_transform_param.label_type
  384. self.output_format = data_transform_param.output_format
  385. self.header = None
  386. self.sid_name = "sid"
  387. self.with_match_id = data_transform_param.with_match_id
  388. self.match_id_name = "match_id" if self.with_match_id else None
  389. self.match_id_index = data_transform_param.match_id_index
  390. self.with_label = data_transform_param.with_label
  391. self.label_name = data_transform_param.label_name.lower() if self.with_label else None
  392. self.anonymous_generator = None
  393. self.anonymous_header = None
  394. def _update_param(self, schema):
  395. meta = schema["meta"]
  396. self.delimitor = meta.get("delimiter", ",")
  397. self.data_type = meta.get("data_type")
  398. self.with_label = meta.get("with_label", False)
  399. if self.with_label:
  400. self.label_type = meta.get("label_type", "int")
  401. self.label_name = meta.get("label_name", "")
  402. self.with_match_id = meta.get("with_match_id", False)
  403. if self.with_match_id:
  404. match_id_name = schema.get("match_id_name")
  405. if isinstance(match_id_name, list):
  406. self.match_id_name = match_id_name[self.match_id_index]
  407. else:
  408. self.match_id_name = match_id_name
  409. schema["match_id_name"] = self.match_id_name
  410. def read_data(self, input_data, mode="fit"):
  411. LOGGER.info("start to read sparse data and change data to instance")
  412. abnormal_detection.empty_table_detection(input_data)
  413. schema = copy.deepcopy(input_data.schema)
  414. if not schema.get("meta", {}):
  415. LOGGER.warning("Data meta is supported to be set with data uploading or binding, "
  416. "please refer to data transform using guides.")
  417. meta = dict(input_format="sparse",
  418. delimiter=self.delimitor,
  419. with_label=self.with_label,
  420. with_match_id=self.with_match_id,
  421. data_type=self.data_type)
  422. schema["meta"] = meta
  423. generated_header = DataFormatPreProcess.generate_header(input_data, schema)
  424. schema.update(generated_header)
  425. schema = self.anonymous_generator.generate_anonymous_header(schema)
  426. set_schema(input_data, schema)
  427. else:
  428. self._update_param(schema)
  429. if mode == "fit":
  430. self.header = schema["header"]
  431. self.anonymous_header = schema["anonymous_header"]
  432. data_instance = self.fit(input_data)
  433. else:
  434. if not self.anonymous_header:
  435. header_set = set(self.header)
  436. self.anonymous_header = []
  437. for column, anonymous_column in zip(schema["header"], schema["anonymous_header"]):
  438. if column not in header_set:
  439. continue
  440. self.anonymous_header.append(anonymous_column)
  441. schema["header"] = self.header
  442. schema["anonymous_header"] = self.anonymous_header
  443. set_schema(input_data, schema)
  444. data_instance = self.transform(input_data)
  445. set_schema(data_instance, schema)
  446. return data_instance
  447. def fit(self, input_data):
  448. max_feature = len(self.header)
  449. if max_feature == 0:
  450. raise ValueError("no feature value in input data, please check!")
  451. data_instance = self.gen_data_instance(input_data, max_feature)
  452. return data_instance
  453. def transform(self, input_data):
  454. max_feature = len(self.header)
  455. data_instance = self.gen_data_instance(input_data, max_feature)
  456. return data_instance
  457. def gen_data_instance(self, input_data, max_feature):
  458. id_range = input_data.schema["meta"].get("id_range", 0)
  459. params = [self.delimitor, self.data_type,
  460. self.label_type, self.with_match_id,
  461. self.match_id_index, id_range,
  462. self.output_format,
  463. self.with_label, max_feature]
  464. to_instance_with_param = functools.partial(self.to_instance, params)
  465. data_instance = input_data.mapValues(to_instance_with_param)
  466. return data_instance
  467. @staticmethod
  468. def to_instance(param_list, value):
  469. delimitor = param_list[0]
  470. data_type = param_list[1]
  471. label_type = param_list[2]
  472. with_match_id = param_list[3]
  473. match_id_index = param_list[4]
  474. id_range = param_list[5]
  475. output_format = param_list[6]
  476. with_label = param_list[7]
  477. max_fid = param_list[8]
  478. if output_format not in ["dense", "sparse"]:
  479. raise ValueError("output format {} is not define".format(output_format))
  480. cols = value.split(delimitor, -1)
  481. offset = 0
  482. if with_match_id:
  483. offset = id_range if id_range else 1
  484. match_id = cols[match_id_index]
  485. else:
  486. match_id = None
  487. label = None
  488. if with_label:
  489. label = cols[offset]
  490. if label_type == 'int':
  491. label = int(label)
  492. elif label_type in ["float", "float64"]:
  493. label = float(label)
  494. offset += 1
  495. fid_value = []
  496. for i in range(offset, len(cols)):
  497. fid, val = cols[i].split(":", -1)
  498. fid = int(fid)
  499. if data_type in ["float", "float64"]:
  500. val = float(val)
  501. elif data_type in ["int", "int64"]:
  502. val = int(val)
  503. fid_value.append((fid, val))
  504. if output_format == "dense":
  505. features = [0 for i in range(max_fid)]
  506. for fid, val in fid_value:
  507. features[fid] = val
  508. features = np.asarray(features, dtype=data_type)
  509. else:
  510. indices = []
  511. data = []
  512. for fid, val in fid_value:
  513. indices.append(fid)
  514. data.append(val)
  515. features = SparseVector(indices, data, max_fid)
  516. return Instance(inst_id=match_id,
  517. features=features,
  518. label=label)
  519. def save_model(self):
  520. transform_meta, transform_param = save_data_transform_model(input_format="sparse",
  521. delimitor=self.delimitor,
  522. data_type=self.data_type,
  523. label_type=self.label_type,
  524. output_format=self.output_format,
  525. header=self.header,
  526. sid_name=self.sid_name,
  527. label_name=self.label_name,
  528. with_match_id=self.with_match_id,
  529. with_label=self.with_label,
  530. model_name="SparseFeatureTransformer",
  531. anonymous_header=self.anonymous_header)
  532. missing_imputer_meta, missing_imputer_param = save_missing_imputer_model(missing_fill=False,
  533. model_name="Imputer")
  534. transform_meta.imputer_meta.CopyFrom(missing_imputer_meta)
  535. transform_param.imputer_param.CopyFrom(missing_imputer_param)
  536. outlier_meta, outlier_param = save_outlier_model(outlier_replace=False,
  537. model_name="Outlier")
  538. transform_meta.outlier_meta.CopyFrom(outlier_meta)
  539. transform_param.outlier_param.CopyFrom(outlier_param)
  540. return {"DataTransformMeta": transform_meta,
  541. "DataTransformParam": transform_param
  542. }
  543. def load_model(self, model_meta, model_param):
  544. self.delimitor, self.data_type, _0, _1, _2, self.with_label, self.label_type, self.output_format, \
  545. self.header, self.sid_name, self.label_name, self.with_match_id, self.anonymous_header = \
  546. load_data_transform_model(
  547. "SparseFeatureTransformer",
  548. model_meta,
  549. model_param)
  550. # =============================================================================
  551. # SparseTagTransformer: mainly for tag data
  552. # =============================================================================
  553. class SparseTagTransformer(object):
  554. def __init__(self, data_transform_param):
  555. self.delimitor = data_transform_param.delimitor
  556. self.data_type = data_transform_param.data_type
  557. self.tag_with_value = data_transform_param.tag_with_value
  558. self.tag_value_delimitor = data_transform_param.tag_value_delimitor
  559. self.with_label = data_transform_param.with_label
  560. self.label_type = data_transform_param.label_type if self.with_label else None
  561. self.output_format = data_transform_param.output_format
  562. self.header = None
  563. self.sid_name = "sid"
  564. self.label_name = data_transform_param.label_name.lower() if data_transform_param.label_name else None
  565. self.missing_fill = data_transform_param.missing_fill
  566. self.missing_fill_method = data_transform_param.missing_fill_method
  567. self.default_value = data_transform_param.default_value
  568. self.with_match_id = data_transform_param.with_match_id
  569. self.match_id_index = data_transform_param.match_id_index
  570. self.match_id_name = "match_id" if self.with_match_id else None
  571. self.missing_impute_rate = None
  572. self.missing_impute = None
  573. self.anonymous_generator = None
  574. self.anonymous_header = None
  575. def _update_param(self, schema):
  576. meta = schema["meta"]
  577. self.delimitor = meta.get("delimiter", ",")
  578. self.data_type = meta.get("data_type")
  579. self.tag_with_value = meta.get("tag_with_value")
  580. self.tag_value_delimitor = meta.get("tag_value_delimiter", ":")
  581. self.with_label = meta.get("with_label", False)
  582. if self.with_label:
  583. self.label_type = meta.get("label_type", "int")
  584. self.label_name = meta.get("label_name")
  585. self.with_match_id = meta.get("with_match_id", False)
  586. if self.with_match_id:
  587. match_id_name = schema.get("match_id_name")
  588. if isinstance(match_id_name, list):
  589. if not isinstance(self.match_id_index, int) or self.match_id_index >= len(match_id_name):
  590. raise ValueError(f"match id index should between 0 and {len(match_id_name) - 1}, "
  591. f"but {self.match_id_index} is given")
  592. self.match_id_name = match_id_name[self.match_id_index]
  593. else:
  594. if self.match_id_index != 0:
  595. raise ValueError("Only one match_id exist, match_id_index should be 0")
  596. self.match_id_name = match_id_name
  597. schema["match_id_name"] = self.match_id_name
  598. def read_data(self, input_data, mode="fit"):
  599. LOGGER.info("start to read sparse data and change data to instance")
  600. abnormal_detection.empty_table_detection(input_data)
  601. schema = copy.deepcopy(input_data.schema)
  602. if not schema.get("meta", {}):
  603. LOGGER.warning("Data meta is supported to be set with data uploading or binding, "
  604. "please refer to data transform using guides.")
  605. meta = dict(input_format="tag",
  606. delimiter=self.delimitor,
  607. with_label=self.with_label,
  608. with_match_id=self.with_match_id,
  609. tag_with_value=self.tag_with_value,
  610. tag_value_delimiter=self.tag_value_delimitor,
  611. data_type=self.data_type)
  612. schema["meta"] = meta
  613. generated_header = DataFormatPreProcess.generate_header(input_data, schema)
  614. schema.update(generated_header)
  615. schema = self.anonymous_generator.generate_anonymous_header(schema)
  616. set_schema(input_data, schema)
  617. else:
  618. self._update_param(schema)
  619. if mode == "fit":
  620. self.header = schema["header"]
  621. self.anonymous_header = schema["anonymous_header"]
  622. data_instance = self.fit(input_data)
  623. else:
  624. if not self.anonymous_header:
  625. header_set = set(self.header)
  626. self.anonymous_header = []
  627. for column, anonymous_column in zip(schema["header"], schema["anonymous_header"]):
  628. if column not in header_set:
  629. continue
  630. self.anonymous_header.append(anonymous_column)
  631. schema["header"] = self.header
  632. schema["anonymous_header"] = self.anonymous_header
  633. set_schema(input_data, schema)
  634. data_instance = self.transform(input_data)
  635. set_schema(data_instance, schema)
  636. return data_instance
  637. @staticmethod
  638. def change_tag_to_str(value, tags_dict=None, delimitor=",", feature_offset=0,
  639. tag_value_delimitor=":"):
  640. vals = value.split(delimitor, -1)
  641. ret = [''] * len(tags_dict)
  642. vals = vals[feature_offset:]
  643. for i in range(len(vals)):
  644. tag, value = vals[i].split(tag_value_delimitor, -1)
  645. idx = tags_dict.get(tag, None)
  646. if idx is not None:
  647. ret[idx] = value
  648. return ret
  649. @staticmethod
  650. def change_str_to_tag(value, tags_dict=None, delimitor=",", tag_value_delimitor=":"):
  651. ret = [None] * len(tags_dict)
  652. tags = sorted(list(tags_dict.keys()))
  653. for i in range(len(value)):
  654. tag, val = tags[i], value[i]
  655. ret[i] = tag_value_delimitor.join([tag, val])
  656. return delimitor.join(ret)
  657. def fill_missing_value(self, input_data, tags_dict, schema, mode="fit"):
  658. feature_offset = DataFormatPreProcess.get_feature_offset(schema)
  659. str_trans_method = functools.partial(self.change_tag_to_str,
  660. tags_dict=tags_dict,
  661. delimitor=self.delimitor,
  662. feature_offset=feature_offset,
  663. tag_value_delimitor=self.tag_value_delimitor)
  664. input_data = input_data.mapValues(str_trans_method)
  665. set_schema(input_data, schema)
  666. from federatedml.feature.imputer import Imputer
  667. imputer_processor = Imputer()
  668. if mode == "fit":
  669. data, self.default_value = imputer_processor.fit(input_data,
  670. replace_method=self.missing_fill_method,
  671. replace_value=self.default_value)
  672. LOGGER.debug("self.default_value is {}".format(self.default_value))
  673. else:
  674. data = imputer_processor.transform(input_data,
  675. transform_value=self.default_value)
  676. if self.missing_impute is None:
  677. self.missing_impute = imputer_processor.get_missing_value_list()
  678. LOGGER.debug("self.missing_impute is {}".format(self.missing_impute))
  679. self.missing_impute_rate = imputer_processor.get_impute_rate(mode)
  680. str_trans_tag_method = functools.partial(self.change_str_to_tag,
  681. tags_dict=tags_dict,
  682. delimitor=self.delimitor,
  683. tag_value_delimitor=self.tag_value_delimitor)
  684. data = data.mapValues(str_trans_tag_method)
  685. return data
  686. def fit(self, input_data):
  687. schema = input_data.schema
  688. tags_dict = dict(zip(schema["header"], range(len(schema["header"]))))
  689. if self.tag_with_value and self.missing_fill:
  690. input_data = self.fill_missing_value(input_data, tags_dict, schema, mode="fit")
  691. data_instance = self.gen_data_instance(input_data, schema["meta"], tags_dict)
  692. return data_instance
  693. def transform(self, input_data):
  694. schema = input_data.schema
  695. tags_dict = dict(zip(self.header, range(len(self.header))))
  696. if self.tag_with_value and self.missing_fill:
  697. input_data = self.fill_missing_value(input_data, tags_dict, schema, mode="transform")
  698. data_instance = self.gen_data_instance(input_data, schema["meta"], tags_dict)
  699. return data_instance
  700. def gen_data_instance(self, input_data, meta, tags_dict):
  701. params = [self.delimitor,
  702. self.data_type,
  703. self.tag_with_value,
  704. self.tag_value_delimitor,
  705. self.with_label,
  706. self.with_match_id,
  707. self.match_id_index,
  708. meta.get("id_range", 0),
  709. self.label_type,
  710. self.output_format,
  711. tags_dict]
  712. to_instance_with_param = functools.partial(self.to_instance, params)
  713. data_instance = input_data.mapValues(to_instance_with_param)
  714. return data_instance
  715. def get_summary(self):
  716. if not self.missing_fill:
  717. return {}
  718. missing_summary = dict()
  719. missing_summary["missing_value"] = list(self.missing_impute)
  720. missing_summary["missing_impute_value"] = dict(zip(self.header, self.default_value))
  721. missing_summary["missing_impute_rate"] = dict(zip(self.header, self.missing_impute_rate))
  722. summary_buf = {"missing_fill_info": missing_summary}
  723. return summary_buf
  724. @staticmethod
  725. def to_instance(param_list, value):
  726. delimitor = param_list[0]
  727. data_type = param_list[1]
  728. tag_with_value = param_list[2]
  729. tag_value_delimitor = param_list[3]
  730. with_label = param_list[4]
  731. with_match_id = param_list[5]
  732. match_id_index = param_list[6]
  733. id_range = param_list[7]
  734. label_type = param_list[8]
  735. output_format = param_list[9]
  736. tags_dict = param_list[10]
  737. if output_format not in ["dense", "sparse"]:
  738. raise ValueError("output format {} is not define".format(output_format))
  739. cols = value.split(delimitor, -1)
  740. offset = 0
  741. label = None
  742. match_id = None
  743. if with_match_id:
  744. offset = id_range if id_range else 1
  745. if offset == 0:
  746. offset = 1
  747. match_id = cols[match_id_index]
  748. if with_label:
  749. label = cols[offset]
  750. offset += 1
  751. if label_type == 'int':
  752. label = int(label)
  753. elif label_type in ["float", "float64"]:
  754. label = float(label)
  755. if output_format == "dense":
  756. features = [0 for i in range(len(tags_dict))]
  757. for fea in cols[offset:]:
  758. if tag_with_value:
  759. _tag, _val = fea.split(tag_value_delimitor, -1)
  760. if _tag in tags_dict:
  761. features[tags_dict.get(_tag)] = _val
  762. else:
  763. if fea in tags_dict:
  764. features[tags_dict.get(fea)] = 1
  765. features = np.asarray(features, dtype=data_type)
  766. else:
  767. indices = []
  768. data = []
  769. for fea in cols[offset:]:
  770. if tag_with_value:
  771. _tag, _val = fea.split(tag_value_delimitor, -1)
  772. else:
  773. _tag = fea
  774. _val = 1
  775. if _tag not in tags_dict:
  776. continue
  777. indices.append(tags_dict.get(_tag))
  778. if data_type in ["float", "float64"]:
  779. _val = float(_val)
  780. elif data_type in ["int", "int64", "long"]:
  781. _val = int(_val)
  782. elif data_type == "str":
  783. _val = str(_val)
  784. data.append(_val)
  785. features = SparseVector(indices, data, len(tags_dict))
  786. return Instance(inst_id=match_id,
  787. features=features,
  788. label=label)
  789. def save_model(self):
  790. transform_meta, transform_param = save_data_transform_model(input_format="tag",
  791. delimitor=self.delimitor,
  792. data_type=self.data_type,
  793. tag_with_value=self.tag_with_value,
  794. tag_value_delimitor=self.tag_value_delimitor,
  795. with_label=self.with_label,
  796. label_type=self.label_type,
  797. with_match_id=self.with_match_id,
  798. output_format=self.output_format,
  799. header=self.header,
  800. sid_name=self.sid_name,
  801. label_name=self.label_name,
  802. model_name="Transformer",
  803. anonymous_header=self.anonymous_header)
  804. missing_imputer_meta, missing_imputer_param = save_missing_imputer_model(self.missing_fill,
  805. self.missing_fill_method,
  806. self.missing_impute,
  807. self.default_value,
  808. self.missing_impute_rate,
  809. self.header,
  810. "Imputer")
  811. transform_meta.imputer_meta.CopyFrom(missing_imputer_meta)
  812. transform_param.imputer_param.CopyFrom(missing_imputer_param)
  813. outlier_meta, outlier_param = save_outlier_model(outlier_replace=False,
  814. model_name="Outlier")
  815. transform_meta.outlier_meta.CopyFrom(outlier_meta)
  816. transform_param.outlier_param.CopyFrom(outlier_param)
  817. return {"DataTransformMeta": transform_meta,
  818. "DataTransformParam": transform_param
  819. }
  820. def load_model(self, model_meta, model_param):
  821. self.delimitor, self.data_type, _0, self.tag_with_value, self.tag_value_delimitor, self.with_label, \
  822. self.label_type, self.output_format, self.header, self.sid_name, self.label_name, self.with_match_id, \
  823. self.anonymous_header = load_data_transform_model(
  824. "SparseTagTransformer",
  825. model_meta,
  826. model_param)
  827. self.missing_fill, self.missing_fill_method, \
  828. self.missing_impute, self.default_value = load_missing_imputer_model(self.header,
  829. "Imputer",
  830. model_meta.imputer_meta,
  831. model_param.imputer_param)
  832. class DataTransform(ModelBase):
  833. def __init__(self):
  834. super(DataTransform, self).__init__()
  835. self.transformer = None
  836. from federatedml.param.data_transform_param import DataTransformParam
  837. self.model_param = DataTransformParam()
  838. self._input_model_meta = None
  839. self._input_model_param = None
  840. def _load_reader(self, schema=None):
  841. if schema is None or not schema.get("meta", {}):
  842. input_format = self.model_param.input_format
  843. else:
  844. input_format = schema["meta"].get("input_format")
  845. if input_format == "dense":
  846. self.transformer = DenseFeatureTransformer(self.model_param)
  847. elif input_format == "sparse" or input_format == "svmlight":
  848. self.transformer = SparseFeatureTransformer(self.model_param)
  849. elif input_format == "tag":
  850. self.transformer = SparseTagTransformer(self.model_param)
  851. else:
  852. raise ValueError("Cannot recognize input format")
  853. if self._input_model_meta:
  854. self.transformer.load_model(self._input_model_meta, self._input_model_param)
  855. self._input_model_meta, self._input_model_param = None, None
  856. self.transformer.anonymous_generator = Anonymous(self.role, self.component_properties.local_partyid)
  857. def _init_model(self, model_param):
  858. self.model_param = model_param
  859. def load_model(self, model_dict):
  860. for _, value in model_dict["model"].items():
  861. for model in value:
  862. if model.endswith("Meta"):
  863. self._input_model_meta = value[model]
  864. if model.endswith("Param"):
  865. self._input_model_param = value[model]
  866. def fit(self, data):
  867. self._load_reader(data.schema)
  868. data_inst = self.transformer.read_data(data, "fit")
  869. if isinstance(self.transformer, (DenseFeatureTransformer, SparseTagTransformer)):
  870. summary_buf = self.transformer.get_summary()
  871. if summary_buf:
  872. self.set_summary(summary_buf)
  873. clear_schema(data_inst)
  874. return data_inst
  875. def transform(self, data):
  876. self._load_reader(data.schema)
  877. data_inst = self.transformer.read_data(data, "transform")
  878. clear_schema(data_inst)
  879. return data_inst
  880. def export_model(self):
  881. if not self.need_run:
  882. model_meta = DataTransformMeta()
  883. model_meta.need_run = False
  884. model_param = DataTransformParam()
  885. model_dict = dict(DataTransformMeta=model_param,
  886. DataTransformParam=model_param)
  887. else:
  888. model_dict = self.transformer.save_model()
  889. return model_dict
  890. def clear_schema(data_inst):
  891. ret_schema = copy.deepcopy(data_inst.schema)
  892. key_words = {"sid", "header", "anonymous_header", "label_name",
  893. "anonymous_label", "match_id_name"}
  894. for key in data_inst.schema:
  895. if key not in key_words:
  896. del ret_schema[key]
  897. data_inst.schema = ret_schema
  898. def set_schema(data_instance, schema):
  899. data_instance.schema = schema
  900. def save_data_transform_model(input_format="dense",
  901. delimitor=",",
  902. data_type="str",
  903. exclusive_data_type=None,
  904. tag_with_value=False,
  905. tag_value_delimitor=":",
  906. with_label=False,
  907. label_name='',
  908. label_type="int",
  909. output_format="dense",
  910. header=None,
  911. sid_name=None,
  912. with_match_id=False,
  913. model_name="DataTransform",
  914. anonymous_header=None):
  915. model_meta = DataTransformMeta()
  916. model_param = DataTransformParam()
  917. model_meta.input_format = input_format
  918. model_meta.delimitor = delimitor
  919. model_meta.data_type = data_type
  920. model_meta.tag_with_value = tag_with_value
  921. model_meta.tag_value_delimitor = tag_value_delimitor
  922. model_meta.with_label = with_label
  923. if with_label:
  924. model_meta.label_name = label_name
  925. model_meta.label_type = label_type
  926. model_meta.output_format = output_format
  927. model_meta.with_match_id = with_match_id
  928. if header is not None:
  929. model_param.header.extend(header)
  930. if anonymous_header is not None:
  931. model_param.anonymous_header.extend(anonymous_header)
  932. if sid_name:
  933. model_param.sid_name = sid_name
  934. if label_name:
  935. model_param.label_name = label_name
  936. if exclusive_data_type is not None:
  937. model_meta.exclusive_data_type.update(exclusive_data_type)
  938. return model_meta, model_param
  939. def load_data_transform_model(model_name="DataTransform",
  940. model_meta=None,
  941. model_param=None):
  942. delimitor = model_meta.delimitor
  943. data_type = model_meta.data_type
  944. tag_with_value = model_meta.tag_with_value
  945. tag_value_delimitor = model_meta.tag_value_delimitor
  946. with_label = model_meta.with_label
  947. label_name = model_meta.label_name if with_label else None
  948. label_type = model_meta.label_type if with_label else None
  949. try:
  950. with_match_id = model_meta.with_match_id
  951. except AttributeError:
  952. with_match_id = False
  953. output_format = model_meta.output_format
  954. header = list(model_param.header) or None
  955. try:
  956. anonymous_header = list(model_param.anonymous_header)
  957. except AttributeError:
  958. anonymous_header = None
  959. sid_name = None
  960. if model_param.sid_name:
  961. sid_name = model_param.sid_name
  962. exclusive_data_type = None
  963. if model_meta.exclusive_data_type:
  964. exclusive_data_type = {}
  965. for col_name in model_meta.exclusive_data_type:
  966. exclusive_data_type[col_name] = model_meta.exclusive_data_type.get(col_name)
  967. return delimitor, data_type, exclusive_data_type, tag_with_value, tag_value_delimitor, with_label, \
  968. label_type, output_format, header, sid_name, label_name, with_match_id, anonymous_header
  969. def save_missing_imputer_model(missing_fill=False,
  970. missing_replace_method=None,
  971. missing_impute=None,
  972. missing_fill_value=None,
  973. missing_replace_rate=None,
  974. header=None,
  975. model_name="Imputer"):
  976. model_meta = DataTransformImputerMeta()
  977. model_param = DataTransformImputerParam()
  978. model_meta.is_imputer = missing_fill
  979. if missing_fill:
  980. if missing_replace_method:
  981. model_meta.strategy = str(missing_replace_method)
  982. if missing_impute is not None:
  983. model_meta.missing_value.extend(map(str, missing_impute))
  984. if missing_fill_value is not None:
  985. feature_value_dict = dict(zip(header, map(str, missing_fill_value)))
  986. model_param.missing_replace_value.update(feature_value_dict)
  987. if missing_replace_rate is not None:
  988. missing_replace_rate_dict = dict(zip(header, missing_replace_rate))
  989. model_param.missing_value_ratio.update(missing_replace_rate_dict)
  990. return model_meta, model_param
  991. def load_missing_imputer_model(header=None,
  992. model_name="Imputer",
  993. model_meta=None,
  994. model_param=None):
  995. missing_fill = model_meta.is_imputer
  996. missing_replace_method = model_meta.strategy
  997. missing_value = model_meta.missing_value
  998. missing_fill_value = model_param.missing_replace_value
  999. if missing_fill:
  1000. if not missing_replace_method:
  1001. missing_replace_method = None
  1002. if not missing_value:
  1003. missing_value = None
  1004. else:
  1005. missing_value = list(missing_value)
  1006. if missing_fill_value:
  1007. missing_fill_value = [missing_fill_value.get(head) for head in header]
  1008. else:
  1009. missing_fill_value = None
  1010. else:
  1011. missing_replace_method = None
  1012. missing_value = None
  1013. missing_fill_value = None
  1014. return missing_fill, missing_replace_method, missing_value, missing_fill_value
  1015. def save_outlier_model(outlier_replace=False,
  1016. outlier_replace_method=None,
  1017. outlier_impute=None,
  1018. outlier_replace_value=None,
  1019. outlier_replace_rate=None,
  1020. header=None,
  1021. model_name="Outlier"):
  1022. model_meta = DataTransformOutlierMeta()
  1023. model_param = DataTransformOutlierParam()
  1024. model_meta.is_outlier = outlier_replace
  1025. if outlier_replace:
  1026. if outlier_replace_method:
  1027. model_meta.strategy = str(outlier_replace_method)
  1028. if outlier_impute:
  1029. model_meta.outlier_value.extend(map(str, outlier_impute))
  1030. if outlier_replace_value:
  1031. outlier_value_dict = dict(zip(header, map(str, outlier_replace_value)))
  1032. model_param.outlier_replace_value.update(outlier_value_dict)
  1033. if outlier_replace_rate:
  1034. outlier_value_ratio_dict = dict(zip(header, outlier_replace_rate))
  1035. model_param.outlier_value_ratio.update(outlier_value_ratio_dict)
  1036. return model_meta, model_param
  1037. def load_outlier_model(header=None,
  1038. model_name="Outlier",
  1039. model_meta=None,
  1040. model_param=None):
  1041. outlier_replace = model_meta.is_outlier
  1042. outlier_replace_method = model_meta.strategy
  1043. outlier_value = model_meta.outlier_value
  1044. outlier_replace_value = model_param.outlier_replace_value
  1045. if outlier_replace:
  1046. if not outlier_replace_method:
  1047. outlier_replace_method = None
  1048. if not outlier_value:
  1049. outlier_value = None
  1050. else:
  1051. outlier_value = list(outlier_value)
  1052. if outlier_replace_value:
  1053. outlier_replace_value = [outlier_replace_value.get(head) for head in header]
  1054. else:
  1055. outlier_replace_value = None
  1056. else:
  1057. outlier_replace_method = None
  1058. outlier_value = None
  1059. outlier_replace_value = None
  1060. return outlier_replace, outlier_replace_method, outlier_value, outlier_replace_value