pipelined_component.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
#
#  Copyright 2022 The FATE Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the 'License');
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an 'AS IS' BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
  16. import hashlib
  17. import os
  18. from pathlib import Path
  19. from zipfile import ZipFile
  20. from ruamel import yaml
  21. from fate_arch.common.base_utils import json_dumps, json_loads
  22. from fate_flow.db.db_models import DB, PipelineComponentMeta
  23. from fate_flow.db.db_utils import bulk_insert_into_db
  24. from fate_flow.model import Locker, local_cache_required, lock
  25. from fate_flow.pipelined_model import Pipelined
  26. from fate_flow.settings import TEMP_DIRECTORY
  27. from fate_flow.utils.base_utils import get_fate_flow_directory
  28. class PipelinedComponent(Pipelined, Locker):
  29. def __init__(self, **kwargs):
  30. Pipelined.__init__(self, **kwargs)
  31. self.model_path = Path(get_fate_flow_directory('model_local_cache'), self.party_model_id, self.model_version)
  32. self.define_meta_path = self.model_path / 'define' / 'define_meta.yaml'
  33. self.variables_data_path = self.model_path / 'variables' / 'data'
  34. self.run_parameters_path = self.model_path / 'run_parameters'
  35. self.checkpoint_path = self.model_path / 'checkpoint'
  36. self.query_args = (
  37. PipelineComponentMeta.f_model_id == self.model_id,
  38. PipelineComponentMeta.f_model_version == self.model_version,
  39. PipelineComponentMeta.f_role == self.role,
  40. PipelineComponentMeta.f_party_id == self.party_id,
  41. )
  42. Locker.__init__(self, self.model_path)
  43. def exists(self, component_name=None, model_alias=None):
  44. if component_name is None:
  45. return self.model_path.is_dir() and set(os.listdir(self.model_path)) - {'.lock'}
  46. query = self.get_define_meta_from_db(PipelineComponentMeta.f_component_name == component_name)
  47. if query:
  48. query = query[0]
  49. if model_alias is None:
  50. model_alias = query.f_model_alias
  51. model_proto_index = query.f_model_proto_index
  52. else:
  53. query = self.get_define_meta_from_file()
  54. try:
  55. query = query['model_proto'][component_name]
  56. except KeyError:
  57. return False
  58. if model_alias is None:
  59. if len(query) != 1:
  60. return False
  61. model_alias = next(iter(query.keys()))
  62. try:
  63. model_proto_index = query[model_alias]
  64. except KeyError:
  65. return False
  66. if not model_proto_index:
  67. return False
  68. variables_data_path = self.variables_data_path / component_name / model_alias
  69. for model_name, buffer_name in model_proto_index.items():
  70. if not (variables_data_path / model_name).is_file():
  71. return False
  72. return True
  73. def get_define_meta_from_file(self):
  74. if not self.define_meta_path.is_file():
  75. return {}
  76. return yaml.safe_load(self.define_meta_path.read_text('utf-8'))
  77. @DB.connection_context()
  78. def get_define_meta_from_db(self, *query_args):
  79. return tuple(PipelineComponentMeta.select().where(*self.query_args, *query_args))
  80. def rearrange_define_meta(self, data):
  81. define_meta = {
  82. 'component_define': {},
  83. 'model_proto': {},
  84. }
  85. for row in data:
  86. define_meta['component_define'][row.f_component_name] = {'module_name': row.f_component_module_name}
  87. # there is only one model_alias in a component
  88. if row.f_component_name not in define_meta['model_proto']:
  89. define_meta['model_proto'][row.f_component_name] = {}
  90. define_meta['model_proto'][row.f_component_name][row.f_model_alias] = row.f_model_proto_index
  91. return define_meta
  92. def get_define_meta(self):
  93. query = self.get_define_meta_from_db()
  94. return self.rearrange_define_meta(query) if query else self.get_define_meta_from_file()
  95. @DB.connection_context()
  96. def save_define_meta(self, component_name, component_module_name, model_alias, model_proto_index, run_parameters):
  97. PipelineComponentMeta.insert(
  98. f_model_id=self.model_id,
  99. f_model_version=self.model_version,
  100. f_role=self.role,
  101. f_party_id=self.party_id,
  102. f_component_name=component_name,
  103. f_component_module_name=component_module_name,
  104. f_model_alias=model_alias,
  105. f_model_proto_index=model_proto_index,
  106. f_run_parameters=run_parameters,
  107. ).on_conflict(preserve=(
  108. PipelineComponentMeta.f_update_time,
  109. PipelineComponentMeta.f_update_date,
  110. PipelineComponentMeta.f_component_module_name,
  111. PipelineComponentMeta.f_model_alias,
  112. PipelineComponentMeta.f_model_proto_index,
  113. PipelineComponentMeta.f_run_parameters,
  114. )).execute()
  115. @lock
  116. def save_define_meta_from_db_to_file(self):
  117. query = self.get_define_meta_from_db()
  118. for row in query:
  119. run_parameters_path = self.get_run_parameters_path(row.f_component_name)
  120. run_parameters_path.parent.mkdir(parents=True, exist_ok=True)
  121. with run_parameters_path.open('w', encoding='utf-8') as f:
  122. f.write(json_dumps(row.f_run_parameters))
  123. self.define_meta_path.parent.mkdir(parents=True, exist_ok=True)
  124. with self.define_meta_path.open('w', encoding='utf-8') as f:
  125. yaml.dump(self.rearrange_define_meta(query), f, Dumper=yaml.RoundTripDumper)
  126. # import model
  127. @local_cache_required(True)
  128. def save_define_meta_from_file_to_db(self, replace_on_conflict=False):
  129. if not replace_on_conflict:
  130. with DB.connection_context():
  131. count = PipelineComponentMeta.select().where(*self.query_args).count()
  132. if count > 0:
  133. raise ValueError(f'The define_meta data already exists in database.')
  134. define_meta = self.get_define_meta_from_file()
  135. run_parameters = self.get_run_parameters_from_files()
  136. insert = []
  137. for component_name, component_define in define_meta['component_define'].items():
  138. for model_alias, model_proto_index in define_meta['model_proto'][component_name].items():
  139. row = {
  140. 'f_model_id': self.model_id,
  141. 'f_model_version': self.model_version,
  142. 'f_role': self.role,
  143. 'f_party_id': self.party_id,
  144. 'f_component_name': component_name,
  145. 'f_component_module_name': component_define['module_name'],
  146. 'f_model_alias': model_alias,
  147. 'f_model_proto_index': model_proto_index,
  148. 'f_run_parameters': run_parameters.get(component_name, {}),
  149. }
  150. insert.append(row)
  151. bulk_insert_into_db(PipelineComponentMeta, insert, replace_on_conflict)
  152. def replicate_define_meta(self, modification, query_args=(), replace_on_conflict=False):
  153. query = self.get_define_meta_from_db(*query_args)
  154. if not query:
  155. return
  156. insert = []
  157. for row in query:
  158. row = row.to_dict()
  159. del row['id']
  160. row.update(modification)
  161. insert.append(row)
  162. bulk_insert_into_db(PipelineComponentMeta, insert, replace_on_conflict)
  163. def get_run_parameters_path(self, component_name):
  164. return self.run_parameters_path / component_name / 'run_parameters.json'
  165. @lock
  166. def get_run_parameters_from_files(self):
  167. if not self.run_parameters_path.is_dir():
  168. return {}
  169. return {
  170. path.name: json_loads(self.get_run_parameters_path(path.name).read_text('utf-8'))
  171. for path in self.run_parameters_path.iterdir()
  172. }
  173. def get_run_parameters(self):
  174. query = self.get_define_meta_from_db()
  175. return {
  176. row.f_component_name: row.f_run_parameters
  177. for row in query
  178. } if query else self.get_run_parameters_from_files()
  179. def get_archive_path(self, component_name):
  180. return Path(TEMP_DIRECTORY, f'{self.party_model_id}_{self.model_version}_{component_name}.zip')
  181. def walk_component(self, zip_file, path: Path):
  182. if path.is_dir():
  183. for subpath in path.iterdir():
  184. self.walk_component(zip_file, subpath)
  185. elif path.is_file():
  186. zip_file.write(path, path.relative_to(self.model_path))
  187. @local_cache_required(True)
  188. def pack_component(self, component_name):
  189. filename = self.get_archive_path(component_name)
  190. with ZipFile(filename, 'w') as zip_file:
  191. self.walk_component(zip_file, self.variables_data_path / component_name)
  192. self.walk_component(zip_file, self.checkpoint_path / component_name)
  193. hash_ = hashlib.sha256(filename.read_bytes()).hexdigest()
  194. return filename, hash_
  195. @lock
  196. def unpack_component(self, component_name, hash_=None):
  197. filename = self.get_archive_path(component_name)
  198. if hash_ is not None:
  199. sha256 = hashlib.sha256(filename.read_bytes()).hexdigest()
  200. if hash_ != sha256:
  201. raise ValueError(f'Model archive hash mismatch. path: {filename} expected: {hash_} actual: {sha256}')
  202. with ZipFile(filename, 'r') as zip_file:
  203. zip_file.extractall(self.model_path)