get_hashes.py

  1. """
  2. These codes are adopted from LEAF with some modifications.
  3. """
  4. import hashlib
  5. import logging
  6. import os
  7. from easyfl.datasets.utils import util
  8. logger = logging.getLogger(__name__)
  9. def get_hash(base_folder):
  10. cfd = os.path.join(base_folder, "intermediate", "class_file_dirs")
  11. wfd = os.path.join(base_folder, "intermediate", "write_file_dirs")
  12. class_file_dirs = util.load_obj(cfd)
  13. write_file_dirs = util.load_obj(wfd)
  14. class_file_hashes = []
  15. write_file_hashes = []
  16. count = 0
  17. for tup in class_file_dirs:
  18. if (count % 100000 == 0):
  19. logger.info("hashed %d class images" % count)
  20. (cclass, cfile) = tup
  21. file_path = os.path.join(base_folder, cfile)
  22. chash = hashlib.md5(open(file_path, "rb").read()).hexdigest()
  23. class_file_hashes.append((cclass, cfile, chash))
  24. count += 1
  25. cfhd = os.path.join(base_folder, "intermediate", "class_file_hashes")
  26. util.save_obj(class_file_hashes, cfhd)
  27. count = 0
  28. for tup in write_file_dirs:
  29. if (count % 100000 == 0):
  30. logger.info("hashed %d write images" % count)
  31. (cclass, cfile) = tup
  32. file_path = os.path.join(base_folder, cfile)
  33. chash = hashlib.md5(open(file_path, "rb").read()).hexdigest()
  34. write_file_hashes.append((cclass, cfile, chash))
  35. count += 1
  36. wfhd = os.path.join(base_folder, "intermediate", "write_file_hashes")
  37. util.save_obj(write_file_hashes, wfhd)
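
# Usage sketch (not part of the original file): after the LEAF-style preprocessing has
# produced the serialized file lists under "<base_folder>/intermediate", the hashes can
# be generated with a call such as the following; the data path is an assumed example.
#
#     get_hash("./data/femnist")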