data_to_json.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. """
  2. This code is adapted from LEAF with some modifications.
  3. It converts a list of (writer, [list of (file, class)]) tuples into a JSON object of the form:
  4. {users: [bob, etc], num_samples: [124, etc.],
  5. user_data: {bob : {x:[img1,img2,etc], y:[class1,class2,etc]}, etc}},
  6. where "img_" is a vectorized representation of the corresponding image.
  7. """
  8. from __future__ import division
  9. import json
  10. import math
  11. import os
  12. import numpy as np
  13. from PIL import Image
  14. from easyfl.datasets.utils import util
# Batch size for the output: data_to_json flushes one "all_data_<index>.json"
# file each time this many writers have been accumulated.
MAX_WRITERS = 100 # max number of writers per json file.
  16. def relabel_class(c):
  17. """
  18. maps hexadecimal class value (string) to a decimal number
  19. returns:
  20. - 0 through 9 for classes representing respective numbers
  21. - 10 through 35 for classes representing respective uppercase letters
  22. - 36 through 61 for classes representing respective lowercase letters
  23. """
  24. if c.isdigit() and int(c) < 40:
  25. return int(c) - 30
  26. elif int(c, 16) <= 90: # uppercase
  27. return int(c, 16) - 55
  28. else:
  29. return int(c, 16) - 61
  30. def data_to_json(base_folder):
  31. by_writer_dir = os.path.join(base_folder, "intermediate", "images_by_writer")
  32. writers = util.load_obj(by_writer_dir)
  33. num_json = int(math.ceil(len(writers) / MAX_WRITERS))
  34. users = []
  35. num_samples = []
  36. user_data = {}
  37. writer_count = 0
  38. json_index = 0
  39. for (w, l) in writers:
  40. users.append(w)
  41. num_samples.append(len(l))
  42. user_data[w] = {"x": [], "y": []}
  43. size = 28, 28 # original image size is 128, 128
  44. for (f, c) in l:
  45. file_path = os.path.join(base_folder, f)
  46. img = Image.open(file_path)
  47. gray = img.convert("L")
  48. gray.thumbnail(size, Image.ANTIALIAS)
  49. arr = np.asarray(gray).copy()
  50. vec = arr.flatten()
  51. vec = vec / 255 # scale all pixel values to between 0 and 1
  52. vec = vec.tolist()
  53. nc = relabel_class(c)
  54. user_data[w]["x"].append(vec)
  55. user_data[w]["y"].append(nc)
  56. writer_count += 1
  57. if writer_count == MAX_WRITERS:
  58. all_data = {}
  59. all_data["users"] = users
  60. all_data["num_samples"] = num_samples
  61. all_data["user_data"] = user_data
  62. file_name = "all_data_%d.json" % json_index
  63. file_path = os.path.join(base_folder, "all_data", file_name)
  64. print("writing %s" % file_name)
  65. with open(file_path, "w") as outfile:
  66. json.dump(all_data, outfile)
  67. writer_count = 0
  68. json_index += 1
  69. users[:] = []
  70. num_samples[:] = []
  71. user_data.clear()