12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- """
- These codes are adopted from LEAF with some modifications.
- It converts a list of (writer, [list of (file,class)]) tuples into a json object of the form:
- {users: [bob, etc], num_samples: [124, etc.],
- user_data: {bob : {x:[img1,img2,etc], y:[class1,class2,etc]}, etc}},
- where "img_" is a vectorized representation of the corresponding image.
- """
- from __future__ import division
- import json
- import math
- import os
- import numpy as np
- from PIL import Image
- from easyfl.datasets.utils import util
- MAX_WRITERS = 100 # max number of writers per json file.
- def relabel_class(c):
- """
- maps hexadecimal class value (string) to a decimal number
- returns:
- - 0 through 9 for classes representing respective numbers
- - 10 through 35 for classes representing respective uppercase letters
- - 36 through 61 for classes representing respective lowercase letters
- """
- if c.isdigit() and int(c) < 40:
- return int(c) - 30
- elif int(c, 16) <= 90: # uppercase
- return int(c, 16) - 55
- else:
- return int(c, 16) - 61
- def data_to_json(base_folder):
- by_writer_dir = os.path.join(base_folder, "intermediate", "images_by_writer")
- writers = util.load_obj(by_writer_dir)
- num_json = int(math.ceil(len(writers) / MAX_WRITERS))
- users = []
- num_samples = []
- user_data = {}
- writer_count = 0
- json_index = 0
- for (w, l) in writers:
- users.append(w)
- num_samples.append(len(l))
- user_data[w] = {"x": [], "y": []}
- size = 28, 28 # original image size is 128, 128
- for (f, c) in l:
- file_path = os.path.join(base_folder, f)
- img = Image.open(file_path)
- gray = img.convert("L")
- gray.thumbnail(size, Image.ANTIALIAS)
- arr = np.asarray(gray).copy()
- vec = arr.flatten()
- vec = vec / 255 # scale all pixel values to between 0 and 1
- vec = vec.tolist()
- nc = relabel_class(c)
- user_data[w]["x"].append(vec)
- user_data[w]["y"].append(nc)
- writer_count += 1
- if writer_count == MAX_WRITERS:
- all_data = {}
- all_data["users"] = users
- all_data["num_samples"] = num_samples
- all_data["user_data"] = user_data
- file_name = "all_data_%d.json" % json_index
- file_path = os.path.join(base_folder, "all_data", file_name)
- print("writing %s" % file_name)
- with open(file_path, "w") as outfile:
- json.dump(all_data, outfile)
- writer_count = 0
- json_index += 1
- users[:] = []
- num_samples[:] = []
- user_data.clear()
|