""" Helper functions for preprocessing shakespeare data. These codes are adopted from LEAF with some modifications. """ import json import os import re def __txt_to_data(txt_dir, seq_length=80): """Parses text file in given directory into data for next-character model. Args: txt_dir: path to text file seq_length: length of strings in X """ raw_text = "" with open(txt_dir, 'r') as inf: raw_text = inf.read() raw_text = raw_text.replace('\n', ' ') raw_text = re.sub(r" *", r' ', raw_text) dataX = [] dataY = [] for i in range(0, len(raw_text) - seq_length, 1): seq_in = raw_text[i:i + seq_length] seq_out = raw_text[i + seq_length] dataX.append(seq_in) dataY.append(seq_out) return dataX, dataY def parse_data_in(data_dir, users_and_plays_path, raw=False): """ returns dictionary with keys: users, num_samples, user_data raw := bool representing whether to include raw text in all_data if raw is True, then user_data key removes users with no data """ with open(users_and_plays_path, 'r') as inf: users_and_plays = json.load(inf) files = os.listdir(data_dir) users = [] hierarchies = [] num_samples = [] user_data = {} for f in files: user = f[:-4] passage = '' filename = os.path.join(data_dir, f) with open(filename, 'r') as inf: passage = inf.read() dataX, dataY = __txt_to_data(filename) if (len(dataX) > 0): users.append(user) if raw: user_data[user] = {'raw': passage} else: user_data[user] = {} user_data[user]['x'] = dataX user_data[user]['y'] = dataY hierarchies.append(users_and_plays[user]) num_samples.append(len(dataY)) all_data = {} all_data['users'] = users all_data['hierarchies'] = hierarchies all_data['num_samples'] = num_samples all_data['user_data'] = user_data return all_data