123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- """
- Helper functions for preprocessing shakespeare data.
- These codes are adopted from LEAF with some modifications.
- """
- import json
- import os
- import re
- def __txt_to_data(txt_dir, seq_length=80):
- """Parses text file in given directory into data for next-character model.
- Args:
- txt_dir: path to text file
- seq_length: length of strings in X
- """
- raw_text = ""
- with open(txt_dir, 'r') as inf:
- raw_text = inf.read()
- raw_text = raw_text.replace('\n', ' ')
- raw_text = re.sub(r" *", r' ', raw_text)
- dataX = []
- dataY = []
- for i in range(0, len(raw_text) - seq_length, 1):
- seq_in = raw_text[i:i + seq_length]
- seq_out = raw_text[i + seq_length]
- dataX.append(seq_in)
- dataY.append(seq_out)
- return dataX, dataY
- def parse_data_in(data_dir, users_and_plays_path, raw=False):
- """
- returns dictionary with keys: users, num_samples, user_data
- raw := bool representing whether to include raw text in all_data
- if raw is True, then user_data key
- removes users with no data
- """
- with open(users_and_plays_path, 'r') as inf:
- users_and_plays = json.load(inf)
- files = os.listdir(data_dir)
- users = []
- hierarchies = []
- num_samples = []
- user_data = {}
- for f in files:
- user = f[:-4]
- passage = ''
- filename = os.path.join(data_dir, f)
- with open(filename, 'r') as inf:
- passage = inf.read()
- dataX, dataY = __txt_to_data(filename)
- if (len(dataX) > 0):
- users.append(user)
- if raw:
- user_data[user] = {'raw': passage}
- else:
- user_data[user] = {}
- user_data[user]['x'] = dataX
- user_data[user]['y'] = dataY
- hierarchies.append(users_and_plays[user])
- num_samples.append(len(dataY))
- all_data = {}
- all_data['users'] = users
- all_data['hierarchies'] = hierarchies
- all_data['num_samples'] = num_samples
- all_data['user_data'] = user_data
- return all_data
|