nn
nn copied to clipboard
preproc_char build_data is too slow due to repeated index lookups
In dsets/preproc/preproc_char.py, the build_data function is extremely slow because the character index is looked up multiple times for every character.
# Pad each line with (CONTEXT - 1) null tokens plus a start symbol on the
# left and an end symbol on the right, so every real character has a full
# left context of CONTEXT tokens.
chars = ['<null>'] * (CONTEXT - 1) + ['<s>'] + list(line) + ['</s>']
# Begin right after start symbol, stop at end symbol inclusive
for k in range(CONTEXT, len(chars)):
# Hot spot: every iteration re-looks-up a full window of CONTEXT + 1
# characters in char_inds, so each character of the line is converted
# CONTEXT + 1 times instead of once.
# NOTE(review): the loop body continues beyond this excerpt — c_inds is
# presumably stored into the train/dev/test matrices below.
c_inds = [char_inds[c] for c in chars[k-CONTEXT:k+1]]
It can be sped up by eliminating these redundant computations, for example:
chars_prefix = ['<null>'] * (context_size - 1) + ['<s>']
chars_prefix = [char_inds[c] for c in chars_prefix]
chars_suffix = [char_inds['</s>']]
chars_prefix_array = np.array(chars_prefix)
chars_suffix_array = np.array(chars_suffix)
for f in data_files:
path = pjoin(parent_path, f)
print 'Processing %s' % path
with open(path, 'r') as fin:
for line in fin:
## Only for brown corpus
# line = preproc_line(line)
if not line:
continue
chars = list(line)
chars = [char_inds[c] for c in chars]
# Begin right after start symbol, stop at end symbol inclusive
c_inds_all = np.concatenate((chars_prefix_array, np.array(chars),
chars_suffix_array))
# c_inds_all = np.array(chars_prefix + chars + chars_suffix)
for k in range(context_size, c_inds_all.shape[0]):
c_inds = c_inds_all[k - context_size : k + 1]
if data_ind < num_train:
train_data[:, data_ind] = c_inds
elif data_ind < num_train + num_dev:
dev_data[:, data_ind - num_train] = c_inds
else:
test_data[:, data_ind - num_train - num_dev] = c_inds
data_ind += 1
The module log_util cannot be found — does this module exist, or where can it be obtained?
Another error I encountered is as follows:
Traceback (most recent call last):
File "train.py", line 89, in
What is cfg.json? What should I do to obtain or generate cfg.json?