Theano-MPI
Cifar10, with BSP vs EASGD
launch_session.py
```python
# launch_session.py
from theanompi import EASGD
# from theanompi import BSP

rule = EASGD()
# rule = BSP()

# modelfile: the Python module path of the model file
# modelclass: the class name of the model to be imported from that file
rule.init(devices=['cuda0', 'cuda1', 'cuda2'],
          modelfile='theanompi.models.cifar10',
          modelclass='Cifar10_model')
rule.wait()
```
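For context, here is a minimal sketch of how the modelfile/modelclass strings can be resolved into a model instance on each worker. This is an illustration of the idea only, not Theano-MPI's actual loader; the config keys shown are just the ones used by the model constructors quoted below.

```python
# Illustrative sketch only -- not Theano-MPI's actual loader.
import importlib

def load_model(modelfile, modelclass, config):
    # e.g. modelfile='theanompi.models.cifar10', modelclass='Cifar10_model'
    module = importlib.import_module(modelfile)
    model_cls = getattr(module, modelclass)
    # config is assumed to carry at least 'verbose', 'rank' and 'size',
    # matching the constructors quoted below.
    return model_cls(config)
```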
The launcher above is based on the following model class:
```python
class Cifar10_model(object):  # c01b input

    def __init__(self, config):

        self.verbose = config['verbose']
        self.rank = config['rank']  # will be used in sharding and distinguish rng
        self.size = config['size']

        import theano

        self.name = 'Cifar10_model'

        # data
        from theanompi.models.data import Cifar10_data
        self.data = Cifar10_data(verbose=False)
        self.channels = self.data.channels  # 'c' mean(R,G,B) = (103.939, 116.779, 123.68)
        self.input_width = input_width      # '0' single scale training 224
        self.input_height = input_height    # '1' single scale training 224
        # if self.size>1: # only use avg
        #     self.batch_size = batch_size/self.size
        # else:
        self.batch_size = batch_size  # 'b'
        self.file_batch_size = file_batch_size
        self.n_softmax_out = self.data.n_class

        # mini batching and other data parallel common routine
        self.data.batch_data(file_batch_size)
```
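The constructor above references input_width, input_height, batch_size and file_batch_size, which are defined at module level in theanompi.models.cifar10. The values below are assumptions for illustration only (CIFAR-10 images are 32x32); the real values live in that module.

```python
# Assumed module-level hyperparameters referenced by Cifar10_model.__init__;
# the actual values are defined in theanompi.models.cifar10 and may differ.
input_width = 32        # CIFAR-10 images are 32x32 pixels
input_height = 32
batch_size = 128        # per-worker minibatch size (assumption)
file_batch_size = 128   # samples per stored file batch (assumption)
```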
The BSP counterpart, which trains the Keras-based Wide ResNet, uses the same launcher pattern:

```python
from theanompi import BSP
# rule = EASGD()
rule = BSP()

# modelfile: the Python module path of the model file
# modelclass: the class name of the model to be imported from that file
rule.init(devices=['cuda0', 'cuda1', 'cuda2'],
          modelfile='models.keras_model_zoo.wresnet',
          modelclass='Wide_ResNet')
rule.wait()
```
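For reference, here is a schematic NumPy sketch of the conceptual difference between the two rules; it is not Theano-MPI's implementation. Under BSP every worker applies the same averaged update each iteration, whereas under EASGD each worker is elastically pulled toward a central copy of the parameters, which in turn moves toward the workers (following Zhang et al., 2015).

```python
import numpy as np

def bsp_step(params, grads, lr=0.01):
    # BSP: all workers apply the same averaged update, so their copies stay in sync.
    mean_grad = np.mean(grads, axis=0)
    return [p - lr * mean_grad for p in params]

def easgd_step(params, grads, center, lr=0.01, alpha=0.1):
    # EASGD: each worker takes its own gradient step plus an elastic pull
    # toward the center; the center drifts toward the average of the workers.
    new_params = [p - lr * g - alpha * (p - center)
                  for p, g in zip(params, grads)]
    new_center = center + alpha * sum(p - center for p in params)
    return new_params, new_center
```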
The Wide_ResNet model used by this launcher is defined as:
```python
class Wide_ResNet(object):
    '''
    Modified from:
    https://gist.github.com/kashif/0ba0270279a0f38280423754cea2ee1e
    '''

    def __init__(self, config):

        self.verbose = config['verbose']
        self.size = config['size']
        self.rank = config['rank']

        self.name = 'Wide_ResNet'

        # data
        from theanompi.models.keras_model_zoo.data.cifar10 import Cifar10_data  # dalton code
        self.data = Cifar10_data(verbose=False)

        self.build_model()

        # iter related
        self.current_t = 0
        self.last_one_t = False
        self.current_v = 0
        self.last_one_v = False

        self.n_subb = 1
        self.n_epochs = nb_epoch
        self.epoch = 0

    ...

    def compile_iter_fns(self, sync_type='avg'):

        self.data.batch_data(self.model, batch_size)
        self.data.extend_data(rank=self.rank, size=self.size)
        self.data.shuffle_data(mode='train', common_seed=1234)
        self.data.shuffle_data(mode='val')
        self.data.shard_data(mode='train', rank=self.rank, size=self.size)  # to update data.n_batch_train
        self.data.shard_data(mode='val', rank=self.rank, size=self.size)    # to update data.n_batch_val
```
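For intuition, here is a minimal self-contained sketch of the rank-based sharding idea behind shard_data: after a common-seed shuffle, each worker keeps a disjoint 1/size slice of the data. This is an illustration over in-memory NumPy arrays, not the actual Cifar10_data code.

```python
import numpy as np

def shard(x, y, rank, size):
    # Drop the tail so every worker gets an equally sized shard, then take
    # every size-th sample starting at this worker's rank (illustrative only).
    n = (len(x) // size) * size
    return x[rank:n:size], y[rank:n:size]

# Example: 3 workers each keep a disjoint third of 10 samples.
x, y = np.arange(10), np.arange(10) * 10
print(shard(x, y, rank=1, size=3))  # (array([1, 4, 7]), array([10, 40, 70]))
```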
Could you please share your test results for both models based on this dataset:
```python
from theanompi.models.keras_model_zoo.data.cifar10 import Cifar10_data
self.data = Cifar10_data(verbose=False)
```
@dlunga
Ok. Let me run those examples and get back to you.
@dlunga
Here are the log files from training Cifar10_model (the toy model): cifar10-3worker-log.txt, cifar10-1worker-log.txt; and from Wide_ResNet: wresnet-3worker-log.txt.
The single-worker Wide_ResNet run has not finished yet.
@dlunga
Note that although both dataset classes are named Cifar10_data, they differ in the class methods used for feeding input data. This is because the Keras version relies on Keras internals that are not meant to be used by users, so it involves a bit of hacking around Keras.
On the other hand, the Cifar10_data used by Cifar10_model is implemented in the normal Theano-MPI way.
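To make the difference concrete, here is an illustrative side-by-side of the two feeding interfaces, pieced together from the snippets above; the default argument values are placeholders, not the ones Theano-MPI actually uses.

```python
def feed_theanompi_way(file_batch_size=128):
    # Theano-MPI's own Cifar10_data, used by Cifar10_model:
    # batching only needs the file batch size.
    from theanompi.models.data import Cifar10_data
    data = Cifar10_data(verbose=False)
    data.batch_data(file_batch_size)
    return data

def feed_keras_zoo_way(model, batch_size=128, rank=0, size=1):
    # Keras-zoo Cifar10_data, used by Wide_ResNet: it needs the compiled Keras
    # model so it can hook into Keras internals when feeding batches, and the
    # sharding steps are driven explicitly as in compile_iter_fns above.
    from theanompi.models.keras_model_zoo.data.cifar10 import Cifar10_data
    data = Cifar10_data(verbose=False)
    data.batch_data(model, batch_size)
    data.extend_data(rank=rank, size=size)
    data.shard_data(mode='train', rank=rank, size=size)
    data.shard_data(mode='val', rank=rank, size=size)
    return data
```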
Thanks so much. As indicated on the other channel, I will test on a different cluster to see if the issues go away, and will update once done.