spark-deep-learning
spark-deep-learning copied to clipboard
MLlib’s tooling for tuning the hyperparameters of a Keras model on tensors (no images)
Hi Team, Is there any plan to extend functionality so that we are able to use MLlib's tooling for tuning the hyperparameters of a Keras model on, e.g., numeric data (i.e. no images)? I believe I would need the equivalent of the KerasImageFileEstimator but for tensors. Is there an easy workaround in the meantime? Best, Ana
I was able to get a model being trained by creating this class which overrides the KerasImageFileEstimator class
I see the model being trained in the worker and all is fine. But for some reason the returned model has very bad accuracy, even though in the logs of the worker the validation accuracy was very good.
Give it a try — maybe you are lucky.
import numpy as np

from sparkdl.estimators.keras_image_file_estimator import KerasImageFileEstimator
from sparkdl.param import (
    keyword_only, CanLoadImage, HasKerasModel, HasKerasOptimizer, HasKerasLoss, HasOutputMode,
    HasInputCol, HasLabelCol, HasOutputCol)
from sparkdl.transformers.keras_tensor import KerasTransformer
import sparkdl.utils.keras_model as kmutil
class generalKerasEstimator(KerasImageFileEstimator):
    """Keras estimator for generic (non-image) tensor features.

    Overrides ``KerasImageFileEstimator`` so the input column is read as a
    plain numeric array instead of being decoded from an image URI, which
    lets MLlib's hyperparameter-tuning tooling drive a Keras model over
    tensor/numeric data.
    """

    def _getNumpyFeaturesAndLabels(self, dataset):
        """Collect features (and optional labels) from ``dataset`` on the driver.

        :param dataset: Spark DataFrame containing the configured input
                        column and, optionally, the label column.
        :return: tuple ``(X, y)`` where ``X`` is the stacked feature array
                 and ``y`` is the stacked label array, or ``None`` when no
                 label column is configured.
        :raises ValueError: if no features can be extracted, if labels are
                            expected but none can be loaded, or if the
                            feature and label counts disagree.
        """
        input_col = self.getInputCol()
        label_col = None
        if self.isDefined(self.labelCol) and self.getLabelCol() != "":
            label_col = self.getLabelCol()

        # Only select the label column when one is configured;
        # dataset.select(input_col, None) would raise inside Spark.
        cols = [input_col] if label_col is None else [input_col, label_col]
        rows = dataset.select(*cols).collect()

        # Extract features. Each input cell is assumed to be an array-like
        # of numbers -- TODO confirm against the caller's column schema.
        localFeatures = [np.array(row[input_col]) for row in rows]
        if not localFeatures:  # NOTE(phi-dbq): pep-8 recommended against testing 0 == len(array)
            raise ValueError("Cannot extract any feature from dataset!")
        X = np.stack(localFeatures, axis=0)

        # Extract labels. Labels are presumably already one-hot encoded;
        # the upstream VectorUDT assertion is intentionally relaxed here.
        y = None
        if label_col is not None:
            localLabels = []
            for row in rows:
                try:
                    _keras_label = row[label_col]
                except ValueError:
                    raise ValueError("Cannot extract encoded label array")
                localLabels.append(_keras_label)
            if not localLabels:
                raise ValueError("Failed to load any labels from dataset, but labels are required")
            y = np.stack(localLabels, axis=0)
            # Explicit raise instead of assert: asserts vanish under -O.
            if y.shape[0] != X.shape[0]:
                raise ValueError("number of features {} != number of labels {}".format(
                    X.shape[0], y.shape[0]))
        return X, y

    def _collectModels(self, kerasModelBytesRDD):
        """
        Collect Keras models on workers to MLlib Models on the driver.

        :param kerasModelBytesRDD: RDD of (index, param_map, model_bytes) tuples
        :return: generator of (index, MLlib model) tuples
        """
        for (i, param_map, model_bytes) in kerasModelBytesRDD.collect():
            # Dump the serialized weights to a temp .h5 file that the
            # transformer can load, then copy the tuned params onto it.
            model_filename = kmutil.bytes_to_h5file(model_bytes)
            yield i, self._copyValues(KerasTransformer(modelFile=model_filename),
                                      extra=param_map)