# Hard-set these two params and run separate studies on each.
MINIMUM_LEVELS = 2
MAXIMUM_LEVELS = 2
NAMESPACE = "kubeflow"
# Katib experiment names become Kubernetes resource names, so they must be
# valid lowercase RFC 1123 labels (lowercase alphanumerics and '-').
JOB_NAME = "nlp-train-task-0001"
# [1] Define the objective function that Katib will optimize.
def objective(
        parameters,
        # Hard-set these two params and run separate studies on each.
        minimum_levels=MINIMUM_LEVELS,
        maximum_levels=MAXIMUM_LEVELS):
    # Imports live inside the objective so Katib can serialize the function
    # into a self-contained trial container.
    import tensorflow as tf
    import tensorflow_text  # keras_nlp's tokenizers need the tensorflow_text ops
    from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor
    from keras_nlp.layers import PositionEmbedding
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, Flatten
    import pandas as pd
    import numpy as np
    from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search \
        import SimpleCerebrosRandomSearch
    import pendulum
    from cerebros.units.units import DenseUnit
    from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component \
        import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
    from ast import literal_eval
    import time
    # Unpack the hyperparameters Katib suggests for this trial.
    embedding_dim = parameters["embedding_dim"]
    activation = parameters["activation"]
    predecessor_level_connection_affinity_factor_first = \
        parameters["predecessor_level_connection_affinity_factor_first"]
    predecessor_level_connection_affinity_factor_main = \
        parameters["predecessor_level_connection_affinity_factor_main"]
    max_consecutive_lateral_connections = \
        parameters["max_consecutive_lateral_connections"]
    p_lateral_connection = parameters["p_lateral_connection"]
    num_lateral_connection_tries_per_unit = \
        parameters["num_lateral_connection_tries_per_unit"]
    learning_rate = parameters["learning_rate"]
    epochs = parameters["epochs"]
    batch_size = parameters["batch_size"]
    # Dropout is tuned as its own parameter so the beachhead dropout layer
    # below can be set independently of any dropout inside Cerebros blocks.
    dropout = parameters["dropout"]
    maximum_units_per_level = parameters["maximum_units_per_level"]
    maximum_neurons_per_unit = parameters["maximum_neurons_per_unit"]
    # Load the phishing email dataset; keep only rows whose text is a string.
    df = pd.read_csv("Phishing_Email.csv")
    df = df[df['Email Text'].apply(lambda x: isinstance(x, str))]
    df.reset_index(drop=True, inplace=True)
    label_mapping = {"Safe Email": 0, "Phishing Email": 1}
    df["Binary Label"] = df["Email Type"].map(label_mapping)
    X = df["Email Text"].to_numpy()
    y = df["Binary Label"].to_numpy()
    # Shuffle once up front, then split without reshuffling. Note that
    # test_size=0.85 holds out 85% of the rows, leaving only 15% for
    # training (presumably to keep each Katib trial cheap).
    X, y = shuffle(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.85, shuffle=False)
    baseline_train_x = tf.constant(X_train)
    baseline_train_y = tf.constant(y_train, dtype=tf.int8)
    training_x = [baseline_train_x]
    train_labels = [baseline_train_y]
    INPUT_SHAPES = [()]
    OUTPUT_SHAPES = [1]
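    # Cerebros supports multiple input branches, which is why the training
    # data, labels, and shapes above are wrapped in parallel lists. Here there
    # is a single branch: one scalar string tensor per sample (shape ())
    # mapped to a single binary output.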
    # -------------------------------- GPT2 Model Definition --------------------------------
    # Wraps GPT-2 tokenization in a Keras layer so the model can accept raw
    # strings as input.
    class TokenizerLayer(tf.keras.layers.Layer):

        def __init__(self, max_seq_length, **kwargs):
            super(TokenizerLayer, self).__init__(**kwargs)
            self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
            self.preprocessor = GPT2Preprocessor(
                self.tokenizer, sequence_length=max_seq_length)
            self.max_seq_length = max_seq_length

        def call(self, inputs):
            prep = self.preprocessor([inputs])
            return prep['token_ids']

        # get_config / from_config make the layer serializable, so trained
        # models can be saved and reloaded.
        def get_config(self):
            config = super(TokenizerLayer, self).get_config()
            config.update({'max_seq_length': self.max_seq_length})
            return config

        @classmethod
        def from_config(cls, config):
            return cls(max_seq_length=config['max_seq_length'])
    max_seq_length = 1024
    inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
    gpt2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
    VOCABULARY_SIZE = gpt2_tokenizer.tokenizer.vocabulary_size()
    tokens = gpt2_tokenizer(inp)
    embedded = tf.keras.layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=embedding_dim,
        input_length=max_seq_length,
        mask_zero=True)(tokens)
    position_embedding = PositionEmbedding(
        sequence_length=max_seq_length,
        initializer="uniform")(embedded)
    x = tf.keras.layers.Concatenate()([embedded, position_embedding])
    x = tf.keras.layers.Dropout(dropout)(x)  # the beachhead dropout layer
    flattened = tf.keras.layers.Flatten()(x)
    cerebros_base_model = tf.keras.Model(inputs=inp, outputs=flattened)
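    # This flattened output is the fixed "beachhead" Cerebros grows from:
    # raw strings -> GPT-2 token IDs -> token embeddings concatenated with
    # position embeddings, regularized by the tunable dropout, then flattened
    # into one feature vector per email for the searched dense architecture.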
    # -------------------------------- Cerebros AutoML Search --------------------------------
    TIME = (str(pendulum.now(tz='America/New_York'))[:16]
            .replace('T', '_').replace(':', '_').replace('-', '_'))
    PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
    meta_trial_number = 42  # irrelevant unless running distributed training
    cerebros_automl = SimpleCerebrosRandomSearch(
        unit_type=DenseUnit,
        input_shapes=INPUT_SHAPES,
        output_shapes=OUTPUT_SHAPES,
        training_data=training_x,
        labels=train_labels,
        validation_split=0.35,
        direction='maximize',
        metric_to_rank_by="val_binary_accuracy",
        minimum_levels=minimum_levels,
        maximum_levels=maximum_levels,
        minimum_units_per_level=4,
        maximum_units_per_level=maximum_units_per_level,
        minimum_neurons_per_unit=1,
        maximum_neurons_per_unit=maximum_neurons_per_unit,
        activation=activation,
        final_activation='sigmoid',
        # ("moity" spellings follow the Cerebros API.)
        number_of_architecture_moities_to_try=5,
        number_of_tries_per_architecture_moity=1,
        minimum_skip_connection_depth=1,
        maximum_skip_connection_depth=7,
        predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
        predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
        predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
        seed=8675309,
        max_consecutive_lateral_connections=max_consecutive_lateral_connections,
        gate_after_n_lateral_connections=3,
        gate_activation_function=simple_sigmoid,
        p_lateral_connection=p_lateral_connection,
        p_lateral_connection_decay=zero_95_exp_decay,
        num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
        learning_rate=learning_rate,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(),
                 tf.keras.metrics.Precision(),
                 tf.keras.metrics.Recall()],
        epochs=epochs,
        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
        model_graphs='model_graphs',
        batch_size=batch_size,
        meta_trial_number=meta_trial_number,
        base_models=[cerebros_base_model],
        train_data_dtype=tf.string
    )
    # -------------------------------- Run Search and Report Metric --------------------------------
    cerebros_t0 = time.time()
    result = cerebros_automl.run_random_search()
    cerebros_t1 = time.time()
    cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
    # 5 = number_of_architecture_moities_to_try (5) * number_of_tries_per_architecture_moity (1)
    models_tried = 5
    cerebros_time_per_model = cerebros_time_all_models_min / models_tried
    print(f"val_binary_accuracy={result}")
import kubeflow.katib as katib
# [2] Create hyperparameter search space.
parameters = {
    "embedding_dim": katib.search.int(min=10, max=50, step=1),
    "activation": katib.search.categorical(["relu", "gelu", "elu"]),
    "predecessor_level_connection_affinity_factor_first": katib.search.double(min=0.1, max=50, step=0.1),
    "predecessor_level_connection_affinity_factor_main": katib.search.double(min=0.1, max=50, step=0.1),
    "max_consecutive_lateral_connections": katib.search.int(min=1, max=50, step=1),
    "p_lateral_connection": katib.search.double(min=0.1, max=50, step=0.1),
    "num_lateral_connection_tries_per_unit": katib.search.int(min=1, max=50, step=1),
    "learning_rate": katib.search.double(min=10 ** -5, max=0.3, step=10 ** -5),
    "epochs": katib.search.int(min=1, max=25, step=1),
    "batch_size": katib.search.int(min=1, max=35, step=1),
    "dropout": katib.search.double(min=0.05, max=0.95, step=0.05),
    "maximum_units_per_level": katib.search.int(min=5, max=10, step=1),
    "maximum_neurons_per_unit": katib.search.int(min=1, max=9, step=1)
}
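# Multivariate TPE (used below) models the joint distribution over these
# dimensions rather than treating each one independently, which should help
# here since several parameters (e.g., learning_rate, epochs, batch_size)
# plausibly interact.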
# [3] Create the Katib experiment: multivariate TPE, seeded with 7 random
# startup trials, scoring up to 24 candidates per suggestion, running 2
# trials concurrently and at most 25 trials in total. (Consider lowering
# max_trial_count to 15; 25 trials take roughly two days to run.)
katib_client = katib.KatibClient(namespace=NAMESPACE)
# Settings for the multivariate-tpe suggestion service, in the list form
# that KatibClient.tune() expects for algorithm_settings.
algorithm_settings = [
    {"name": "n_startup_trials", "value": "7"},
    {"name": "n_ei_candidates", "value": "24"},
    {"name": "random_state", "value": "42"}
]
katib_client.tune(
    name=JOB_NAME,
    objective=objective,
    parameters=parameters,
    objective_metric_name="val_binary_accuracy",
    algorithm_name="multivariate-tpe",
    # Pass only the settings list here; algorithm_name is its own argument.
    algorithm_settings=algorithm_settings,
    max_trial_count=25,
    parallel_trial_count=2,
    resources_per_trial={"cpu": "8", "memory": "24Gi"},
)
# [4] Wait until Katib Experiment is complete
katib_client.wait_for_experiment_condition(name=JOB_NAME)
# [5] Get the best hyperparameters.
print(katib_client.get_optimal_hyperparameters(JOB_NAME))
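# A minimal follow-up sketch, assuming the SDK returns a V1beta1OptimalTrial
# whose parameter_assignments carry name/value pairs with values serialized
# as strings. It rebuilds a plain dict of the winning hyperparameters so the
# objective can be re-run with them if desired.
from ast import literal_eval

best_trial = katib_client.get_optimal_hyperparameters(JOB_NAME)
best_parameters = {}
for assignment in best_trial.parameter_assignments:
    try:
        # Numeric values come back as strings, e.g. "12" or "0.05".
        best_parameters[assignment.name] = literal_eval(assignment.value)
    except (ValueError, SyntaxError):
        # Categorical values such as "gelu" stay as strings.
        best_parameters[assignment.name] = assignment.value
print(best_parameters)
# Optionally retrain locally with the winning configuration (each call
# trains 5 models):
# objective(best_parameters)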