cerebros-core-algorithm-alpha

mv-tpe-nlp-study

david-thrower opened this issue 7 months ago

Complete this ...

The code thus far:

```python
# Hard set these 2 params and do separate studies on each 
MINIMUM_LEVELS = 2
MAXIMUM_LEVELS = 2
NAMESPACE = "kubeflow"
# Kubernetes object names must be lowercase RFC 1123 names
JOB_NAME = "nlp-train-task-0001"


def objective(
        parameters,
        # Hard set these 2 params and do separate studies on each
        minimum_levels=MINIMUM_LEVELS,
        maximum_levels=MAXIMUM_LEVELS):

    # Everything is imported inside the objective because Katib ships
    # this function to each trial container as self-contained code
    import tensorflow as tf
    import tensorflow_text
    from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor
    from keras_nlp.layers import PositionEmbedding
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    from tensorflow.keras.utils import to_categorical
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Input, Flatten
    import pandas as pd
    import numpy as np
    from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
            import SimpleCerebrosRandomSearch
    import pendulum
    from cerebros.units.units import DenseUnit
    from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
            import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
    from ast import literal_eval
    import time
    
    # Unpack the Katib-sampled hyperparameters
    embedding_dim = parameters["embedding_dim"]
    activation = parameters["activation"]
    predecessor_level_connection_affinity_factor_first =\
        parameters["predecessor_level_connection_affinity_factor_first"]
    predecessor_level_connection_affinity_factor_main =\
        parameters["predecessor_level_connection_affinity_factor_main"]
    max_consecutive_lateral_connections =\
        parameters["max_consecutive_lateral_connections"]
    p_lateral_connection = parameters["p_lateral_connection"]
    num_lateral_connection_tries_per_unit =\
        parameters["num_lateral_connection_tries_per_unit"]
    learning_rate = parameters["learning_rate"]
    epochs = parameters["epochs"]
    batch_size = parameters["batch_size"]
    # A second dropout parameter, so that dropout inside the Cerebros
    # blocks can later be tuned separately from the beachhead dropout
    # layer below
    dropout = parameters["dropout"]
    maximum_units_per_level = parameters["maximum_units_per_level"]
    maximum_neurons_per_unit = parameters["maximum_neurons_per_unit"]


    # Load the phishing email dataset; keep only rows whose email text
    # is a string, and map the labels to {0, 1}
    df = pd.read_csv("Phishing_Email.csv")
    df = df[df['Email Text'].apply(lambda x: isinstance(x, str))]
    df.reset_index(drop=True, inplace=True)
    label_mapping = {"Safe Email": 0, "Phishing Email": 1}
    df["Binary Label"] = df["Email Type"].map(label_mapping)
    X = df["Email Text"].to_numpy()
    y = df["Binary Label"].to_numpy()
    X, y = shuffle(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.85, shuffle=False)
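    # Note: test_size=0.85 leaves only 15% of the rows for training
    # (presumably to keep each trial cheap); shuffle=False is safe here
    # because X and y were already shuffled above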

    baseline_train_x = tf.constant(X_train)
    baseline_train_y = tf.constant(y_train, dtype=tf.int8)

    training_x = [baseline_train_x]
    train_labels = [baseline_train_y]

    # One input per base model: a scalar string per sample (hence the
    # empty tuple shape); one output unit for the binary label
    INPUT_SHAPES = [()]
    OUTPUT_SHAPES = [1]

    # -------------------------------- GPT2 Model Definition --------------------------------
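    # Wrapping the GPT2 tokenizer + preprocessor in a Keras layer lets raw
    # strings flow through the model graph; only the token IDs are passed
    # downstream to the embedding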
    class TokenizerLayer(tf.keras.layers.Layer):
        def __init__(self, max_seq_length, **kwargs):
            super(TokenizerLayer, self).__init__(**kwargs)
            self.tokenizer = GPT2Tokenizer.from_preset("gpt2_extra_large_en")
            self.preprocessor = GPT2Preprocessor(self.tokenizer, sequence_length=max_seq_length)
            self.max_seq_length = max_seq_length

        def call(self, inputs):
            prep = self.preprocessor([inputs])
            return prep['token_ids']

        def get_config(self):
            config = super(TokenizerLayer, self).get_config()
            config.update({'max_seq_length': self.max_seq_length})
            return config

        @classmethod
        def from_config(cls, config):
            return cls(max_seq_length=config['max_seq_length'])

    max_seq_length = 1024

    inp = tf.keras.layers.Input(shape=(), dtype=tf.string)
    gpt2_tokenizer = TokenizerLayer(max_seq_length=max_seq_length)
    VOCABULARY_SIZE = gpt2_tokenizer.tokenizer.vocabulary_size()
    tokens = gpt2_tokenizer(inp)

    embedded = tf.keras.layers.Embedding(
        input_dim=VOCABULARY_SIZE,
        output_dim=embedding_dim,
        input_length=max_seq_length,
        mask_zero=True)(tokens)
    position_embedding = PositionEmbedding(
        sequence_length=max_seq_length,
        initializer="uniform")(embedded)

    # Concatenate the token and position embeddings (doubling the channel
    # dimension) rather than summing them
    x = tf.keras.layers.Concatenate()([
        embedded,
        position_embedding])
    x = tf.keras.layers.Dropout(dropout)(x)
    flattened = tf.keras.layers.Flatten()(x)

    cerebros_base_model = tf.keras.Model(inputs=inp, outputs=flattened)
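    # The flattened embedding model above is the "beachhead" that Cerebros
    # extends with a randomly wired dense network during the search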

    # -------------------------------- Cerebros AutoML Search --------------------------------

    # Timestamp to make the project name unique per run
    TIME = pendulum.now(tz='America/New_York').__str__()[:16].replace('T', '_').replace(':', '_').replace('-', '_')
    PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'
    meta_trial_number = 42  # irrelevant unless in distributed training

    cerebros_automl = SimpleCerebrosRandomSearch(
        unit_type=DenseUnit,
        input_shapes=INPUT_SHAPES,
        output_shapes=OUTPUT_SHAPES,
        training_data=training_x,
        labels=train_labels,
        validation_split=0.35,
        direction='maximize',
        metric_to_rank_by="val_binary_accuracy",
        minimum_levels=minimum_levels,
        maximum_levels=maximum_levels,
        minimum_units_per_level=4,
        maximum_units_per_level=maximum_units_per_level,
        minimum_neurons_per_unit=1,
        maximum_neurons_per_unit=maximum_neurons_per_unit,
        activation=activation,
        final_activation='sigmoid',
        number_of_architecture_moities_to_try=5,
        number_of_tries_per_architecture_moity=1,
        minimum_skip_connection_depth=1,
        maximum_skip_connection_depth=7,
        predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
        predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
        predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
        predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
        seed=8675309,
        max_consecutive_lateral_connections=max_consecutive_lateral_connections,
        gate_after_n_lateral_connections=3,
        gate_activation_function=simple_sigmoid,
        p_lateral_connection=p_lateral_connection,
        p_lateral_connection_decay=zero_95_exp_decay,
        num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
        learning_rate=learning_rate,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy(),
                 tf.keras.metrics.Precision(),
                 tf.keras.metrics.Recall()],
        epochs=epochs,
        project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
        model_graphs='model_graphs',
        batch_size=batch_size,
        meta_trial_number=meta_trial_number,
        base_models=[cerebros_base_model],
        train_data_dtype=tf.string
    )

    # -------------------------------- Run Search and Report Metric --------------------------------
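    # run_random_search() trains every candidate architecture and returns
    # the best val_binary_accuracy found (the metric_to_rank_by above)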
    cerebros_t0 = time.time()
    result = cerebros_automl.run_random_search()
    cerebros_t1 = time.time()
    cerebros_time_all_models_min = (cerebros_t1 - cerebros_t0) / 60
    models_tried = 5  # number_of_architecture_moities_to_try (5) * number_of_tries_per_architecture_moity (1)
    cerebros_time_per_model = cerebros_time_all_models_min / models_tried

    # Katib's stdout metrics collector parses lines of the form
    # "<metric-name>=<value>"
    print(f"val_binary_accuracy={result}")


import kubeflow.katib as katib

# [2] Create hyperparameter search space.
parameters = {
    "embedding_dim": katib.search.int(min=10, max=50, step=1),
    "activation": katib.search.categorical(["relu","gelu", "elu"]),
    "predecessor_level_connection_affinity_factor_first": katib.search.double(min=0.1, max=50, step=0.1),
    "predecessor_level_connection_affinity_factor_main": katib.search.double(min=0.1, max=50, step=0.1),
    "max_consecutive_lateral_connections": katib.search.int(min=1, max=50, step=1),
    "p_lateral_connection":  katib.search.double(min=0.1, max=50, step=0.1),
    "num_lateral_connection_tries_per_unit": katib.search.int(min=1, max=50, step=1),
    "learning_rate": katib.search.double(min=10 ** -5, max=0.3, step=10 ** -5),
    "epochs": katib.search.int(min=1, max=25, step=1),
    "batch_size": katib.search.int(min=1, max=35, step=1),
    "dropout":  katib.search.double(min=0.05, max=0.95, step=0.05),
    "maximum_units_per_level": katib.search.int(min=5, max=10, step=1),
    "maximum_neurons_per_unit": katib.search.int(min=1, max=9, step=1)
}
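# Katib injects each sampled value into objective() through its
# `parameters` dict, so the keys above must match what the objective reads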



# [3] Create the Katib experiment: MV-TPE with 7 random startup trials,
# up to 24 candidates considered per suggestion, 2 concurrent trials,
# and up to 25 trials in total (consider lowering this to 15; 25 trials
# will take about 2 days to run).
katib_client = katib.KatibClient(namespace=NAMESPACE)


# Settings for the multivariate TPE suggestion service
algorithm_config = [
    {"name": "n_startup_trials", "value": "7"},   # random trials before TPE modeling starts
    {"name": "n_ei_candidates", "value": "24"},   # candidates scored per TPE suggestion
    {"name": "random_state", "value": "42"}
]



katib_client.tune(
    name=JOB_NAME,
    objective=objective,
    parameters=parameters,
    objective_metric_name="val_binary_accuracy",
    algorithm_name="multivariate-tpe",
    algorithm_settings=algorithm_config,
    max_trial_count=25,
    parallel_trial_count=2,
    resources_per_trial={"cpu": "8", "memory": "24Gi"},
)

# [4] Wait until the Katib experiment completes.
katib_client.wait_for_experiment_condition(name=JOB_NAME)

# [5] Print the best hyperparameters.
print(katib_client.get_optimal_hyperparameters(JOB_NAME))
```
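
Once the experiment completes, a natural next step is to pull the winning trial back and retrain with it. Below is a minimal sketch of that step, not part of the study script above: it assumes `objective`, `NAMESPACE`, and `JOB_NAME` from the script are in scope, and that `get_optimal_hyperparameters()` returns an optimal-trial object whose `parameter_assignments` hold each value as a string (so numeric fields need casting back).

```python
import kubeflow.katib as katib

# These two sets simply mirror the search space defined above; a
# categorical parameter such as "activation" passes through unchanged.
INT_PARAMS = {
    "embedding_dim", "max_consecutive_lateral_connections",
    "num_lateral_connection_tries_per_unit", "epochs", "batch_size",
    "maximum_units_per_level", "maximum_neurons_per_unit",
}
FLOAT_PARAMS = {
    "predecessor_level_connection_affinity_factor_first",
    "predecessor_level_connection_affinity_factor_main",
    "p_lateral_connection", "learning_rate", "dropout",
}

client = katib.KatibClient(namespace=NAMESPACE)
optimal = client.get_optimal_hyperparameters(JOB_NAME)

# Katib reports every parameter value as a string, so cast the numeric
# ones back before handing them to objective()
best_params = {}
for assignment in optimal.parameter_assignments:
    if assignment.name in INT_PARAMS:
        best_params[assignment.name] = int(assignment.value)
    elif assignment.name in FLOAT_PARAMS:
        best_params[assignment.name] = float(assignment.value)
    else:  # categorical, e.g. "activation"
        best_params[assignment.name] = assignment.value

# Train a single model with the winning configuration
objective(best_params)
```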

