Attention layer model fails with 'Could not find valid device for node.'
Hello, I am trying to reproduce the example from https://blogs.rstudio.com/tensorflow/posts/2018-07-30-attention-layer/.
The following is my code:
reticulate::use_condaenv("tf-gpu", required = TRUE)
library(keras)
use_implementation("tensorflow")
library(tensorflow)
tfe_enable_eager_execution()
library(tfdatasets)
library(purrr)
library(stringr)
library(reshape2)
library(viridis)
library(ggplot2)
library(tibble)
filepath <- file.path("data", "nld.txt")
lines <- readLines(filepath, n = 10000)
sentences <- str_split(lines, "\t")
str(sentences)
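# preprocessing helpers; compose() below applies them right to left:
# separate punctuation, drop special characters, squish whitespace, add <start>/<stop> markers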
space_before_punct <- function(sentence) {
  str_replace_all(sentence, "([?.!])", " \\1")
}
replace_special_chars <- function(sentence) {
  str_replace_all(sentence, "[^a-zA-Z?.!,¿]+", " ")
}
add_tokens <- function(sentence) {
  paste0("<start> ", sentence, " <stop>")
}
add_tokens <- Vectorize(add_tokens, USE.NAMES = FALSE)
preprocess_sentence <- compose(add_tokens,
                               str_squish,
                               replace_special_chars,
                               space_before_punct)
word_pairs <- map(sentences, preprocess_sentence)
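# build a data frame mapping each unique word to an integer index, with 0 reserved for <pad>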
create_index <- function(sentences) {
  unique_words <- sentences %>% unlist() %>% paste(collapse = " ") %>%
    str_split(pattern = " ") %>% .[[1]] %>% unique() %>% sort()
  index <- data.frame(
    word = unique_words,
    index = 1:length(unique_words),
    stringsAsFactors = FALSE
  ) %>%
    add_row(word = "<pad>",
            index = 0,
            .before = 1)
  index
}
word2index <- function(word, index_df) {
  index_df[index_df$word == word, "index"]
}
index2word <- function(index, index_df) {
  index_df[index_df$index == index, "word"]
}
src_index <- create_index(map(word_pairs, ~ .[[1]]))
target_index <- create_index(map(word_pairs, ~ .[[2]]))
sentence2digits <- function(sentence, index_df) {
  map((sentence %>% str_split(pattern = " "))[[1]], function(word)
    word2index(word, index_df))
}
sentlist2diglist <- function(sentence_list, index_df) {
  map(sentence_list, function(sentence)
    sentence2digits(sentence, index_df))
}
src_diglist <- sentlist2diglist(map(word_pairs, ~ .[[1]]), src_index)
src_maxlen <- map(src_diglist, length) %>% unlist() %>% max()
src_matrix <- pad_sequences(src_diglist, maxlen = src_maxlen, padding = "post")
target_diglist <- sentlist2diglist(map(word_pairs, ~ .[[2]]), target_index)
target_maxlen <- map(target_diglist, length) %>% unlist() %>% max()
target_matrix <- pad_sequences(target_diglist, maxlen = target_maxlen, padding = "post")
train_indices <-
sample(nrow(src_matrix), size = nrow(src_matrix) * 0.8)
validation_indices <- setdiff(1:nrow(src_matrix), train_indices)
x_train <- src_matrix[train_indices, ]
y_train <- target_matrix[train_indices, ]
str(x_train)
str(y_train)
x_valid <- src_matrix[validation_indices, ]
y_valid <- target_matrix[validation_indices, ]
str(x_valid)
str(y_valid)
buffer_size <- nrow(x_train)
# just for convenience, so we may get a glimpse at translation
# performance during training
train_sentences <- sentences[train_indices]
validation_sentences <- sentences[validation_indices]
validation_sample <- sample(validation_sentences, 5)
str(train_sentences)
batch_size <- 32
embedding_dim <- 64
gru_units <- 256
src_vocab_size <- nrow(src_index)
target_vocab_size <- nrow(target_index)
train_dataset <-
  tensor_slices_dataset(keras_array(list(x_train, y_train))) %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)
str(train_dataset)
validation_dataset <-
  tensor_slices_dataset(keras_array(list(x_valid, y_valid))) %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)
str(validation_dataset)
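# encoder: embedding followed by a GRU that returns the full output sequence and its final hidden state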
attention_encoder <-
  function(gru_units,
           embedding_dim,
           src_vocab_size,
           name = NULL) {
    keras_model_custom(name = name, function(self) {
      self$embedding <-
        layer_embedding(
          input_dim = src_vocab_size,
          output_dim = embedding_dim
        )
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      function(inputs, mask = NULL) {
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        x <- self$embedding(x)
        c(output, state) %<-% self$gru(x, initial_state = hidden)
        list(output, state)
      }
    })
  }
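# decoder with Bahdanau-style additive attention: scores over the encoder outputs
# are turned into a context vector that is concatenated with the embedded input token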
attention_decoder <-
  function(object,
           gru_units,
           embedding_dim,
           target_vocab_size,
           name = NULL) {
    keras_model_custom(name = name, function(self) {
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      self$embedding <-
        layer_embedding(input_dim = target_vocab_size,
                        output_dim = embedding_dim)
      gru_units <- gru_units
      self$fc <- layer_dense(units = target_vocab_size)
      self$W1 <- layer_dense(units = gru_units)
      self$W2 <- layer_dense(units = gru_units)
      self$V <- layer_dense(units = 1L)
      function(inputs, mask = NULL) {
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        encoder_output <- inputs[[3]]
        hidden_with_time_axis <- k_expand_dims(hidden, 2)
        score <- self$V(k_tanh(self$W1(encoder_output) +
                                 self$W2(hidden_with_time_axis)))
        attention_weights <- k_softmax(score, axis = 2)
        context_vector <- attention_weights * encoder_output
        context_vector <- k_sum(context_vector, axis = 2)
        x <- self$embedding(x)
        x <- k_concatenate(list(k_expand_dims(context_vector, 2), x), axis = 3)
        c(output, state) %<-% self$gru(x)
        output <- k_reshape(output, c(-1, gru_units))
        x <- self$fc(output)
        list(x, state, attention_weights)
      }
    })
  }
encoder <- attention_encoder(
gru_units = gru_units,
embedding_dim = embedding_dim,
src_vocab_size = src_vocab_size
)
decoder <- attention_decoder(
gru_units = gru_units,
embedding_dim = embedding_dim,
target_vocab_size = target_vocab_size
)
optimizer <- tf$compat$v1$train$AdamOptimizer()
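# masked loss: positions where the target is the <pad> index (0) do not contribute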
cx_loss <- function(y_true, y_pred) {
  mask <- ifelse(y_true == 0L, 0, 1)
  loss <-
    tf$nn$sparse_softmax_cross_entropy_with_logits(labels = y_true,
                                                   logits = y_pred) * mask
  tf$reduce_mean(loss)
}
n_epochs <- 50
encoder_init_hidden <- k_zeros(c(batch_size, gru_units))
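# custom training loop with teacher forcing: at each timestep the ground-truth
# target token is fed back as the next decoder input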
for (epoch in seq_len(n_epochs)) {
  total_loss <- 0
  iteration <- 0
  iter <- make_iterator_one_shot(train_dataset)
  until_out_of_range({
    batch <- iterator_get_next(iter)
    loss <- 0
    x <- batch[[1]]
    y <- batch[[2]]
    iteration <- iteration + 1
    with(tf$GradientTape() %as% tape, {
      c(enc_output, enc_hidden) %<-% encoder(list(x, encoder_init_hidden))
      dec_hidden <- enc_hidden
      dec_input <-
        k_expand_dims(rep(list(
          word2index("<start>", target_index)
        ), batch_size))
      for (t in seq_len(target_maxlen - 1)) {
        c(preds, dec_hidden, weights) %<-%
          decoder(list(dec_input, dec_hidden, enc_output))
        loss <- loss + cx_loss(y[, t], preds)
        dec_input <- k_expand_dims(y[, t])
      }
    })
    total_loss <-
      total_loss + loss / k_cast_to_floatx(dim(y)[2])
    print(paste0(
      "Batch loss (epoch/batch): ",
      epoch,
      "/",
      iteration,
      ": ",
      (loss / k_cast_to_floatx(dim(y)[2])) %>%
        as.double() %>% round(4),
      "\n"
    ))
    variables <- c(encoder$variables, decoder$variables)
    gradients <- tape$gradient(loss, variables)
    optimizer$apply_gradients(
      purrr::transpose(list(gradients, variables)),
      global_step = tf$train$get_or_create_global_step()
    )
  })
  print(paste0(
    "Total loss (epoch): ",
    epoch,
    ": ",
    (total_loss / k_cast_to_floatx(buffer_size)) %>%
      as.double() %>% round(4),
    "\n"
  ))
}
This code fails with the following error:
2020-02-12 12:48:30.175011: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
Error: NotFoundError: Could not find valid device for node.
Node:{{node SparseSoftmaxCrossEntropyWithLogits}}
All kernels registered for op SparseSoftmaxCrossEntropyWithLogits :
device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT32]
device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT64]
device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
[Op:SparseSoftmaxCrossEntropyWithLogits]
It is not clear to me what causes this failure. Do you get the same result?
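From the list of registered kernels it looks as if SparseSoftmaxCrossEntropyWithLogits only accepts integer labels (Tlabels in DT_INT32/DT_INT64), so I suspect the target tensor reaches cx_loss as a float. A minimal sketch of the cast I am considering; the tf$cast / k_cast calls are my own guess, not code from the blog post:
# hypothetical variant of cx_loss: cast labels to int64 so a matching kernel
# exists, and build the mask explicitly as a float tensor (my assumption)
cx_loss <- function(y_true, y_pred) {
  mask <- k_cast(k_not_equal(y_true, 0L), "float32")
  loss <- tf$nn$sparse_softmax_cross_entropy_with_logits(
    labels = tf$cast(y_true, tf$int64),
    logits = y_pred
  ) * mask
  tf$reduce_mean(loss)
}
I have not verified this yet, so I would appreciate confirmation whether the label dtype is really the problem or whether something else is going on.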