
Attention layer model fails with 'Could not find valid device for node.'

mg64ve opened this issue on Feb 12, 2020 · 0 comments

Hello, I am trying to run the code from https://blogs.rstudio.com/tensorflow/posts/2018-07-30-attention-layer/ and to reproduce that example.

The following is my code:


reticulate::use_condaenv("tf-gpu", required = TRUE)



library(keras)
use_implementation("tensorflow")

library(tensorflow)
tfe_enable_eager_execution()

library(tfdatasets)

library(purrr)
library(stringr)
library(reshape2)
library(viridis)
library(ggplot2)
library(tibble)

filepath <- file.path("data", "nld.txt")

lines <- readLines(filepath, n = 10000)
sentences <- str_split(lines, "\t")
str(sentences)

space_before_punct <- function(sentence) {
  str_replace_all(sentence, "([?.!])", " \\1")
}

replace_special_chars <- function(sentence) {
  str_replace_all(sentence, "[^a-zA-Z?.!,¿]+", " ")
}

add_tokens <- function(sentence) {
  paste0("<start> ", sentence, " <stop>")
}

add_tokens <- Vectorize(add_tokens, USE.NAMES = FALSE)

preprocess_sentence <- compose(add_tokens,
                               str_squish,
                               replace_special_chars,
                               space_before_punct)
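# e.g., assuming an input line like "Hoe gaat het?", preprocess_sentence()
# would yield "<start> Hoe gaat het ? <stop>"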

word_pairs <- map(sentences, preprocess_sentence)

create_index <- function(sentences) {
  unique_words <- sentences %>% unlist() %>% paste(collapse = " ") %>%
    str_split(pattern = " ") %>% .[[1]] %>% unique() %>% sort()
  index <- data.frame(
    word = unique_words,
    index = 1:length(unique_words),
    stringsAsFactors = FALSE
  ) %>%
    add_row(word = "<pad>",
            index = 0,
            .before = 1)
  index
}

word2index <- function(word, index_df) {
  index_df[index_df$word == word, "index"]
}
index2word <- function(index, index_df) {
  index_df[index_df$index == index, "word"]
}

src_index <- create_index(map(word_pairs, ~ .[[1]]))
target_index <- create_index(map(word_pairs, ~ .[[2]]))

sentence2digits <- function(sentence, index_df) {
  map((sentence %>% str_split(pattern = " "))[[1]], function(word)
    word2index(word, index_df))
}

sentlist2diglist <- function(sentence_list, index_df) {
  map(sentence_list, function(sentence)
    sentence2digits(sentence, index_df))
}

src_diglist <- sentlist2diglist(map(word_pairs, ~ .[[1]]), src_index)
src_maxlen <- map(src_diglist, length) %>% unlist() %>% max()
src_matrix <- pad_sequences(src_diglist, maxlen = src_maxlen, padding = "post")

target_diglist <- sentlist2diglist(map(word_pairs, ~ .[[2]]), target_index)
target_maxlen <- map(target_diglist, length) %>% unlist() %>% max()
target_matrix <- pad_sequences(target_diglist, maxlen = target_maxlen, padding = "post")
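# src_matrix and target_matrix are integer matrices of shape
# (number of sentences, maxlen), padded at the end with 0 (the <pad> index)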

train_indices <-
  sample(nrow(src_matrix), size = nrow(src_matrix) * 0.8)

validation_indices <- setdiff(1:nrow(src_matrix), train_indices)

x_train <- src_matrix[train_indices, ]
y_train <- target_matrix[train_indices, ]

str(x_train)
str(y_train)

x_valid <- src_matrix[validation_indices, ]
y_valid <- target_matrix[validation_indices, ]

str(x_valid)
str(y_valid)

buffer_size <- nrow(x_train)

# just for convenience, so we may get a glimpse at translation 
# performance during training
train_sentences <- sentences[train_indices]
validation_sentences <- sentences[validation_indices]
validation_sample <- sample(validation_sentences, 5)

str(train_sentences)

batch_size <- 32
embedding_dim <- 64
gru_units <- 256

src_vocab_size <- nrow(src_index)
target_vocab_size <- nrow(target_index)

train_dataset <- 
  tensor_slices_dataset(keras_array(list(x_train, y_train)))  %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)
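# each dataset element is a list of two (batch_size, maxlen) tensors;
# drop_remainder = TRUE keeps every batch at exactly batch_size rows,
# matching the fixed-size encoder_init_hidden used further below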

str(train_dataset)

validation_dataset <-
  tensor_slices_dataset(keras_array(list(x_valid, y_valid))) %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)

str(validation_dataset)


attention_encoder <-
  
  function(gru_units,
           embedding_dim,
           src_vocab_size,
           name = NULL) {
    
    keras_model_custom(name = name, function(self) {
      
      self$embedding <-
        layer_embedding(
          input_dim = src_vocab_size,
          output_dim = embedding_dim
        )
      
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      
      function(inputs, mask = NULL) {
        
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        
        x <- self$embedding(x)
        c(output, state) %<-% self$gru(x, initial_state = hidden)
        
        list(output, state)
      }
    })
  }


attention_decoder <-
  function(object,
           gru_units,
           embedding_dim,
           target_vocab_size,
           name = NULL) {
    
    keras_model_custom(name = name, function(self) {
      
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      
      self$embedding <-
        layer_embedding(input_dim = target_vocab_size, 
                        output_dim = embedding_dim)
      
      gru_units <- gru_units
      self$fc <- layer_dense(units = target_vocab_size)
      self$W1 <- layer_dense(units = gru_units)
      self$W2 <- layer_dense(units = gru_units)
      self$V <- layer_dense(units = 1L)
      
      function(inputs, mask = NULL) {
        
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        encoder_output <- inputs[[3]]
        
        # Bahdanau-style additive attention: score each encoder time step,
        # softmax over the time axis, then build the context vector as the
        # attention-weighted sum of the encoder outputs
        hidden_with_time_axis <- k_expand_dims(hidden, 2)
        
        score <- self$V(k_tanh(self$W1(encoder_output) + 
                                 self$W2(hidden_with_time_axis)))
        
        attention_weights <- k_softmax(score, axis = 2)
        
        context_vector <- attention_weights * encoder_output
        context_vector <- k_sum(context_vector, axis = 2)
        
        x <- self$embedding(x)
        
        x <- k_concatenate(list(k_expand_dims(context_vector, 2), x), axis = 3)
        
        c(output, state) %<-% self$gru(x)
        
        output <- k_reshape(output, c(-1, gru_units))
        
        x <- self$fc(output)
        
        list(x, state, attention_weights)
        
      }
      
    })
  }

encoder <- attention_encoder(
  gru_units = gru_units,
  embedding_dim = embedding_dim,
  src_vocab_size = src_vocab_size
)

decoder <- attention_decoder(
  gru_units = gru_units,
  embedding_dim = embedding_dim,
  target_vocab_size = target_vocab_size
)


optimizer <- tf$compat$v1$train$AdamOptimizer()

# masked cross entropy: positions where the label is 0 (<pad>) do not
# contribute to the loss; note that sparse_softmax_cross_entropy_with_logits
# expects integer (int32/int64) labels and floating-point logits
cx_loss <- function(y_true, y_pred) {
  mask <- ifelse(y_true == 0L, 0, 1)
  loss <-
    tf$nn$sparse_softmax_cross_entropy_with_logits(labels = y_true,
                                                   logits = y_pred) * mask
  tf$reduce_mean(loss)
}


n_epochs <- 50

encoder_init_hidden <- k_zeros(c(batch_size, gru_units))

for (epoch in seq_len(n_epochs)) {
  
  total_loss <- 0
  iteration <- 0
  
  iter <- make_iterator_one_shot(train_dataset)
  
  until_out_of_range({
    
    batch <- iterator_get_next(iter)
    loss <- 0
    x <- batch[[1]]
    y <- batch[[2]]
    iteration <- iteration + 1
    
    with(tf$GradientTape() %as% tape, {
      c(enc_output, enc_hidden) %<-% encoder(list(x, encoder_init_hidden))
      
      dec_hidden <- enc_hidden
      dec_input <-
        k_expand_dims(rep(list(
          word2index("<start>", target_index)
        ), batch_size))
      
      
      # decode the target sequence one position at a time, with teacher
      # forcing: the ground-truth token y[, t] becomes the next decoder input
      for (t in seq_len(target_maxlen - 1)) {
        c(preds, dec_hidden, weights) %<-%
          decoder(list(dec_input, dec_hidden, enc_output))
        loss <- loss + cx_loss(y[, t], preds)
        
        dec_input <- k_expand_dims(y[, t])
      }
      
    })
    
    total_loss <-
      total_loss + loss / k_cast_to_floatx(dim(y)[2])
    
    cat(paste0(
      "Batch loss (epoch/batch): ",
      epoch,
      "/",
      iteration,
      ": ",
      (loss / k_cast_to_floatx(dim(y)[2])) %>% 
        as.double() %>% round(4),
      "\n"
    ))
    
    variables <- c(encoder$variables, decoder$variables)
    gradients <- tape$gradient(loss, variables)
    
    optimizer$apply_gradients(
      purrr::transpose(list(gradients, variables)),
      global_step = tf$train$get_or_create_global_step()
    )
    
  })

  cat(paste0(
    "Total loss (epoch): ",
    epoch,
    ": ",
    (total_loss / k_cast_to_floatx(buffer_size)) %>% 
      as.double() %>% round(4),
    "\n"
  ))
}


This code fails with the following error:

2020-02-12 12:48:30.175011: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
Error: NotFoundError: Could not find valid device for node.
Node:{{node SparseSoftmaxCrossEntropyWithLogits}}
All kernels registered for op SparseSoftmaxCrossEntropyWithLogits :
  device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
  device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT64]
  device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
  device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
  device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
  device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
  device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
 [Op:SparseSoftmaxCrossEntropyWithLogits]

It is not clear to me what the reason for this failure is. Do you get the same result?
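
For what it's worth, the kernel list above only shows registrations with integer (int32/int64) labels and floating-point logits, so my guess is that one of the tensors reaching cx_loss has a dtype outside those combinations. Just to illustrate what I mean (a sketch, not code from the blog post, placed right before the cx_loss() call in the training loop; I have not verified that this is actually the cause):

print((y[, t])$dtype)   # labels handed to sparse_softmax_cross_entropy_with_logits
print(preds$dtype)      # logits
# if the labels turn out not to be int32/int64, a cast along these lines might be needed:
loss <- loss + cx_loss(tf$cast(y[, t], tf$int64), preds)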

mg64ve · Feb 12 '20, 12:02