
Using pretrained models in step_tokenize_sentencepiece

Open · jrosell opened this issue 11 months ago · 2 comments

I'm trying to get BPE tokenization working in step_tokenize_sentencepiece.

Could I use an already trained model? Here are some examples trying different approaches:

library(tidymodels)
library(textrecipes)
library(sentencepiece)

dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos", "neg")
)

test <- data.frame(
  "text" = c("negative results", "neg"),
  "label_col" = c("neg", "neg")
)

rec0 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_sentencepiece(text, vocabulary_size = 25) |> 
  step_tfidf(text) 

# It splits by character, not by "bpe"
rec0 |> 
  prep() |> 
  juice() |>
  print()


# 0.853 for .pred_neg on the first test row
rec0 |> 
  workflow(logistic_reg()) |> 
  fit(dataf) |> 
  augment(test, type = c("prob", "class"))

# I can't split by bpe in step_tokenize_sentencepiece
tryCatch({
    rec1 <- dataf |> 
      recipe(label_col ~ text) |> 
      step_tokenize_sentencepiece(text, vocabulary_size = 25, options = list(type = "bpe")) |> 
      step_tfidf(text) 

    rec1 |> 
      prep() |> 
      juice() |>
      print()
  },
  error = \(e) message(e)
)

# I can do it in step_tokenize_bpe
rec2 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_bpe(text) |>   
  step_tfidf(text) 

rec2 |> 
  prep() |> 
  juice() |>
  print()

# 1.00 for .pred_neg on the first test row
rec2 |> 
  workflow(logistic_reg()) |> 
  fit(dataf) |> 
  augment(test, type = c("prob", "class"))


# Using a previously trained sentencepiece model
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |> 
  tokenlist()

rec3 <- all |> 
  filter(data == "train") |>
  recipe(label_col ~ text) |> 
  step_tfidf(text) 

rec3 |> 
  prep() |> 
  juice() |>
  print()

rec3 |> 
  workflow(logistic_reg()) |> 
  fit(all |> filter(data == "train")) |> 
  augment(all |> filter(data == "test"), type = c("prob", "class"))



# Using a previously trained sentencepiece model
# Using word embeddings instead of step_tfidf
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
embeddings <- tibble(
  tokens = sentencepiece_encode(model, all$text, type = "subwords") |> unlist() |> unique(),
  ids = sentencepiece_encode(model, all$text, type = "ids") |> unlist() |> unique()
)
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |> 
  tokenlist()
rec4 <- all |> 
  filter(data == "train") |>
  recipe(label_col ~ text) |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "sum", keep_original_cols = TRUE, prefix = "sum") |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "mean", keep_original_cols = TRUE, prefix = "mean") |> 
  step_word_embeddings(text, embeddings = embeddings, aggregation = "max", keep_original_cols = TRUE, prefix = "max") |> 
  step_rm(text)

rec4 |> 
  prep() |> 
  juice() |>
  print()

rec4 |> 
  workflow(logistic_reg()) |> 
  fit(all |> filter(data == "train")) |> 
  augment(all |> filter(data == "test"), type = c("prob", "class"))

jrosell commented Dec 10 '24 18:12

For the first part:

The reason you are getting characters when you were expecting BPE is that the vocabulary size is set too small. A vocabulary_size of 25 is too low to allow for combined characters.

library(textrecipes)

dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos")
)

rec0 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize_sentencepiece(text, vocabulary_size = 100) |> 
  step_tfidf(text) 

rec0 |> 
  prep() |> 
  juice() |>
  names()
#>   [1] "label_col"             "tfidf_text_</s>"       "tfidf_text_<s>"       
#>   [4] "tfidf_text_<unk>"      "tfidf_text_▁"          "tfidf_text_▁b"        
#>   [7] "tfidf_text_▁bad"       "tfidf_text_▁g"         "tfidf_text_▁go"       
#>  [10] "tfidf_text_▁good"      "tfidf_text_▁n"         "tfidf_text_▁neg"      
#>  [13] "tfidf_text_▁o"         "tfidf_text_▁ou"        "tfidf_text_▁out"      
#>  [16] "tfidf_text_▁outcome"   "tfidf_text_▁p"         "tfidf_text_▁pos"      
#>  [19] "tfidf_text_▁positive"  "tfidf_text_▁r"         "tfidf_text_▁re"       
#>  [22] "tfidf_text_▁results"   "tfidf_text_▁s"         "tfidf_text_▁sen"      
#>  [25] "tfidf_text_▁sentiment" "tfidf_text_▁su"        "tfidf_text_▁super"    
#>  [28] "tfidf_text_a"          "tfidf_text_ad"         "tfidf_text_b"         
#>  [31] "tfidf_text_ba"         "tfidf_text_bad"        "tfidf_text_c"         
#>  [34] "tfidf_text_co"         "tfidf_text_com"        "tfidf_text_come"      
#>  [37] "tfidf_text_d"          "tfidf_text_e"          "tfidf_text_eg"        
#>  [40] "tfidf_text_en"         "tfidf_text_ent"        "tfidf_text_enti"      
#>  [43] "tfidf_text_er"         "tfidf_text_es"         "tfidf_text_esu"       
#>  [46] "tfidf_text_g"          "tfidf_text_go"         "tfidf_text_goo"       
#>  [49] "tfidf_text_i"          "tfidf_text_im"         "tfidf_text_it"        
#>  [52] "tfidf_text_iti"        "tfidf_text_itive"      "tfidf_text_iv"        
#>  [55] "tfidf_text_l"          "tfidf_text_lt"         "tfidf_text_lts"       
#>  [58] "tfidf_text_m"          "tfidf_text_me"         "tfidf_text_men"       
#>  [61] "tfidf_text_n"          "tfidf_text_ne"         "tfidf_text_neg"       
#>  [64] "tfidf_text_nt"         "tfidf_text_o"          "tfidf_text_od"        
#>  [67] "tfidf_text_om"         "tfidf_text_oo"         "tfidf_text_os"        
#>  [70] "tfidf_text_osi"        "tfidf_text_ou"         "tfidf_text_out"       
#>  [73] "tfidf_text_p"          "tfidf_text_pe"         "tfidf_text_per"       
#>  [76] "tfidf_text_po"         "tfidf_text_pos"        "tfidf_text_r"         
#>  [79] "tfidf_text_re"         "tfidf_text_s"          "tfidf_text_se"        
#>  [82] "tfidf_text_sen"        "tfidf_text_si"         "tfidf_text_su"        
#>  [85] "tfidf_text_sul"        "tfidf_text_sults"      "tfidf_text_sup"       
#>  [88] "tfidf_text_t"          "tfidf_text_tc"         "tfidf_text_tco"       
#>  [91] "tfidf_text_ti"         "tfidf_text_tim"        "tfidf_text_timent"    
#>  [94] "tfidf_text_tiv"        "tfidf_text_ts"         "tfidf_text_u"         
#>  [97] "tfidf_text_ul"         "tfidf_text_up"         "tfidf_text_ut"        
#> [100] "tfidf_text_v"          "tfidf_text_ve"

Created on 2024-12-10 with reprex v2.1.0

EmilHvitfeldt commented Dec 10 '24 19:12

It is not currently possible to use a pretrained model, but it is a valid thing to want to do.

EmilHvitfeldt commented Dec 10 '24 19:12
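
Until that is supported, one workaround should be step_tokenize()'s custom_token argument, which accepts any function that maps a character vector to a list of character vectors; a pretrained sentencepiece model can be wrapped in such a function. A minimal sketch, assuming the en.wiki.bpe.vs1000.model file downloaded earlier in the thread (pretrained_tokenizer and rec5 are hypothetical names, not from the thread):

library(tidymodels)
library(textrecipes)
library(sentencepiece)

# Load the pretrained BPE model downloaded earlier in the thread
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# custom_token must take a character vector and return a list of
# character vectors, which sentencepiece_encode() already does
pretrained_tokenizer <- \(x) sentencepiece_encode(model, x, type = "subwords")

rec5 <- dataf |> 
  recipe(label_col ~ text) |> 
  step_tokenize(text, custom_token = pretrained_tokenizer) |> 
  step_tfidf(text)

rec5 |> 
  prep() |> 
  juice() |> 
  print()

This sidesteps training inside the step entirely: the model is fit once outside the recipe and only applied at tokenization time, so train and test data are tokenized with the same vocabulary.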