textrecipes
Using pretrained models in step_tokenize_sentencepiece
I'm trying to get BPE tokenization working in step_tokenize_sentencepiece.
Could I use an already trained model? Here are some examples trying different approaches:
library(tidymodels)
library(textrecipes)
library(sentencepiece)
dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos")
)
test <- data.frame(
  "text" = c("negative results", "neg"),
  "label_col" = c("neg", "neg")
)
rec0 <- dataf |>
  recipe(label_col ~ text) |>
  step_tokenize_sentencepiece(text, vocabulary_size = 25) |>
  step_tfidf(text)

# It splits by character, not by BPE
rec0 |>
  prep() |>
  juice() |>
  print()

# 0.853 for .pred_neg in the first case
rec0 |>
  workflow(logistic_reg()) |>
  fit(dataf) |>
  augment(test, type = c("prob", "class"))
# I can't get BPE splitting in step_tokenize_sentencepiece
tryCatch(
  {
    rec1 <- dataf |>
      recipe(label_col ~ text) |>
      step_tokenize_sentencepiece(text, vocabulary_size = 25, options = list(type = "bpe")) |>
      step_tfidf(text)
    rec1 |>
      prep() |>
      juice() |>
      print()
  },
  error = \(e) message(e)
)
# I can do it in step_tokenize_bpe
rec2 <- dataf |>
  recipe(label_col ~ text) |>
  step_tokenize_bpe(text) |>
  step_tfidf(text)

rec2 |>
  prep() |>
  juice() |>
  print()

# 1.00 for .pred_neg in the first case
rec2 |>
  workflow(logistic_reg()) |>
  fit(dataf) |>
  augment(test, type = c("prob", "class"))
# Using a previously trained sentencepiece model
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
# mode = "wb" so the binary model file survives the download on Windows
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model", mode = "wb")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |>
  tokenlist()

rec3 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_tfidf(text)

rec3 |>
  prep() |>
  juice() |>
  print()

rec3 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))
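Note that with this approach the tokenization happens outside the recipe, so any genuinely new text has to be encoded the same way before prediction. A minimal sketch, using a hypothetical encode_text() helper:

# encode_text() is a hypothetical helper, not part of textrecipes:
# it encodes raw text exactly like the training data was encoded above.
encode_text <- function(df, model) {
  df$text <- sentencepiece_encode(model, df$text, type = "subwords") |>
    tokenlist()
  df
}

fitted3 <- rec3 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train"))

new_data <- data.frame(text = "really bad results", label_col = "neg") |>
  encode_text(model)
augment(fitted3, new_data)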
# Using a previously trained sentencepiece model,
# with word embeddings instead of step_tfidf
all <- bind_rows(dataf |> mutate(data = "train"), test |> mutate(data = "test"))
download.file("https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model", "en.wiki.bpe.vs1000.model", mode = "wb")
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")
embeddings <- tibble(
  tokens = sentencepiece_encode(model, all$text, type = "subwords") |> unlist() |> unique(),
  ids = sentencepiece_encode(model, all$text, type = "ids") |> unlist() |> unique()
)
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |>
  tokenlist()

rec4 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_word_embeddings(text, embeddings = embeddings, aggregation = "sum", keep_original_cols = TRUE, prefix = "sum") |>
  step_word_embeddings(text, embeddings = embeddings, aggregation = "mean", keep_original_cols = TRUE, prefix = "mean") |>
  step_word_embeddings(text, embeddings = embeddings, aggregation = "max", keep_original_cols = TRUE, prefix = "max") |>
  step_rm(text)

rec4 |>
  prep() |>
  juice() |>
  print()

rec4 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))
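As an aside, building the embeddings table by calling unique() on two separately unlisted vectors only lines up because each subword always maps to the same id, so the two first-occurrence orders match. If the loaded model exposes its vocabulary directly (the sentencepiece model object appears to carry one; check str(model)), the table can be built without that assumption. A minimal sketch, assuming model$vocabulary has subword and id columns:

# Assumes model$vocabulary is a data frame with `subword` and `id` columns;
# verify with str(model) before relying on it.
embeddings2 <- tibble(
  tokens = model$vocabulary$subword,
  ids = model$vocabulary$id
)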
For the first part:
The reason you are getting characters when you were expecting BPE is that the vocabulary size is set too small. A vocabulary_size of 25 is too low to allow for combined characters.
library(textrecipes)

dataf <- data.frame(
  "text" = c("positive sentiment", "super neg", "bad outcome", "good results"),
  "label_col" = c("pos", "neg", "neg", "pos")
)

rec0 <- dataf |>
  recipe(label_col ~ text) |>
  step_tokenize_sentencepiece(text, vocabulary_size = 100) |>
  step_tfidf(text)

rec0 |>
  prep() |>
  juice() |>
  names()
#> [1] "label_col" "tfidf_text_</s>" "tfidf_text_<s>"
#> [4] "tfidf_text_<unk>" "tfidf_text_▁" "tfidf_text_▁b"
#> [7] "tfidf_text_▁bad" "tfidf_text_▁g" "tfidf_text_▁go"
#> [10] "tfidf_text_▁good" "tfidf_text_▁n" "tfidf_text_▁neg"
#> [13] "tfidf_text_▁o" "tfidf_text_▁ou" "tfidf_text_▁out"
#> [16] "tfidf_text_▁outcome" "tfidf_text_▁p" "tfidf_text_▁pos"
#> [19] "tfidf_text_▁positive" "tfidf_text_▁r" "tfidf_text_▁re"
#> [22] "tfidf_text_▁results" "tfidf_text_▁s" "tfidf_text_▁sen"
#> [25] "tfidf_text_▁sentiment" "tfidf_text_▁su" "tfidf_text_▁super"
#> [28] "tfidf_text_a" "tfidf_text_ad" "tfidf_text_b"
#> [31] "tfidf_text_ba" "tfidf_text_bad" "tfidf_text_c"
#> [34] "tfidf_text_co" "tfidf_text_com" "tfidf_text_come"
#> [37] "tfidf_text_d" "tfidf_text_e" "tfidf_text_eg"
#> [40] "tfidf_text_en" "tfidf_text_ent" "tfidf_text_enti"
#> [43] "tfidf_text_er" "tfidf_text_es" "tfidf_text_esu"
#> [46] "tfidf_text_g" "tfidf_text_go" "tfidf_text_goo"
#> [49] "tfidf_text_i" "tfidf_text_im" "tfidf_text_it"
#> [52] "tfidf_text_iti" "tfidf_text_itive" "tfidf_text_iv"
#> [55] "tfidf_text_l" "tfidf_text_lt" "tfidf_text_lts"
#> [58] "tfidf_text_m" "tfidf_text_me" "tfidf_text_men"
#> [61] "tfidf_text_n" "tfidf_text_ne" "tfidf_text_neg"
#> [64] "tfidf_text_nt" "tfidf_text_o" "tfidf_text_od"
#> [67] "tfidf_text_om" "tfidf_text_oo" "tfidf_text_os"
#> [70] "tfidf_text_osi" "tfidf_text_ou" "tfidf_text_out"
#> [73] "tfidf_text_p" "tfidf_text_pe" "tfidf_text_per"
#> [76] "tfidf_text_po" "tfidf_text_pos" "tfidf_text_r"
#> [79] "tfidf_text_re" "tfidf_text_s" "tfidf_text_se"
#> [82] "tfidf_text_sen" "tfidf_text_si" "tfidf_text_su"
#> [85] "tfidf_text_sul" "tfidf_text_sults" "tfidf_text_sup"
#> [88] "tfidf_text_t" "tfidf_text_tc" "tfidf_text_tco"
#> [91] "tfidf_text_ti" "tfidf_text_tim" "tfidf_text_timent"
#> [94] "tfidf_text_tiv" "tfidf_text_ts" "tfidf_text_u"
#> [97] "tfidf_text_ul" "tfidf_text_up" "tfidf_text_ut"
#> [100] "tfidf_text_v" "tfidf_text_ve"
Created on 2024-12-10 with reprex v2.1.0
For the second part: it is not currently possible to use a pretrained model in step_tokenize_sentencepiece, but it is a valid thing to want to do.
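A possible workaround in the meantime: step_tokenize() accepts a user-supplied tokenizer through its custom_token argument (a function that takes a character vector and returns a list of character vectors), so a pretrained sentencepiece model can be wrapped and applied inside the recipe. A minimal sketch, reusing the model downloaded above; treat it as untested rather than an officially supported path:

library(textrecipes)
library(sentencepiece)

model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# Wrap the pretrained model: character vector in, list of character
# vectors out, which is the contract custom_token expects
sp_tokenizer <- function(x) {
  sentencepiece_encode(model, x, type = "subwords")
}

rec_pretrained <- dataf |>
  recipe(label_col ~ text) |>
  step_tokenize(text, custom_token = sp_tokenizer) |>
  step_tfidf(text)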