The basicBertPreprocessors in the BertFullTokenizer class do not include a processor for handling Chinese text.
The basicBertPreprocessors built by the BertFullTokenizer class differ from the tokenize method in BERT's BasicTokenizer: the DJL pipeline has no step that handles Chinese characters (BERT's _tokenize_chinese_chars), so CJK text is never split into per-character tokens.
In BertFullTokenizer:

```java
/**
 * Get a list of {@link TextProcessor}s to process input text for Bert models.
 *
 * @param lowerCase whether to convert input to lowercase
 * @return List of {@code TextProcessor}s
 */
public static List<TextProcessor> getPreprocessors(boolean lowerCase) {
    List<TextProcessor> processors = new ArrayList<>(10);
    processors.add(new TextCleaner(c -> c == 0 || c == 0xfffd || NlpUtils.isControl(c), '\0'));
    processors.add(new TextCleaner(NlpUtils::isWhiteSpace, ' '));
    processors.add(new LambdaProcessor(String::trim));
    processors.add(new SimpleTokenizer());
    if (lowerCase) {
        processors.add(new LowerCaseConvertor());
    }
    processors.add(new UnicodeNormalizer(Normalizer.Form.NFD));
    processors.add(
            new TextCleaner(c -> Character.getType(c) == Character.NON_SPACING_MARK, '\0'));
    processors.add(new PunctuationSeparator());
    processors.add(new LambdaProcessor(String::trim));
    return processors;
}
```
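For reference, here is a minimal sketch of what the missing step could look like as a custom TextProcessor. It assumes DJL's preprocess(List&lt;String&gt;) interface that the processors above implement (package assumed to be ai.djl.modality.nlp.preprocess); the class name ChineseCharacterSplitter is illustrative and not part of DJL, and the codepoint ranges are taken from BERT's _is_chinese_char. It would have to run before SimpleTokenizer so the inserted spaces actually split each Chinese character into its own token, matching the _tokenize_chinese_chars step quoted below.

```java
import java.util.ArrayList;
import java.util.List;

import ai.djl.modality.nlp.preprocess.TextProcessor;

/**
 * Hypothetical processor (not part of DJL) that surrounds CJK ideographs with
 * spaces, mirroring BERT's BasicTokenizer._tokenize_chinese_chars. Intended to
 * run before SimpleTokenizer so each Chinese character becomes its own token.
 */
public class ChineseCharacterSplitter implements TextProcessor {

    /** {@inheritDoc} */
    @Override
    public List<String> preprocess(List<String> tokens) {
        List<String> result = new ArrayList<>(tokens.size());
        for (String token : tokens) {
            StringBuilder sb = new StringBuilder(token.length());
            token.codePoints()
                    .forEach(
                            cp -> {
                                if (isCjkChar(cp)) {
                                    // pad the ideograph so whitespace tokenization splits it out
                                    sb.append(' ').appendCodePoint(cp).append(' ');
                                } else {
                                    sb.appendCodePoint(cp);
                                }
                            });
            result.add(sb.toString());
        }
        return result;
    }

    // Same codepoint ranges BERT's _is_chinese_char checks: CJK Unified Ideographs,
    // their extensions, and the CJK Compatibility Ideographs blocks.
    private static boolean isCjkChar(int cp) {
        return (cp >= 0x4E00 && cp <= 0x9FFF)
                || (cp >= 0x3400 && cp <= 0x4DBF)
                || (cp >= 0x20000 && cp <= 0x2A6DF)
                || (cp >= 0x2A700 && cp <= 0x2B73F)
                || (cp >= 0x2B740 && cp <= 0x2B81F)
                || (cp >= 0x2B820 && cp <= 0x2CEAF)
                || (cp >= 0xF900 && cp <= 0xFAFF)
                || (cp >= 0x2F800 && cp <= 0x2FA1F);
    }
}
```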
In BERT (BasicTokenizer.tokenize):

```python
def tokenize(self, text):
    """Tokenizes a piece of text."""
    text = convert_to_unicode(text)
    text = self._clean_text(text)

    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    text = self._tokenize_chinese_chars(text)

    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        split_tokens.extend(self._run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens
```
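Wiring such a step into the DJL pipeline would then come down to adding it before SimpleTokenizer. A hedged sketch, reusing the hypothetical ChineseCharacterSplitter from above:

```java
List<TextProcessor> processors = new ArrayList<>(11);
processors.add(new TextCleaner(c -> c == 0 || c == 0xfffd || NlpUtils.isControl(c), '\0'));
processors.add(new TextCleaner(NlpUtils::isWhiteSpace, ' '));
processors.add(new LambdaProcessor(String::trim));
// Hypothetical step: surround every CJK ideograph with spaces so that
// SimpleTokenizer emits each Chinese character as its own token, matching
// BERT's _tokenize_chinese_chars behaviour.
processors.add(new ChineseCharacterSplitter());
processors.add(new SimpleTokenizer());
// ... remaining processors exactly as in getPreprocessors(boolean) above
```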