spacymoji
spacymoji copied to clipboard
Tokens not properly split when using emoji with modifiers
import spacy
from spacymoji import Emoji
def test():
    """Reproduce the tokenization problem with emoji skin-tone modifiers.

    Builds an ``en_core_web_sm`` pipeline with the spacymoji ``Emoji``
    component registered via ``first=True``, then tokenizes three inputs
    and prints every token of each:

    1. an emoji with a skin-tone modifier directly after ``Word!``
       (per the report, ``Word!`` is not split in this case),
    2. the same text with a space before the emoji,
    3. the same emoji without a modifier, directly after ``Word!``.

    Returns:
        The ``Doc`` produced for the last input.
    """
    nlp = spacy.load('en_core_web_sm')
    emoji = Emoji(nlp, merge_spans=True)
    nlp.add_pipe(emoji, first=True)

    texts = (
        'Word!👍🏿',   # modifier, no separating space
        'Word! 👍🏿',  # modifier, separated by a space
        'Word!👍',    # no modifier, no separating space
    )
    for text in texts:
        doc = nlp(text)
        for token in doc:
            print(token)
    # The caller gets the doc of the final input, as in the original snippet.
    return doc
This shows the problem: "Word!" is not split into "Word" and "!" when the thumbs-up emoji that directly follows it has a dark skin tone modifier.
Try this
import spacy
from spacymoji import Emoji
# Build the pipeline with the spacymoji component registered via first=True.
nlp = spacy.load("en_core_web_sm")
emoji = Emoji(nlp, merge_spans=True)
nlp.add_pipe(emoji, first=True)

# case 1: emoji with skin-tone modifier directly after "Word!"
# case 2: the same emoji separated from "Word!" by a space
for text in ('Word!👍🏿', 'Word! 👍🏿'):
    doc = nlp(text)
    print([token.text for token in doc])
Output (case 1 reproduces the bug: "Word!" is left unsplit)
['Word!', '👍🏿']
['Word', '!', '👍🏿']