TextAnalysis.jl
Performance issue with prepare!(doc, strip_stopwords)
Calling prepare!(sdoc, strip_case | strip_stopwords) on a StringDocument, even for a small ~3.4MB file, takes forever to return (other than for small strings, I have not seen this function finish successfully).
function method_textanalysis(str_rec)
    sdoc = StringDocument(str_rec)
    prepare!(sdoc, strip_case | strip_stopwords)
    return sdoc
end
I have tracked the slowness to the following function.
https://github.com/JuliaText/TextAnalysis.jl/blob/c8ae7a217d19f19d8c8e3e22da9ea5970ece40d4/src/preprocessing.jl#L253
function remove_patterns(s::AbstractString, rex::Regex)
    iob = IOBuffer()
    ibegin = 1
    v = codeunits(s)
    for m in eachmatch(rex, s)
        len = m.match.offset - ibegin + 1
        next = nextind(s, lastindex(m.match) + m.match.offset)
        if len > 0
            Base.write_sub(iob, v, ibegin, len)
            if next != length(s) + 1
                write(iob, ' ')
            end
        end
        ibegin = next
    end
    len = length(v) - ibegin + 1
    (len > 0) && Base.write_sub(iob, v, ibegin, len)
    String(take!(iob))
end
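One candidate hotspot (an observation from reading the code, not from profiling): length(s) is O(n) for a String, and it is re-evaluated once per match inside the loop, so a document with many stopword matches pays a roughly quadratic cost. Below is a minimal sketch of the same loop with that call hoisted out; it is illustrative only, not the library's implementation. It uses a view in place of the internal Base.write_sub, and it compares against ncodeunits(s) + 1 (a byte index, like next) rather than the character count length(s) + 1.

function remove_patterns_hoisted(s::AbstractString, rex::Regex)
    iob = IOBuffer()
    ibegin = 1
    v = codeunits(s)
    stop = ncodeunits(s) + 1   # computed once, O(1); a byte index like `next`
    for m in eachmatch(rex, s)
        len = m.match.offset - ibegin + 1
        next = nextind(s, lastindex(m.match) + m.match.offset)
        if len > 0
            write(iob, view(v, ibegin:(ibegin + len - 1)))  # copy the unmatched span
            next != stop && write(iob, ' ')
        end
        ibegin = next
    end
    len = length(v) - ibegin + 1
    (len > 0) && write(iob, view(v, ibegin:(ibegin + len - 1)))
    String(take!(iob))
end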
Manually performing a similar task takes ~1.4 seconds on the same 3.4MB text file. I say "similar" because, to eliminate stop words manually, I first tokenize the document and then filter out the stop words. That is functionally quite different from executing a regex over one large string, and it may not be the ideal approach for preserving the structure of the document. (Complete code at the end.)
function method_manual(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    str_rec = lowercase(str_rec)
    word_tokens = tokenize(str_rec)
    res = filter(x -> !in(x, stop_words), word_tokens)
    return res
end
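As an aside, the list returned by stopwords(...) appears to be a plain array, so the in test above is a linear scan per token; wrapping it in a Set makes each lookup O(1). A small sketch of that variant (method_manual_set is a hypothetical name, not part of any package):

using Languages, WordTokenizers

# Variant of method_manual with O(1) stopword lookups via a Set.
function method_manual_set(str_rec)
    stop_words = Set(Languages.stopwords(Languages.English()))
    word_tokens = tokenize(lowercase(str_rec))
    return filter(x -> !(x in stop_words), word_tokens)
end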
I was wondering if there is a more efficient way to eliminate stop words from a StringDocument. Here is the complete code:
using Pkg

@info "Installing required packages..."
required_pkg = ["TextAnalysis", "Languages", "WordTokenizers"]
installed_pkg = Pkg.installed()
# add any required package that is not already installed
[(in(p, keys(installed_pkg)) || Pkg.add(p)) for p in required_pkg]

using TextAnalysis
using WordTokenizers
using Languages
"""
Download data if it does not already exists
"""
function download_data(;
url="http://www.gutenberg.org/files/2600/2600-0.txt",
localfile="2600-0.txt")
if !isfile(localfile)
download(url, localfile)
else
@info "file $(localfile) already exists, skipping download"
end
end
"""
Return data (~100MB uncompressed) in form of a string
"""
function getdata(fn="2600-0.txt")
download_data()
str_rec = read(fn, String)
end
"""
Pre Process data using TextAnalysis - strip_case | strip_stopwords
"""
function method_textanalysis(str_rec)
sdoc = StringDocument(str_rec)
#prepare!(sdoc, strip_case | strip_stopwords)
prepare!(sdoc, strip_stopwords)
return sdoc
end
"""
Pre Process data without using `prepare!`
"""
function method_manual(str_rec)
stop_words = Languages.stopwords(Languages.English())
str_rec = lowercase(str_rec)
word_tokens = tokenize(str_rec)
res = filter(x->!in(x, stop_words), word_tokens)
return res
end
"""
Main
"""
function main()
str_rec = getdata()
@info "Manual Pre Processing"
@time res2 = method_manual(str_rec)
@info "Pre Processing using TextAnalysis"
@time res1 = method_textanalysis(str_rec)
end
main()
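Side note on the timings: @time includes compilation on the first call, so if BenchmarkTools is available, @btime gives steadier numbers, e.g.:

using BenchmarkTools
str_rec = getdata()
@btime method_manual($str_rec);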
FYI: a quick test of calling replace in a loop over stop_words appears to be much faster than the existing remove_patterns method: under ~2s vs ~940s.
function preprocess(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    str_rec = lowercase(str_rec)
    for sw in stop_words
        # delete each stop word, matching on word boundaries
        rex = Regex("\\b" * sw * "\\b")
        str_rec = replace(str_rec, rex => "")
    end
    return str_rec
end
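A further variant worth trying (an untested sketch, not benchmarked here): join all the stop words into a single alternation so the string is scanned once, instead of once per stop word. This assumes the English stop word list contains no regex metacharacters.

using Languages

function preprocess_single_pass(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    # one combined pattern: \b(?:word1|word2|...)\b
    rex = Regex("\\b(?:" * join(stop_words, "|") * ")\\b")
    return replace(lowercase(str_rec), rex => "")
end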
Not sure if my input is warranted, but I just wanted to post a solution I found that worked. Note, however, that this process removes stop words from the return value of tokenize (using WordTokenizers).
STOPWORDS = stopwords(Languages.English());  # using Languages

"""
    my_tokenize(text, sw)

Return a vector of the tokenized words in `text`, with stopwords removed by default.
To return only the stopwords in `text`, set the argument `sw` to "only".
"""
function my_tokenize(text, sw::String = "remove")
    if sw == "remove"
        return collect(word for word in tokenize(text) if !in(word, STOPWORDS))
    elseif sw == "only"
        return collect(word for word in tokenize(text) if in(word, STOPWORDS))
    else
        return collect(word for word in tokenize(text))
    end
end
I then apply it like:
purpose = select(t_new, :purpose);
lower = lowercase.(purpose);
num_words = length.(my_tokenize.(lower));
I'm open to hearing improvements, but this was fast and worked for my use case.