simdutf8
simdutf8 copied to clipboard
Chunked iterator API like `Utf8Chunks`
I've wanted chunked UTF-8 decoding twice recently for different escaping routines, and have used `simdutf8::compat::from_utf8` in a loop to achieve that. I would really like to be able to use an API like `Utf8Chunks` from `#[feature(utf8_lossy)]` or `bstr::Utf8Chunks`, but with the faster validation of this crate. `Utf8Chunks` avoids the disconnect between the length of the valid prefix and the prefix as a string. Additionally, I suspect an API for this could avoid some overhead from decoding in a loop.
I ended up writing something close to this:
/// Lossily converts `v` to a string using simdutf8's validator.
///
/// Returns a borrowed `Cow` when the first validation of `v` succeeds.
/// Otherwise builds an owned `String` in which every error reported by the
/// validator is replaced by one U+FFFD REPLACEMENT CHARACTER.
pub fn from_utf8_lossy(mut v: &[u8]) -> Cow<'_, str> {
    const REPLACEMENT: &str = "\u{FFFD}";

    // Fast path: the whole input validated, so nothing needs to be copied.
    let mut err = match simdutf8::compat::from_utf8(v) {
        Ok(s) => return s.into(),
        Err(e) => e,
    };

    let mut cleaned = String::with_capacity(v.len());
    loop {
        let valid_len = err.valid_up_to();
        // SAFETY: `err` was produced by validating the current `v`, and the
        // validator reports `v[..valid_up_to()]` as well-formed UTF-8.
        cleaned.push_str(unsafe { str::from_utf8_unchecked(&v[..valid_len]) });
        cleaned.push_str(REPLACEMENT);

        match err.error_len() {
            // No error length: presumably the input ended inside a character
            // (as with std's `Utf8Error`), so there is nothing left to resume from.
            None => break,
            Some(error_len) => {
                // Skip the offending bytes and validate the remainder.
                v = &v[valid_len + error_len..];
                match simdutf8::compat::from_utf8(v) {
                    Ok(tail) => {
                        cleaned.push_str(tail);
                        break;
                    }
                    Err(next) => err = next,
                }
            }
        }
    }
    cleaned.into()
}
Compare to the stdlib implementation of `String::from_utf8_lossy`, which avoids any direct offset fiddling and unchecked conversions:
/// Converts a byte slice to a string, replacing each invalid UTF-8 sequence
/// with U+FFFD REPLACEMENT CHARACTER.
///
/// Returns `Cow::Borrowed` (no allocation) when `v` is empty or entirely
/// valid UTF-8, and `Cow::Owned` as soon as any replacement is needed.
///
/// Chunks are obtained through the stable `<[u8]>::utf8_chunks` method, so
/// no offset arithmetic or unchecked conversion is required.
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
    const REPLACEMENT: &str = "\u{FFFD}";

    let mut iter = v.utf8_chunks();

    let first_valid = match iter.next() {
        // Empty input yields no chunks at all.
        None => return Cow::Borrowed(""),
        Some(chunk) => {
            let valid = chunk.valid();
            if chunk.invalid().is_empty() {
                // A first chunk without an invalid part spans the whole input.
                debug_assert_eq!(valid.len(), v.len());
                return Cow::Borrowed(valid);
            }
            valid
        }
    };

    let mut res = String::with_capacity(v.len());
    res.push_str(first_valid);
    // The first chunk is known to carry an invalid sequence at this point.
    res.push_str(REPLACEMENT);

    for chunk in iter {
        res.push_str(chunk.valid());
        if !chunk.invalid().is_empty() {
            res.push_str(REPLACEMENT);
        }
    }

    Cow::Owned(res)
}