scalaprops
scalaprops copied to clipboard
Unicode String Gen
- https://github.com/apache/commons-lang/blob/LANG_3_4/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
- https://github.com/msgpack4z/msgpack4z-core/blob/v0.3.2/src/test/scala/msgpack4z/StdSpec.scala#L16-L33
https://github.com/kxbmap/configs/blob/2acd37e039d4bc45ec46fbf359847897e7d615eb/core/src/test/scala/configs/testutil/instance/string.scala#L30-L50
Why don't you just use `Character.isValidCodePoint(Int)`?
if (Character.isValidCodePoint(i)) String.valueOf(Character.toChars(i)) else /* retry code */
/** Generates well-formed UTF-16 strings made of arbitrary Unicode scalar values.
  *
  * Code points are drawn uniformly from U+0000..U+10FFFF excluding the
  * surrogate range U+D800..U+DFFF. Surrogate code points are not scalar
  * values: emitting one on its own yields an unpaired surrogate, i.e. a
  * malformed string that cannot be round-tripped through UTF-8.
  * Supplementary code points are encoded as a proper high/low surrogate pair.
  */
val unicodeString: Gen[String] = {
  val surrogateStart = 0xD800 // Character.MIN_SURROGATE
  val surrogateCount = 0x800  // size of U+D800..U+DFFF
  val maxCodePoint = 0x10FFFF // Character.MAX_CODE_POINT

  // Pick an index into the scalar values, then skip over the surrogate gap so
  // that every remaining code point stays equally likely.
  val chars = Gen.choose(0, maxCodePoint - surrogateCount).map { n =>
    val cp = if (n < surrogateStart) n else n + surrogateCount
    // toChars yields one char for BMP code points and a surrogate pair otherwise.
    Character.toChars(cp).toList
  }
  Gen.listOf(chars).map(_.flatten.mkString)
}
references
- https://en.wikipedia.org/wiki/UTF-8 for how `Array[Byte]` is encoded
- https://en.wikipedia.org/wiki/UTF-16 for how `Array[Char]` (aka `String`) is encoded
You probably want something like this to handle surrogate pairs during shrinking:
/** Shrinks a string by removing one whole code point from either end.
  *
  * Candidates are the input without its first code point and the input
  * without its last code point (deduplicated). A surrogate pair is always
  * removed as a unit, so shrinking never splits a pair. An unpaired surrogate
  * counts as a single code point, so it is removed on its own without taking
  * a neighbouring character with it. The empty string has no shrinks.
  */
implicit val strShrinker: Shrink[String] = Shrink.shrink { txt =>
  if (txt.isEmpty) Stream.empty[String]
  else {
    // offsetByCodePoints steps over a full surrogate pair, or over exactly one
    // char when the surrogate is unpaired.
    val withoutFirst = txt.substring(txt.offsetByCodePoints(0, 1))
    val withoutLast = txt.substring(0, txt.offsetByCodePoints(txt.length, -1))
    // Both candidates are strictly shorter than txt; they coincide when txt is
    // a single code point, in which case only "" is produced.
    Stream(withoutFirst, withoutLast).distinct
  }
}