cmarkit overzealous item-marker escaping when rendering to commonmark

I'm using cmarkit to generate and format markdown content using the cmarkit_commonmark module. I've noticed that it adds a lot of unnecessary escape characters which can hurt readability.

let%expect_test "Issue: Escape unnecessary because it's immediately followed by non-whitespace" =
  roundtrip {|3.14 is pi|};
  [%expect {| 3\.14 is pi |}]
;;

let%expect_test "Issue: Escape unnecessary because it doesn't start a line." =
  roundtrip {|pi is 3.14|};
  [%expect {| pi is 3\.14 |}]
;;

let%expect_test "Issue: Escape unnecessary because it doesn't start a line." =
  roundtrip {|pi is approximately 3.|};
  [%expect {| pi is approximately 3\. |}]
;;

let%expect_test "GOOD! The escape is necessary to prevent being parsed as an ordered list" =
  roundtrip {|3\. is approximately pi|};
  [%expect {| 3\. is approximately pi |}]
;;

Feb 04 '24 15:02 TyOverby

This patch addresses tests 1 and 2, but does not address test 3. For that, I think that the renderer might need to scan back into the buffer to see if there's any non-whitespace preceeding the nearest newline.

diff --git a/src/cmarkit_commonmark.ml b/src/cmarkit_commonmark.ml
--- a/src/cmarkit_commonmark.ml
+++ b/src/cmarkit_commonmark.ml
@@ -71,11 +71,16 @@ let buffer_add_escaped_text b s =
   let esc_tilde s max prev next =
     not (Char.equal prev '~') && next <= max && s.[next] = '~'
   in
-  let esc_item_marker s i =
-    if i = 0 || i > 9 (* marker has from 1-9 digits *) then false else
-    let k = ref (i - 1) in
-    while !k >= 0 && Cmarkit_base.Ascii.is_digit s.[!k] do decr k done;
-    !k < 0
+  let esc_item_marker s i max =
+    let after_marker_is_whitespace = 
+      i + 1 >= max || Cmarkit_base.Ascii.is_white s.[i + 1]
+    in
+    after_marker_is_whitespace && (
+      if i = 0 || i > 9 (* marker has from 1-9 digits *) then false else
+      let k = ref (i - 1) in
+      while !k >= 0 && Cmarkit_base.Ascii.is_digit s.[!k] do decr k done;
+      !k < 0)
   in
   let flush b max start i =
     if start <= max then Buffer.add_substring b s start (i - start)
@@ -95,7 +100,7 @@ let buffer_add_escaped_text b s =
         flush b max start i; buffer_add_bslash_esc b c; loop b s max next c next
     | '!' when i = max ->
         flush b max start i; buffer_add_bslash_esc b c; loop b s max next c next
-    | '.' | ')' when esc_item_marker s i ->
+    | '.' | ')' when esc_item_marker s i max ->
         flush b max start i; buffer_add_bslash_esc b c; loop b s max next c next
     | '\\' | '<' | '>' | '[' | ']' | '*' | '_' | '$' | '|' ->
         flush b max start i; buffer_add_bslash_esc b c; loop b s max next c next

Feb 04 '24 15:02 TyOverby

Indeed this was recently tweaked in https://github.com/dbuenzli/cmarkit/commit/90fb75ed20a8dae7dd859b41dcd14e368489cc09, a bit carelessly it seems.

However I don't think your patch handles correctly the escape to prevent an ordered empty list item to be created at the beginning of a line.

Feb 04 '24 21:02 dbuenzli

Oh interesting; I didn't know about that parsing behavior! I'll admit I'm not super familiar with the cmarkit codebase yet; do you have an intuition for how best to solve this issue without regressing this case?

Feb 04 '24 21:02 TyOverby

I'd need to put back this in my head.

But that's the third case for list items. However this cannot happen in the middle of a paragraph (scroll one line up for the sentence). So we'd need only to make sure this doesn't happen only at the start of paragraph text. Basically we have one overeager escape here:

printf "1\\.\nWe need an escape here but not here\n1." | cmarkit commonmark
1\.
We need an escape here but not here
1\.

I think there is state in the rendering context that tracks this but the context is not available in this function, so maybe we'd need to trickle down the context there if we wanted to solve that case correctly.

Feb 04 '24 22:02 dbuenzli