jaq -L not supported

https://github.com/wader/jqjq needs it

Oct 10 '24 13:10 Freed-Wu

It would be fun if jaq could run jqjq 😄 some months ago i hacked around the lack of -L support (but it seem to have it now?) but then ran into lack of destructing support. You can test it like this if you want:

$ jaq -n -L . 'include "jqjq"; eval("1+2")'
Error: expected variable
   ╭─[/Users/wader/src/jqjq/jqjq.jq]
   │
49 │         | ([.[2:6], .[8:] | _fromhex]) as [$hi,$lo]
   ┆                                           ────┬────
   ┆                                               │
   ┆                                               ╰───── unexpected token
...

Oct 10 '24 13:10 wader

-L will be supported in jaq 2.0.

And yes, I would love to see jaq being able to run jqjq, but lack of destructuring support is indeed a pretty large road-block. Perhaps in jaq 3.0? :)

Oct 11 '24 08:10 01mf02

-L will be supported in jaq 2.0.

Ah yes i probably tested with current main branch

And yes, I would love to see jaq being able to run jqjq

Would be great and a fun project to help out with

but lack of destructuring support is indeed a pretty large road-block. Perhaps in jaq 3.0? :)

One of my favorite but sadly a bit hidden features of jq! sometimes i also dream about how some kind of pattern matching could work in jq 🤔

Oct 11 '24 09:10 wader

@wader, I was actually able to implement destructuring after all in jaq relatively quickly. :) My current prototype is in the patterns branch, if you wish to try it. I'll try running jqjq with it later myself.

Oct 14 '24 10:10 01mf02

🥳 nice, i gave it a try. with this jqjq changes i can get it to lex but parsing seems to bail out in the precedence climbing code, digging into that.

some notes:

match with ^...and m flag seem to match any line or something in jaq but not jq.
{ not need to be escaped in jq regexp but in jaq
gamma, getpath, setpath and delpaths not available in jaq

diff --git a/jqjq.jq b/jqjq.jq
index bc5e632..64d0fde 100644
--- a/jqjq.jq
+++ b/jqjq.jq
@@ -80,7 +80,7 @@ def lex:
     def _re($re; f):
       ( . as {$remain, $string_stack}
       | $remain
-      | match($re; "m").string
+      | match($re; "").string
       | f as $token
       | { result: ($token | del(.string_stack))
         , remain: $remain[length:]
@@ -153,8 +153,8 @@ def lex:
       // _re("^\\)";    {rparen: ., string_stack: ($string_stack[0:-1])})
       // _re("^\\[";    {lsquare: .})
       // _re("^\\]";    {rsquare: .})
-      // _re("^{";      {lcurly: .})
-      // _re("^}";      {rcurly: .})
+      // _re("^\\{";    {lcurly: .})
+      // _re("^\\}";    {rcurly: .})
       // _re("^\\.\\."; {dotdot: .})
       // _re("^\\.";    {dot: .})
       // _re("^\\?";    {qmark: .})
@@ -228,7 +228,7 @@ def parse:
         else false
         end;
 
-      ( _p("query1") as [$rest, $t]
+      ( debug({prec: .}) | _p("query1") as [$rest, $t]
       | $rest
       | def _f($t):
           ( .[0] as $next # peek next
@@ -272,6 +272,7 @@ def parse:
     def _scalar($type; c; f):
       ( . as [$first]
       | _consume(c)
+      | debug({aaa:.})
       | [ .
         , { term:
               ( $first
@@ -280,6 +281,7 @@ def parse:
               )
           }
         ]
+      | debug({bbb:.})
       );
 
     # {<keyval>...} where keyval is:
@@ -1060,7 +1062,7 @@ def parse:
         ]
       );
 
-    ( .# debug({_p: $type})
+    ( debug({_p: $type})
     | if $type == "query" then
         _op_prec_climb(0; false)
       elif $type == "keyval_query" then
@@ -1112,7 +1114,7 @@ def parse:
           // _p("recurse") # ".."
           ) as [$rest, $term]
         | $rest
-        | _repeat(_p("suffix")) as [$rest, $suffix_list]
+        | _repeat(empty | _p("suffix")) as [$rest, $suffix_list]
         | $rest
         | [ .
           , ( $term
@@ -1516,10 +1518,10 @@ def eval_ast($query; $path; $env; undefined_func):
                 ( a0 as $a0
                 | [[null], has($a0)]
                 )
-              elif $name == "delpaths/1" then
-                ( a0 as $a0
-                | [[null], delpaths($a0)]
-                )
+              # elif $name == "delpaths/1" then
+              #   ( a0 as $a0
+              #   | [[null], delpaths($a0)]
+              #   )
               elif $name == "explode/0"  then [[null], explode]
               elif $name == "implode/0"  then [[null], implode]
               elif $name == "tonumber/0" then [[null], tonumber]
@@ -1536,19 +1538,19 @@ def eval_ast($query; $path; $env; undefined_func):
                 | error($a0)
                 )
               elif $name == "halt_error/1" then [[null], halt_error(a0)]
-              elif $name == "getpath/1" then
-                ( a0 as $a0
-                | [ $path+$a0
-                  , getpath($a0)
-                  ]
-                )
-              elif $name == "setpath/2" then
-                ( a0 as $a0
-                | a1 as $a1
-                | [ []
-                  , setpath($a0; $a1)
-                  ]
-                )
+              # elif $name == "getpath/1" then
+              #   ( a0 as $a0
+              #   | [ $path+$a0
+              #     , getpath($a0)
+              #     ]
+              #   )
+              # elif $name == "setpath/2" then
+              #   ( a0 as $a0
+              #   | a1 as $a1
+              #   | [ []
+              #     , setpath($a0; $a1)
+              #     ]
+              #   )
               elif $name == "path/1" then
                 ( _e($args[0]; []; $query_env) as [$p, $_v]
                 # TODO: try/catch error
@@ -1577,7 +1579,7 @@ def eval_ast($query; $path; $env; undefined_func):
               elif $name == "expm1/0"       then [[null], expm1]
               elif $name == "fabs/0"        then [[null], fabs]
               elif $name == "floor/0"       then [[null], floor]
-              elif $name == "gamma/0"       then [[null], gamma]
+              # elif $name == "gamma/0"       then [[null], gamma]
               elif $name == "j0/0"          then [[null], j0]
               elif $name == "j1/0"          then [[null], j1]
               elif $name == "lgamma/0"      then [[null], lgamma]
@@ -2538,9 +2540,10 @@ def eval($expr; $globals; $builtins_env):
   # TODO: does not work with jq yet because issue with bind patterns
   # $ gojq -cn -L . 'include "jqjq"; {} | {a:1} | eval(".a") += 1'
   # {"a":2}
-  | if $path | . == [] or . == [null] then $value
-    else getpath($path)
-    end
+  # | if $path | . == [] or . == [null] then $value
+  #   else getpath($path)
+  #   end
+  | $value
   );
 def eval($expr):
   eval($expr; {}; _builtins_env);

Oct 14 '24 15:10 wader

@wader wrote:

match with ^...and m flag seem to match any line or something in jaq but not jq.

Just a reminder that jq's actual behavior regarding some of the regex flags is generally not a good guide. Some details are at https://github.com/jqlang/jq/issues/2663

@01mf02 -- I noticed that in your revision of the jq manual, you added a "Compatibility" note regarding jaq and the regex library. I was thinking that if you come across any instances where jaq's handling of regex flags is correct but differs from that of jq 1.7, it would be very helpful to add some details.

Oct 14 '24 19:10 pkoppstein

@pkoppstein true! haven't dugg any deeper what going on, here is a repro of the difference. seem to only happen when there is a \n\n

$ jq 'match("^\\s+"; "m")' <<< '"a\nb\n"'
$ jaq 'match("^\\s+"; "m")' <<< '"a\nb\n"'

$ jq 'match("^\\s+"; "m")' <<< '"a\n\nb\n"'
$ jaq 'match("^\\s+"; "m")' <<< '"a\n\nb\n"'
{
  "offset": 2,
  "length": 1,
  "string": "\n",
  "captures": []
}

Oct 15 '24 08:10 wader

🥳 nice, i gave it a try. with this jqjq changes i can get it to lex but parsing seems to bail out in the precedence climbing code, digging into that.

Great. Thanks for trying this out. Let us know if you need any help.

@pkoppstein true! haven't dugg any deeper what going on, here is a repro of the difference. seem to only happen when there is a \n\n

jaq enables regex's multi_line function when m is given. The behaviour that we see here is that ^ matches the \n (by definition), and because the \n just after it is a space character, the regex matches.

@01mf02 -- I noticed that in your revision of the jq manual, you added a "Compatibility" note regarding jaq and the regex library. I was thinking that if you come across any instances where jaq's handling of regex flags is correct but differs from that of jq 1.7, it would be very helpful to add some details.

I'm not sure whether I'm the right person to do this --- I'm not very fluent at regexes, and even when I come across a discrepancy between jq and jaq, I am not confident to decide whether jq's or jaq's behaviour is right. I also think that this might be a rabbit-hole from which one will have a hard time getting out. Because I'm sure that there are tons of differences between Oniguruma and regex, and I think that the jq manual is not a good point to document these differences. (That being said, there might be some value in documenting the most blatant differences, but it will be hard to determine where to draw the line.)

Oct 15 '24 08:10 01mf02

@01mf02 wrote:

this might be a rabbit-hole

Yes, but I think we could avoid that by focusing only on those cases where someone notices a discrepancy where (a) jaq and/or gojq is correct and (b) either jq 1.7 behavior is definitely wrong, or the jq manual is definitely in need of clarification in light of the correct behavior.

I would certainly be glad to help, and suspect @wader would too :-)

Oct 15 '24 09:10 pkoppstein

@pkoppstein, thanks for offering your help. I will first await the merging of the current state of the manual, before augmenting it with more content. So let us discuss the topic of regex documentation once this is done.

Oct 23 '24 10:10 01mf02

@01mf02 - I'm sorry I probably can't do much to help expedite the acceptance of your excellent work on the manual. Let me know if you think otherwise.

Oct 23 '24 10:10 pkoppstein

Spent some more time on jqjq support and think i found a bug/difference between jaq and jq when it comes to destructing. Managed to minimize it down to this:

$ jq -cn '[{a:123}] | debug | .[0] as {$a} | debug, $a'
["DEBUG:",[{"a":123}]]
["DEBUG:",[{"a":123}]]
[{"a":123}]
123

$ jaq -cn '[{a:123}] | debug | .[0] as {$a} | debug, $a'
["DEBUG:", [{"a":123}]]
["DEBUG:", {"a":123}]
{"a":123}
123

It seems like the as-binding does not passthru the input unchanged in the case of .[<index>] as {...} or [...]

Oct 28 '24 09:10 wader

Ran into this also, i would guess related to same issue:

$ jq -n '{} as $a | $a as {$t} | debug'
["DEBUG:",null]
null

$ jaq -n '{} as $a | $a as {$t} | debug'
["DEBUG:", {}]
{}

Oct 28 '24 11:10 wader

Hi @wader, just as I was setting out to look into jqjq myself right now, I find that you have beaten me to it. :) Thanks for uncovering this bug. I have corrected binding behaviour in 4560c86.

Oct 28 '24 13:10 01mf02

Perhaps it is interesting to mention at this point that jaq's destructuring follows the same rules as its indexing, which diverge from jq by failing to index any null value. That means that [] as [{$x}] | $x yields null in jq, but an error in jaq, because [] | .[0] | .x is a failure too. However, [] as [$x] | $x yields null for both jq and jaq.

Oct 28 '24 13:10 01mf02

Yeap noticed :) have done work workaround for it. I pushed my current progress here https://github.com/wader/jqjq/commits/jaq/ but be aware of lots of hacks :) with this at least "1+2" seem to kind of work but other expressions fails.

Will probably continue later today on it.

Oct 28 '24 13:10 wader

Hi @wader, just as I was setting out to look into jqjq myself right now, I find that you have beaten me to it. :) Thanks for uncovering this bug. I have corrected binding behaviour in 4560c86.

That was quick! 🥳 let me know how it goes, and i will probably continue later today also

Oct 28 '24 13:10 wader

Thanks for mentioning the jaq branch of jqjq. I found that quite a few things already work, using:

jaq -n -L . 'include "jqjq"; eval($FILTER; {}; {})

This evaluates the filter without using any globals or builtins.

For $FILTER, the following work and yield the expected output:

1+2*3 --- yay, precedences
1.4e2
[1, 2]
{a: 1, b: (2, 3)}
def f: 1; f
if true then 0 else 1 end
reduce (1, 2) as $x (0; .+$x)
1 | .+2

What does not work:

., .[], .[0], .a, 1 as $x | $x: These fail with: "cannot use null as iterable (array or object)". For some of these, parsing seems to fail (.), for some others, execution.

Oct 28 '24 14:10 01mf02

Concerning the lexing speed:

jaq -L . -n 'include "jqjq"; _builtins_src | lex'

is indeed slower than jq right now (1.2 seconds in jaq vs. 0.4 seconds in jq). If it is much slower than that for you: Did you compile jaq with cargo build --release?

I just made some small experiment, and the performance problem seems to be related to slow regex execution. So I swapped the underlying regex library in 0840b1b010fd230be0c29011fd12654a75ce9363 to regex-lite, and this brings down the execution time in jaq to 0.4 seconds as well. :)

Oct 28 '24 15:10 01mf02

Thanks for mentioning the jaq branch of jqjq. I found that quite a few things already work, using:
jaq -n -L . 'include "jqjq"; eval($FILTER; {}; {})
This evaluates the filter without using any globals or builtins.

Hmm it seem to get stuck with eval/1, will have to be investigated.

For $FILTER, the following work and yield the expected output:

1+2*3 --- yay, precedences

1.4e2

[1, 2]

{a: 1, b: (2, 3)}

def f: 1; f

if true then 0 else 1 end

reduce (1, 2) as $x (0; .+$x)

1 | .+2

🥳

What does not work:

., .[], .[0], .a, 1 as $x | $x: These fail with: "cannot use null as iterable (array or object)". For some of these, parsing seems to fail (.), for some others, execution.

I guess some null checks etc is needed in some places, i've neem bit sloppy with using null it seems :)

Oct 28 '24 19:10 wader

Concerning the lexing speed:
jaq -L . -n 'include "jqjq"; _builtins_src | lex'
is indeed slower than jq right now (1.2 seconds in jaq vs. 0.4 seconds in jq). If it is much slower than that for you: Did you compile jaq with cargo build --release?

Doh, yes you correct, i build without --release. Now i've updated to latest main and built with release target, much faster 👍

I just made some small experiment, and the performance problem seems to be related to slow regex execution. So I swapped the underlying regex library in 0840b1b to regex-lite, and this brings down the execution time in jaq to 0.4 seconds as well. :)

When i read about regex-list it seem to focus on binary size, compile time and some less features (less unicode support?) over performance? but it was faster in this case also? ... oh they mean compile time as in time compiling a regex not rust build time? :)

BTW for regex heavy filters maybe a regex cache would be an interesting idea? there has been some talks in jq issues about various ways of doing, ex in https://github.com/jqlang/jq/issues/2856, maybe a LRU cache etc? i've also experimented a bit with how /regex/ literals could work in jq, that would be neat and also simplify how to reuse a compiled regex.

Oct 28 '24 19:10 wader

Any idea what is going on with these? https://github.com/wader/jqjq/blob/master/jqjq.jq#L1945-L1946

Without any changes it seem expressions like 1+2 just output 1, but if i add ... | debug or ... | . to both it seem to work. Some tail or return optimisation going wrong? 🤔

Oct 28 '24 20:10 wader

What does not work:

., .[], .[0], .a, 1 as $x | $x: These fail with: "cannot use null as iterable (array or object)". For some of these, parsing seems to fail (.), for some others, execution.

Now all of those work in the jaq branch, and i think all of them were different variants of null index. Current fixes are mostly temp workarounds, should try figure something nicer. I've started to annotate with # TODO: jaq for things that needs a revisit.

Oct 28 '24 21:10 wader

Oh seems eval/1 now works, not sure what fixed it, and also this 🥳

$ jaq -L . 'include "jqjq"; eval("def f: 1,8; [f,f] | map(.+105) | implode")' <<< '{"a":123}'
"jqjq"

Oct 28 '24 21:10 wader

Any plan to add --args, would make it possible to use the jqjq shell wrapper with jaq, ./jqjq --jq jaq .... But beware the semantics of --args, it's a bit confusing. Let me illustrate:

$ jq -n '$ARGS' --args a b c -- d
{
  "positional": [
    "a",
    "b",
    "c",
    "d"
  ],
  "named": {}
}

$ jq -n --args '$ARGS' a b c -- d
{
  "positional": [
    "a",
    "b",
    "c",
    "d"
  ],
  "named": {}
}

So skips -- and the positional are the ones after the first non-option (the filter).

Oct 28 '24 21:10 wader

Found another bug i think, "" | test("") outputs false with jaq, was minified from test("^\\s*$") in the test parse code. I tried to debug the matches function but my rust is not good enough yet. Read the regex crate documentation about empty matches but didn't see that would make it skip them by default.

But with an ugly hack, that is now in the jaq brach, i managed to run the tests! but not sure i would trust them just yet :) i did notice the test result output look a bit wrong so needs some work.

$ jaq -n -L . -rRs 'include "jqjq"; . | jqjq(["", "--run-tests"]; {})' < jqjq.test
<cut>
OK: line 1230: null | eval("eval(\"empty\")") -> null
DIFF: line 1238: null | (.a | eval(".a, .b.c")) = 123 -> {"a":{"a":123,"b":{"c":123}}}
  Expected: {"a":{"a":123,"b":{"c":123}}}
    Actual: 123
OK: line 1244: null | try eval("{a b}") catch "failed" -> "failed"
OK: line 1249: null | try eval(". as {$a $b} | .") catch "failed" -> "failed"
191 of 253 tests passed
null

Oct 29 '24 08:10 wader

Hi @wader, amazing work! Thanks so much for it.

Doh, yes you correct, i build without --release. Now i've updated to latest main and built with release target, much faster 👍

It has happened to (nearly) everyone ... ;)

When i read about regex-list it seem to focus on binary size, compile time and some less features (less unicode support?) over performance? but it was faster in this case also? ... oh they mean compile time as in time compiling a regex not rust build time? :)

I think that the statement in regex-lite refers to Rust build time. Still, I believe that by having a reduced feature set, the time to compile a regular expression is also reduced. This is probably not advertised by regex-lite because they assume that you are usually not compiling thousands of regular expressions. :) Which brings us to your next point ...

BTW for regex heavy filters maybe a regex cache would be an interesting idea? there has been some talks in jq issues about various ways of doing, ex in jqlang/jq#2856, maybe a LRU cache etc? i've also experimented a bit with how /regex/ literals could work in jq, that would be neat and also simplify how to reuse a compiled regex.

I have already thought about this the last few days. And following your comment, I just implemented regex caching with an LRU cache. However, the result is underwhelming: Using jaq -L . -n 'include "jqjq"; _builtins_src | lex' as benchmark, we get 0.35 seconds without caching and 0.30 seconds with caching. I also tried jaq -n 'limit(100000; repeat("--")) | match("^-="; "").string', and here, we get 0.20 seconds without caching and 0.16 seconds with caching.

Given that the performance improvement is not that large, I will probably not integrate this into jaq.

For reference, here is the diff to reproduce the experiment:

diff --git a/jaq-std/Cargo.toml b/jaq-std/Cargo.toml
index b9c3f84..db4d8da 100644
--- a/jaq-std/Cargo.toml
+++ b/jaq-std/Cargo.toml
@@ -29,6 +29,7 @@ libm = { version = "0.2.7", optional = true }
 aho-corasick = { version = "1.0", optional = true }
 base64 = { version = "0.22", optional = true }
 urlencoding = { version = "2.1.3", optional = true }
+lru = "0.12.5"
 
 [dev-dependencies]
 jaq-json = { path = "../jaq-json" }
diff --git a/jaq-std/src/regex.rs b/jaq-std/src/regex.rs
index 3378306..e7a9ee5 100644
--- a/jaq-std/src/regex.rs
+++ b/jaq-std/src/regex.rs
@@ -4,7 +4,7 @@ use alloc::string::{String, ToString};
 use alloc::vec::Vec;
 use regex_lite::{self as regex, Error, Regex, RegexBuilder};
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, PartialEq, Eq, Default, Hash)]
 pub struct Flags {
     // global search
     g: bool,
@@ -65,6 +65,25 @@ impl Flags {
         let mut builder = RegexBuilder::new(re);
         self.impact(&mut builder).build()
     }
+
+    #[cfg(feature = "std")]
+    pub fn regex_cached(self, re: &str) -> Result<Regex, Error> {
+        use core::cell::RefCell;
+        use core::num::NonZeroUsize;
+        use lru::LruCache;
+
+        std::thread_local! {
+            static CACHE: RefCell<LruCache<(String, Flags), Result<Regex, Error>>> =
+                RefCell::new(LruCache::new(NonZeroUsize::new(64).unwrap()));
+        }
+
+        CACHE.with(|cache| {
+            cache
+                .borrow_mut()
+                .get_or_insert((re.to_string(), self), || self.regex(re))
+                .clone()
+        })
+    }
 }
 
 /// Mapping between byte and character indices.

Oct 29 '24 09:10 01mf02

Without any changes it seem expressions like 1+2 just output 1, but if i add ... | debug or ... | . to both it seem to work. Some tail or return optimisation going wrong? 🤔

Oh, oh, that looks indeed like tail recursion gone wrong. I'm looking into it, but so far, I was not able to reproduce it with a smaller example ...

Oct 29 '24 09:10 01mf02

Any plan to add --args, would make it possible to use the jqjq shell wrapper with jaq, ./jqjq --jq jaq ....

That should actually not be that hard to do with trailing_var_arg.

Oct 29 '24 09:10 01mf02

Hi @wader, amazing work! Thanks so much for it.

Thank and same to you! also great that jqjq gets to be used for something useful!

Doh, yes you correct, i build without --release. Now i've updated to latest main and built with release target, much faster 👍

It has happened to (nearly) everyone ... ;)

👯

Given that the performance improvement is not that large, I will probably not integrate this into jaq.

Aha that's a bummer and a bit unintuitive 🤔 but maybe compiling a small regex:s is quite fast. Maybe something to revisit in the future if other things gets more optimised?

Oct 29 '24 10:10 wader