-L not supported
https://github.com/wader/jqjq needs it
It would be fun if jaq could run jqjq 😄 some months ago i hacked around the lack of -L support (but it seem to have it now?) but then ran into lack of destructing support. You can test it like this if you want:
$ jaq -n -L . 'include "jqjq"; eval("1+2")'
Error: expected variable
╭─[/Users/wader/src/jqjq/jqjq.jq]
│
49 │ | ([.[2:6], .[8:] | _fromhex]) as [$hi,$lo]
┆ ────┬────
┆ │
┆ ╰───── unexpected token
...
-L will be supported in jaq 2.0.
And yes, I would love to see jaq being able to run jqjq, but lack of destructuring support is indeed a pretty large road-block. Perhaps in jaq 3.0? :)
-Lwill be supported in jaq 2.0.
Ah yes i probably tested with current main branch
And yes, I would love to see jaq being able to run jqjq
Would be great and a fun project to help out with
but lack of destructuring support is indeed a pretty large road-block. Perhaps in jaq 3.0? :)
One of my favorite but sadly a bit hidden features of jq! sometimes i also dream about how some kind of pattern matching could work in jq 🤔
@wader, I was actually able to implement destructuring after all in jaq relatively quickly. :)
My current prototype is in the patterns branch, if you wish to try it.
I'll try running jqjq with it later myself.
🥳 nice, i gave it a try. with this jqjq changes i can get it to lex but parsing seems to bail out in the precedence climbing code, digging into that.
some notes:
matchwith^...andmflag seem to match any line or something in jaq but not jq.{not need to be escaped in jq regexp but in jaqgamma,getpath,setpathanddelpathsnot available in jaq
diff --git a/jqjq.jq b/jqjq.jq
index bc5e632..64d0fde 100644
--- a/jqjq.jq
+++ b/jqjq.jq
@@ -80,7 +80,7 @@ def lex:
def _re($re; f):
( . as {$remain, $string_stack}
| $remain
- | match($re; "m").string
+ | match($re; "").string
| f as $token
| { result: ($token | del(.string_stack))
, remain: $remain[length:]
@@ -153,8 +153,8 @@ def lex:
// _re("^\\)"; {rparen: ., string_stack: ($string_stack[0:-1])})
// _re("^\\["; {lsquare: .})
// _re("^\\]"; {rsquare: .})
- // _re("^{"; {lcurly: .})
- // _re("^}"; {rcurly: .})
+ // _re("^\\{"; {lcurly: .})
+ // _re("^\\}"; {rcurly: .})
// _re("^\\.\\."; {dotdot: .})
// _re("^\\."; {dot: .})
// _re("^\\?"; {qmark: .})
@@ -228,7 +228,7 @@ def parse:
else false
end;
- ( _p("query1") as [$rest, $t]
+ ( debug({prec: .}) | _p("query1") as [$rest, $t]
| $rest
| def _f($t):
( .[0] as $next # peek next
@@ -272,6 +272,7 @@ def parse:
def _scalar($type; c; f):
( . as [$first]
| _consume(c)
+ | debug({aaa:.})
| [ .
, { term:
( $first
@@ -280,6 +281,7 @@ def parse:
)
}
]
+ | debug({bbb:.})
);
# {<keyval>...} where keyval is:
@@ -1060,7 +1062,7 @@ def parse:
]
);
- ( .# debug({_p: $type})
+ ( debug({_p: $type})
| if $type == "query" then
_op_prec_climb(0; false)
elif $type == "keyval_query" then
@@ -1112,7 +1114,7 @@ def parse:
// _p("recurse") # ".."
) as [$rest, $term]
| $rest
- | _repeat(_p("suffix")) as [$rest, $suffix_list]
+ | _repeat(empty | _p("suffix")) as [$rest, $suffix_list]
| $rest
| [ .
, ( $term
@@ -1516,10 +1518,10 @@ def eval_ast($query; $path; $env; undefined_func):
( a0 as $a0
| [[null], has($a0)]
)
- elif $name == "delpaths/1" then
- ( a0 as $a0
- | [[null], delpaths($a0)]
- )
+ # elif $name == "delpaths/1" then
+ # ( a0 as $a0
+ # | [[null], delpaths($a0)]
+ # )
elif $name == "explode/0" then [[null], explode]
elif $name == "implode/0" then [[null], implode]
elif $name == "tonumber/0" then [[null], tonumber]
@@ -1536,19 +1538,19 @@ def eval_ast($query; $path; $env; undefined_func):
| error($a0)
)
elif $name == "halt_error/1" then [[null], halt_error(a0)]
- elif $name == "getpath/1" then
- ( a0 as $a0
- | [ $path+$a0
- , getpath($a0)
- ]
- )
- elif $name == "setpath/2" then
- ( a0 as $a0
- | a1 as $a1
- | [ []
- , setpath($a0; $a1)
- ]
- )
+ # elif $name == "getpath/1" then
+ # ( a0 as $a0
+ # | [ $path+$a0
+ # , getpath($a0)
+ # ]
+ # )
+ # elif $name == "setpath/2" then
+ # ( a0 as $a0
+ # | a1 as $a1
+ # | [ []
+ # , setpath($a0; $a1)
+ # ]
+ # )
elif $name == "path/1" then
( _e($args[0]; []; $query_env) as [$p, $_v]
# TODO: try/catch error
@@ -1577,7 +1579,7 @@ def eval_ast($query; $path; $env; undefined_func):
elif $name == "expm1/0" then [[null], expm1]
elif $name == "fabs/0" then [[null], fabs]
elif $name == "floor/0" then [[null], floor]
- elif $name == "gamma/0" then [[null], gamma]
+ # elif $name == "gamma/0" then [[null], gamma]
elif $name == "j0/0" then [[null], j0]
elif $name == "j1/0" then [[null], j1]
elif $name == "lgamma/0" then [[null], lgamma]
@@ -2538,9 +2540,10 @@ def eval($expr; $globals; $builtins_env):
# TODO: does not work with jq yet because issue with bind patterns
# $ gojq -cn -L . 'include "jqjq"; {} | {a:1} | eval(".a") += 1'
# {"a":2}
- | if $path | . == [] or . == [null] then $value
- else getpath($path)
- end
+ # | if $path | . == [] or . == [null] then $value
+ # else getpath($path)
+ # end
+ | $value
);
def eval($expr):
eval($expr; {}; _builtins_env);
@wader wrote:
match with ^...and m flag seem to match any line or something in jaq but not jq.
Just a reminder that jq's actual behavior regarding some of the regex flags is generally not a good guide. Some details are at https://github.com/jqlang/jq/issues/2663
@01mf02 -- I noticed that in your revision of the jq manual, you added a "Compatibility" note regarding jaq and the regex library. I was thinking that if you come across any instances where jaq's handling of regex flags is correct but differs from that of jq 1.7, it would be very helpful to add some details.
@pkoppstein true! haven't dugg any deeper what going on, here is a repro of the difference. seem to only happen when there is a \n\n
$ jq 'match("^\\s+"; "m")' <<< '"a\nb\n"'
$ jaq 'match("^\\s+"; "m")' <<< '"a\nb\n"'
$ jq 'match("^\\s+"; "m")' <<< '"a\n\nb\n"'
$ jaq 'match("^\\s+"; "m")' <<< '"a\n\nb\n"'
{
"offset": 2,
"length": 1,
"string": "\n",
"captures": []
}
🥳 nice, i gave it a try. with this jqjq changes i can get it to lex but parsing seems to bail out in the precedence climbing code, digging into that.
Great. Thanks for trying this out. Let us know if you need any help.
@pkoppstein true! haven't dugg any deeper what going on, here is a repro of the difference. seem to only happen when there is a
\n\n
jaq enables regex's multi_line function when m is given. The behaviour that we see here is that ^ matches the \n (by definition), and because the \n just after it is a space character, the regex matches.
@01mf02 -- I noticed that in your revision of the jq manual, you added a "Compatibility" note regarding jaq and the regex library. I was thinking that if you come across any instances where jaq's handling of regex flags is correct but differs from that of jq 1.7, it would be very helpful to add some details.
I'm not sure whether I'm the right person to do this --- I'm not very fluent at regexes, and even when I come across a discrepancy between jq and jaq, I am not confident to decide whether jq's or jaq's behaviour is right. I also think that this might be a rabbit-hole from which one will have a hard time getting out. Because I'm sure that there are tons of differences between Oniguruma and regex, and I think that the jq manual is not a good point to document these differences. (That being said, there might be some value in documenting the most blatant differences, but it will be hard to determine where to draw the line.)
@01mf02 wrote:
this might be a rabbit-hole
Yes, but I think we could avoid that by focusing only on those cases where someone notices a discrepancy where (a) jaq and/or gojq is correct and (b) either jq 1.7 behavior is definitely wrong, or the jq manual is definitely in need of clarification in light of the correct behavior.
I would certainly be glad to help, and suspect @wader would too :-)
@pkoppstein, thanks for offering your help. I will first await the merging of the current state of the manual, before augmenting it with more content. So let us discuss the topic of regex documentation once this is done.
@01mf02 - I'm sorry I probably can't do much to help expedite the acceptance of your excellent work on the manual. Let me know if you think otherwise.
Spent some more time on jqjq support and think i found a bug/difference between jaq and jq when it comes to destructing. Managed to minimize it down to this:
$ jq -cn '[{a:123}] | debug | .[0] as {$a} | debug, $a'
["DEBUG:",[{"a":123}]]
["DEBUG:",[{"a":123}]]
[{"a":123}]
123
$ jaq -cn '[{a:123}] | debug | .[0] as {$a} | debug, $a'
["DEBUG:", [{"a":123}]]
["DEBUG:", {"a":123}]
{"a":123}
123
It seems like the as-binding does not passthru the input unchanged in the case of .[<index>] as {...} or [...]
Ran into this also, i would guess related to same issue:
$ jq -n '{} as $a | $a as {$t} | debug'
["DEBUG:",null]
null
$ jaq -n '{} as $a | $a as {$t} | debug'
["DEBUG:", {}]
{}
Hi @wader, just as I was setting out to look into jqjq myself right now, I find that you have beaten me to it. :) Thanks for uncovering this bug. I have corrected binding behaviour in 4560c86.
Perhaps it is interesting to mention at this point that jaq's destructuring follows the same rules as its indexing, which diverge from jq by failing to index any null value. That means that [] as [{$x}] | $x yields null in jq, but an error in jaq, because [] | .[0] | .x is a failure too. However, [] as [$x] | $x yields null for both jq and jaq.
Yeap noticed :) have done work workaround for it. I pushed my current progress here https://github.com/wader/jqjq/commits/jaq/ but be aware of lots of hacks :) with this at least "1+2" seem to kind of work but other expressions fails.
Will probably continue later today on it.
Hi @wader, just as I was setting out to look into jqjq myself right now, I find that you have beaten me to it. :) Thanks for uncovering this bug. I have corrected binding behaviour in 4560c86.
That was quick! 🥳 let me know how it goes, and i will probably continue later today also
Thanks for mentioning the jaq branch of jqjq. I found that quite a few things already work, using:
jaq -n -L . 'include "jqjq"; eval($FILTER; {}; {})
This evaluates the filter without using any globals or builtins.
For $FILTER, the following work and yield the expected output:
1+2*3--- yay, precedences1.4e2[1, 2]{a: 1, b: (2, 3)}def f: 1; fif true then 0 else 1 endreduce (1, 2) as $x (0; .+$x)1 | .+2
What does not work:
., .[], .[0], .a, 1 as $x | $x: These fail with: "cannot use null as iterable (array or object)".
For some of these, parsing seems to fail (.), for some others, execution.
Concerning the lexing speed:
jaq -L . -n 'include "jqjq"; _builtins_src | lex'
is indeed slower than jq right now (1.2 seconds in jaq vs. 0.4 seconds in jq).
If it is much slower than that for you: Did you compile jaq with cargo build --release?
I just made some small experiment, and the performance problem seems to be related to slow regex execution. So I swapped the underlying regex library in 0840b1b010fd230be0c29011fd12654a75ce9363 to regex-lite, and this brings down the execution time in jaq to 0.4 seconds as well. :)
Thanks for mentioning the jaq branch of jqjq. I found that quite a few things already work, using:
jaq -n -L . 'include "jqjq"; eval($FILTER; {}; {})This evaluates the filter without using any globals or builtins.
Hmm it seem to get stuck with eval/1, will have to be investigated.
For $FILTER, the following work and yield the expected output:
1+2*3--- yay, precedences1.4e2[1, 2]{a: 1, b: (2, 3)}def f: 1; fif true then 0 else 1 endreduce (1, 2) as $x (0; .+$x)1 | .+2
🥳
What does not work:
.,.[],.[0],.a,1 as $x | $x: These fail with: "cannot use null as iterable (array or object)". For some of these, parsing seems to fail (.), for some others, execution.
I guess some null checks etc is needed in some places, i've neem bit sloppy with using null it seems :)
Concerning the lexing speed:
jaq -L . -n 'include "jqjq"; _builtins_src | lex'is indeed slower than jq right now (1.2 seconds in jaq vs. 0.4 seconds in jq). If it is much slower than that for you: Did you compile jaq with
cargo build --release?
Doh, yes you correct, i build without --release. Now i've updated to latest main and built with release target, much faster 👍
I just made some small experiment, and the performance problem seems to be related to slow regex execution. So I swapped the underlying regex library in 0840b1b to
regex-lite, and this brings down the execution time in jaq to 0.4 seconds as well. :)
When i read about regex-list it seem to focus on binary size, compile time and some less features (less unicode support?) over performance? but it was faster in this case also? ... oh they mean compile time as in time compiling a regex not rust build time? :)
BTW for regex heavy filters maybe a regex cache would be an interesting idea? there has been some talks in jq issues about various ways of doing, ex in https://github.com/jqlang/jq/issues/2856, maybe a LRU cache etc? i've also experimented a bit with how /regex/ literals could work in jq, that would be neat and also simplify how to reuse a compiled regex.
Any idea what is going on with these? https://github.com/wader/jqjq/blob/master/jqjq.jq#L1945-L1946
Without any changes it seem expressions like 1+2 just output 1, but if i add ... | debug or ... | . to both it seem to work. Some tail or return optimisation going wrong? 🤔
What does not work:
.,.[],.[0],.a,1 as $x | $x: These fail with: "cannot use null as iterable (array or object)". For some of these, parsing seems to fail (.), for some others, execution.
Now all of those work in the jaq branch, and i think all of them were different variants of null index. Current fixes are mostly temp workarounds, should try figure something nicer. I've started to annotate with # TODO: jaq for things that needs a revisit.
Oh seems eval/1 now works, not sure what fixed it, and also this 🥳
$ jaq -L . 'include "jqjq"; eval("def f: 1,8; [f,f] | map(.+105) | implode")' <<< '{"a":123}'
"jqjq"
Any plan to add --args, would make it possible to use the jqjq shell wrapper with jaq, ./jqjq --jq jaq .... But beware the semantics of --args, it's a bit confusing. Let me illustrate:
$ jq -n '$ARGS' --args a b c -- d
{
"positional": [
"a",
"b",
"c",
"d"
],
"named": {}
}
$ jq -n --args '$ARGS' a b c -- d
{
"positional": [
"a",
"b",
"c",
"d"
],
"named": {}
}
So skips -- and the positional are the ones after the first non-option (the filter).
Found another bug i think, "" | test("") outputs false with jaq, was minified from test("^\\s*$") in the test parse code. I tried to debug the matches function but my rust is not good enough yet. Read the regex crate documentation about empty matches but didn't see that would make it skip them by default.
But with an ugly hack, that is now in the jaq brach, i managed to run the tests! but not sure i would trust them just yet :) i did notice the test result output look a bit wrong so needs some work.
$ jaq -n -L . -rRs 'include "jqjq"; . | jqjq(["", "--run-tests"]; {})' < jqjq.test
<cut>
OK: line 1230: null | eval("eval(\"empty\")") -> null
DIFF: line 1238: null | (.a | eval(".a, .b.c")) = 123 -> {"a":{"a":123,"b":{"c":123}}}
Expected: {"a":{"a":123,"b":{"c":123}}}
Actual: 123
OK: line 1244: null | try eval("{a b}") catch "failed" -> "failed"
OK: line 1249: null | try eval(". as {$a $b} | .") catch "failed" -> "failed"
191 of 253 tests passed
null
Hi @wader, amazing work! Thanks so much for it.
Doh, yes you correct, i build without
--release. Now i've updated to latest main and built with release target, much faster 👍
It has happened to (nearly) everyone ... ;)
When i read about regex-list it seem to focus on binary size, compile time and some less features (less unicode support?) over performance? but it was faster in this case also? ... oh they mean compile time as in time compiling a regex not rust build time? :)
I think that the statement in regex-lite refers to Rust build time. Still, I believe that by having a reduced feature set, the time to compile a regular expression is also reduced. This is probably not advertised by regex-lite because they assume that you are usually not compiling thousands of regular expressions. :) Which brings us to your next point ...
BTW for regex heavy filters maybe a regex cache would be an interesting idea? there has been some talks in jq issues about various ways of doing, ex in jqlang/jq#2856, maybe a LRU cache etc? i've also experimented a bit with how
/regex/literals could work in jq, that would be neat and also simplify how to reuse a compiled regex.
I have already thought about this the last few days. And following your comment, I just implemented regex caching with an LRU cache. However, the result is underwhelming: Using jaq -L . -n 'include "jqjq"; _builtins_src | lex' as benchmark, we get 0.35 seconds without caching and 0.30 seconds with caching.
I also tried jaq -n 'limit(100000; repeat("--")) | match("^-="; "").string', and here, we get 0.20 seconds without caching and 0.16 seconds with caching.
Given that the performance improvement is not that large, I will probably not integrate this into jaq.
For reference, here is the diff to reproduce the experiment:
diff --git a/jaq-std/Cargo.toml b/jaq-std/Cargo.toml
index b9c3f84..db4d8da 100644
--- a/jaq-std/Cargo.toml
+++ b/jaq-std/Cargo.toml
@@ -29,6 +29,7 @@ libm = { version = "0.2.7", optional = true }
aho-corasick = { version = "1.0", optional = true }
base64 = { version = "0.22", optional = true }
urlencoding = { version = "2.1.3", optional = true }
+lru = "0.12.5"
[dev-dependencies]
jaq-json = { path = "../jaq-json" }
diff --git a/jaq-std/src/regex.rs b/jaq-std/src/regex.rs
index 3378306..e7a9ee5 100644
--- a/jaq-std/src/regex.rs
+++ b/jaq-std/src/regex.rs
@@ -4,7 +4,7 @@ use alloc::string::{String, ToString};
use alloc::vec::Vec;
use regex_lite::{self as regex, Error, Regex, RegexBuilder};
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, PartialEq, Eq, Default, Hash)]
pub struct Flags {
// global search
g: bool,
@@ -65,6 +65,25 @@ impl Flags {
let mut builder = RegexBuilder::new(re);
self.impact(&mut builder).build()
}
+
+ #[cfg(feature = "std")]
+ pub fn regex_cached(self, re: &str) -> Result<Regex, Error> {
+ use core::cell::RefCell;
+ use core::num::NonZeroUsize;
+ use lru::LruCache;
+
+ std::thread_local! {
+ static CACHE: RefCell<LruCache<(String, Flags), Result<Regex, Error>>> =
+ RefCell::new(LruCache::new(NonZeroUsize::new(64).unwrap()));
+ }
+
+ CACHE.with(|cache| {
+ cache
+ .borrow_mut()
+ .get_or_insert((re.to_string(), self), || self.regex(re))
+ .clone()
+ })
+ }
}
/// Mapping between byte and character indices.
Without any changes it seem expressions like
1+2just output1, but if i add... | debugor... | .to both it seem to work. Some tail or return optimisation going wrong? 🤔
Oh, oh, that looks indeed like tail recursion gone wrong. I'm looking into it, but so far, I was not able to reproduce it with a smaller example ...
Any plan to add
--args, would make it possible to use the jqjq shell wrapper with jaq,./jqjq --jq jaq ....
That should actually not be that hard to do with trailing_var_arg.
Hi @wader, amazing work! Thanks so much for it.
Thank and same to you! also great that jqjq gets to be used for something useful!
Doh, yes you correct, i build without
--release. Now i've updated to latest main and built with release target, much faster 👍It has happened to (nearly) everyone ... ;)
👯
Given that the performance improvement is not that large, I will probably not integrate this into jaq.
Aha that's a bummer and a bit unintuitive 🤔 but maybe compiling a small regex:s is quite fast. Maybe something to revisit in the future if other things gets more optimised?