elixir
elixir copied to clipboard
fix: check for duplicates in keyword validate
closes #14584
I've run benchmarks with Benchee, results below. Unfortunately, it seems that as the lists grow bigger, the execution time difference becomes very significative. The test cases are very artificial, but the impact is there.
in the benchmark, Keyword.validate is the version included in this PR, while the other 2 versions are in the module below:
defmodule BenchmarkKeyword do
defmodule FrequencyMap do
def validate(keyword, values) when is_list(keyword) and is_list(values) do
case dedup_values(values) do
:ok ->
validate(keyword, values, [], [], [])
{:error, bad_keys} ->
{:error, bad_keys}
end
end
defp dedup_values(values, acc \\ %{})
defp dedup_values([], _acc) do
:ok
end
defp dedup_values([{key, _} | rest], acc) when is_atom(key) and is_map_key(acc, key) do
{:error, [key]}
end
defp dedup_values([key | rest], acc) when is_atom(key) and is_map_key(acc, key) do
{:error, [key]}
end
defp dedup_values([h | rest], acc) do
dedup_values(rest, Map.put(acc, h, 1))
end
defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys)
when is_atom(key) do
case find_key!(key, values1, values2) do
{values1, values2} ->
validate(keyword, values1, values2, [pair | acc], bad_keys)
:error ->
case find_key!(key, values2, values1) do
{values1, values2} ->
validate(keyword, values1, values2, [pair | acc], bad_keys)
:error ->
validate(keyword, values1, values2, acc, [key | bad_keys])
end
end
end
defp validate([], values1, values2, acc, []) do
with {:ok, acc} <- move_pairs!(acc, {[], %{}}),
{:ok, acc} <- move_pairs!(values2, acc),
{:ok, {list, _freq_map}} <- move_pairs!(values1, acc) do
{:ok, list}
end
end
defp validate([], _values1, _values2, _acc, bad_keys) do
{:error, bad_keys}
end
defp validate([pair | _], _values1, _values2, _acc, []) do
raise ArgumentError,
"expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
end
defp find_key!(key, [key | rest], acc), do: {rest, acc}
defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
defp find_key!(_key, [], _acc), do: :error
defp move_pairs!([key | rest], acc) when is_atom(key),
do: move_pairs!(rest, acc)
defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
do: move_pairs!(rest, [pair | acc])
defp move_pairs!([], acc), do: {:ok, acc}
defp move_pairs!([other | _], _) do
raise ArgumentError,
"expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
end
# defp move_pairs!([key | rest], acc) when is_atom(key),
# do: move_pairs!(rest, acc)
# defp move_pairs!([{key, _} | _], {_acc, frequency_map})
# when is_atom(key) and is_map_key(frequency_map, key) do
# {:error, [key]}
# end
# defp move_pairs!([{key, _} = pair | rest], {acc, frequency_map}) when is_atom(key),
# do: move_pairs!(rest, {[pair | acc], Map.put(frequency_map, key, 1)})
# defp move_pairs!([], acc), do: {:ok, acc}
# defp move_pairs!([other | _], _) do
# raise ArgumentError,
# "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
# end
end
def run do
Benchee.run(
%{
"validate with dedup (freq map)" => &apply(__MODULE__.FrequencyMap, :validate, &1),
"validate without dedup" => &apply(__MODULE__, :old_validate, &1),
"validate with dedup" => &apply(Keyword, :validate, &1)
},
inputs: %{
"10 duplicate start" => build_lists(10, :start),
# "10 duplicate end" => build_lists(10, :end),
"100 duplicate start" => build_lists(100, :start),
# "100 duplicate end" => build_lists(100, :end),
"10000 duplicate start" => build_lists(10000, :start)
# "10000 duplicate end" => build_lists(10000, :end)
}
)
:ok
end
defp build_lists(len, duplicate) do
list =
for idx <- 1..len do
key = idx |> to_string() |> String.pad_leading(5, "0") |> String.to_atom()
{key, idx}
end
l2 =
case duplicate do
:end ->
list ++ [List.last(list)]
:start ->
[hd(list)] ++ list
end
[list, l2]
end
def old_validate(keyword, values) when is_list(keyword) and is_list(values) do
validate(keyword, values, [], [], [])
end
defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys) when is_atom(key) do
case find_key!(key, values1, values2) do
{values1, values2} ->
validate(keyword, values1, values2, [pair | acc], bad_keys)
:error ->
case find_key!(key, values2, values1) do
{values1, values2} ->
validate(keyword, values1, values2, [pair | acc], bad_keys)
:error ->
validate(keyword, values1, values2, acc, [key | bad_keys])
end
end
end
defp validate([], values1, values2, acc, []) do
list = move_pairs!(values1, move_pairs!(values2, acc))
{:ok, list}
end
defp validate([], _values1, _values2, _acc, bad_keys) do
{:error, bad_keys}
end
defp validate([pair | _], _values1, _values2, _acc, []) do
raise ArgumentError,
"expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
end
defp find_key!(key, [key | rest], acc), do: {rest, acc}
defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
defp find_key!(_key, [], _acc), do: :error
defp move_pairs!([key | rest], acc) when is_atom(key),
do: move_pairs!(rest, acc)
defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
do: move_pairs!(rest, [pair | acc])
defp move_pairs!([], acc),
do: acc
defp move_pairs!([other | _], _) do
raise ArgumentError,
"expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
end
end
BenchmarkKeyword.run
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.20.0-dev
Erlang 27.2.1
JIT enabled: true
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: 10 duplicate start, 100 duplicate start, 10000 duplicate start
Estimated total run time: 1 min 3 s
Benchmarking validate with dedup with input 10 duplicate start ...
Benchmarking validate with dedup with input 100 duplicate start ...
Benchmarking validate with dedup with input 10000 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10 duplicate start ...
Benchmarking validate with dedup (freq map) with input 100 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10000 duplicate start ...
Benchmarking validate without dedup with input 10 duplicate start ...
Benchmarking validate without dedup with input 100 duplicate start ...
Benchmarking validate without dedup with input 10000 duplicate start ...
Calculating statistics...
Formatting results...
##### With input 10 duplicate start #####
Name ips average deviation median 99th %
validate without dedup 6.44 M 155.26 ns ±23763.10% 83 ns 208 ns
validate with dedup 1.53 M 654.71 ns ±2965.51% 583 ns 750 ns
validate with dedup (freq map) 0.89 M 1127.21 ns ±1322.90% 1042 ns 1250 ns
Comparison:
validate without dedup 6.44 M
validate with dedup 1.53 M - 4.22x slower +499.45 ns
validate with dedup (freq map) 0.89 M - 7.26x slower +971.95 ns
##### With input 100 duplicate start #####
Name ips average deviation median 99th %
validate without dedup 1795.30 K 0.56 μs ±4502.22% 0.46 μs 0.67 μs
validate with dedup 167.53 K 5.97 μs ±270.78% 5.67 μs 12.38 μs
validate with dedup (freq map) 69.11 K 14.47 μs ±21.05% 13.88 μs 24.97 μs
Comparison:
validate without dedup 1795.30 K
validate with dedup 167.53 K - 10.72x slower +5.41 μs
validate with dedup (freq map) 69.11 K - 25.98x slower +13.91 μs
##### With input 10000 duplicate start #####
Name ips average deviation median 99th %
validate without dedup 15.27 K 65.51 μs ±29.19% 67.67 μs 97.05 μs
validate with dedup 1.02 K 984.61 μs ±12.41% 947.92 μs 1469.24 μs
validate with dedup (freq map) 0.48 K 2089.97 μs ±8.77% 2044.79 μs 2709.85 μs
Comparison:
validate without dedup 15.27 K
validate with dedup 1.02 K - 15.03x slower +919.10 μs
validate with dedup (freq map) 0.48 K - 31.90x slower +2024.46 μs
:ok
The benchmark has evaluation warnings. Can you make sure to move all benchmarking code to a module and then call it instead? Thanks!
Here is the fastest version I could come up with:
defmodule New do
def validate(keyword, values) when is_list(keyword) and is_list(values) do
validate(keyword, values, [], [], [])
end
defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys)
when is_atom(key) do
case find_key!(key, values1, values2) do
{new_values1, new_values2} ->
find_dup!(key, values2)
find_dup!(key, new_values1)
validate(keyword, new_values1, new_values2, [pair | acc], bad_keys)
:error ->
case find_key!(key, values2, values1) do
{values1, values2} ->
find_dup!(key, values1)
validate(keyword, values1, values2, [pair | acc], bad_keys)
:error ->
validate(keyword, values1, values2, acc, [key | bad_keys])
end
end
end
defp validate([], values1, values2, acc, []) do
list = move_pairs!(values1, move_pairs!(values2, acc))
{:ok, list}
end
defp validate([], _values1, _values2, _acc, bad_keys) do
{:error, bad_keys}
end
defp validate([pair | _], _values1, _values2, _acc, []) do
raise ArgumentError,
"expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
end
defp find_key!(key, [key | rest], acc), do: {rest, acc}
defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
defp find_key!(_key, [], _acc), do: :error
defp find_dup!(key, [key | _]), do: raise("omg")
defp find_dup!(key, [{key, _} | _]), do: raise("omg")
defp find_dup!(key, [_ | tail]), do: find_dup!(key, tail)
defp find_dup!(_key, []), do: :ok
defp move_pairs!([key | rest], acc) when is_atom(key) do
find_dup!(key, rest)
move_pairs!(rest, acc)
end
defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key) do
find_dup!(key, rest)
move_pairs!(rest, [pair | acc])
end
defp move_pairs!([], acc),
do: acc
defp move_pairs!([other | _], _) do
raise ArgumentError,
"expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
end
end
It is 50% slower here for 10 entries but the issue is that it scales linearly (due to the nature of the problem).
@polvalente any parting thoughts? Should we instead improve the docs?
@josevalim how do you feel about us adding an extra opts argument and adding the validation as an opt-in?
Then, even if it's %-wise slow, for small lists that's still fast enough
Honestly, unsure. Nobody will use it if it is opt-in but it is not fast enough to be opt-out.
(Sorry just thinking aloud here)
Since the right argument is typically a literal known at compile-time, this feels like a good candidate for static analysis (a credo check, or maybe compiler warning but this feels tricky to do "without cheating").
This could be even be broader than this particular use case, since while there might be some legit cases for keywords/proplists with duplicated keys, these feel more the exception than the rule.
Arguably most calls with literals opts that have duplicated keys are accidental and could be caught with a similar check.
The legit exceptions where duplicated keys are actually intended will make this tricky to implement though, maybe a list of functions to include/exclude in a credo check config?