elixir icon indicating copy to clipboard operation
elixir copied to clipboard

fix: check for duplicates in keyword validate

Open polvalente opened this issue 5 months ago • 2 comments

closes #14584

I've run benchmarks with Benchee, results below. Unfortunately, it seems that as the lists grow bigger, the execution time difference becomes very significative. The test cases are very artificial, but the impact is there.

in the benchmark, Keyword.validate is the version included in this PR, while the other 2 versions are in the module below:

defmodule BenchmarkKeyword do
  defmodule FrequencyMap do
    def validate(keyword, values) when is_list(keyword) and is_list(values) do
      case dedup_values(values) do
        :ok ->
          validate(keyword, values, [], [], [])

        {:error, bad_keys} ->
          {:error, bad_keys}
      end
    end

    defp dedup_values(values, acc \\ %{})

    defp dedup_values([], _acc) do
      :ok
    end

    defp dedup_values([{key, _} | rest], acc) when is_atom(key) and is_map_key(acc, key) do
      {:error, [key]}
    end

    defp dedup_values([key | rest], acc) when is_atom(key) and is_map_key(acc, key) do
      {:error, [key]}
    end

    defp dedup_values([h | rest], acc) do
      dedup_values(rest, Map.put(acc, h, 1))
    end

    defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys)
         when is_atom(key) do
      case find_key!(key, values1, values2) do
        {values1, values2} ->
          validate(keyword, values1, values2, [pair | acc], bad_keys)

        :error ->
          case find_key!(key, values2, values1) do
            {values1, values2} ->
              validate(keyword, values1, values2, [pair | acc], bad_keys)

            :error ->
              validate(keyword, values1, values2, acc, [key | bad_keys])
          end
      end
    end

    defp validate([], values1, values2, acc, []) do
      with {:ok, acc} <- move_pairs!(acc, {[], %{}}),
           {:ok, acc} <- move_pairs!(values2, acc),
           {:ok, {list, _freq_map}} <- move_pairs!(values1, acc) do
        {:ok, list}
      end
    end

    defp validate([], _values1, _values2, _acc, bad_keys) do
      {:error, bad_keys}
    end

    defp validate([pair | _], _values1, _values2, _acc, []) do
      raise ArgumentError,
            "expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
    end

    defp find_key!(key, [key | rest], acc), do: {rest, acc}
    defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
    defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
    defp find_key!(_key, [], _acc), do: :error

    defp move_pairs!([key | rest], acc) when is_atom(key),
      do: move_pairs!(rest, acc)

    defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
      do: move_pairs!(rest, [pair | acc])

    defp move_pairs!([], acc), do: {:ok, acc}

    defp move_pairs!([other | _], _) do
      raise ArgumentError,
            "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
    end

    # defp move_pairs!([key | rest], acc) when is_atom(key),
    #   do: move_pairs!(rest, acc)

    # defp move_pairs!([{key, _} | _], {_acc, frequency_map})
    #      when is_atom(key) and is_map_key(frequency_map, key) do
    #   {:error, [key]}
    # end

    # defp move_pairs!([{key, _} = pair | rest], {acc, frequency_map}) when is_atom(key),
    #   do: move_pairs!(rest, {[pair | acc], Map.put(frequency_map, key, 1)})

    # defp move_pairs!([], acc), do: {:ok, acc}

    # defp move_pairs!([other | _], _) do
    #   raise ArgumentError,
    #         "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
    # end
  end

  def run do
    Benchee.run(
      %{
        "validate with dedup (freq map)" => &apply(__MODULE__.FrequencyMap, :validate, &1),
        "validate without dedup" => &apply(__MODULE__, :old_validate, &1),
        "validate with dedup" => &apply(Keyword, :validate, &1)
      },
      inputs: %{
        "10 duplicate start" => build_lists(10, :start),
        # "10 duplicate end" => build_lists(10, :end),
        "100 duplicate start" => build_lists(100, :start),
        # "100 duplicate end" => build_lists(100, :end),
        "10000 duplicate start" => build_lists(10000, :start)
        # "10000 duplicate end" => build_lists(10000, :end)
      }
    )

    :ok
  end

  defp build_lists(len, duplicate) do
    list =
      for idx <- 1..len do
        key = idx |> to_string() |> String.pad_leading(5, "0") |> String.to_atom()
        {key, idx}
      end

    l2 =
      case duplicate do
        :end ->
          list ++ [List.last(list)]

        :start ->
          [hd(list)] ++ list
      end

    [list, l2]
  end

  def old_validate(keyword, values) when is_list(keyword) and is_list(values) do
    validate(keyword, values, [], [], [])
  end

  defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys) when is_atom(key) do
    case find_key!(key, values1, values2) do
      {values1, values2} ->
        validate(keyword, values1, values2, [pair | acc], bad_keys)

      :error ->
        case find_key!(key, values2, values1) do
          {values1, values2} ->
            validate(keyword, values1, values2, [pair | acc], bad_keys)

          :error ->
            validate(keyword, values1, values2, acc, [key | bad_keys])
        end
    end
  end

  defp validate([], values1, values2, acc, []) do
    list = move_pairs!(values1, move_pairs!(values2, acc))
    {:ok, list}
  end

  defp validate([], _values1, _values2, _acc, bad_keys) do
    {:error, bad_keys}
  end

  defp validate([pair | _], _values1, _values2, _acc, []) do
    raise ArgumentError,
          "expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
  end

  defp find_key!(key, [key | rest], acc), do: {rest, acc}
  defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
  defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
  defp find_key!(_key, [], _acc), do: :error

  defp move_pairs!([key | rest], acc) when is_atom(key),
    do: move_pairs!(rest, acc)

  defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key),
    do: move_pairs!(rest, [pair | acc])

  defp move_pairs!([], acc),
    do: acc

  defp move_pairs!([other | _], _) do
    raise ArgumentError,
          "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
  end
end
 BenchmarkKeyword.run
Operating System: macOS
CPU Information: Apple M1 Pro
Number of Available Cores: 10
Available memory: 32 GB
Elixir 1.20.0-dev
Erlang 27.2.1
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: 10 duplicate start, 100 duplicate start, 10000 duplicate start
Estimated total run time: 1 min 3 s

Benchmarking validate with dedup with input 10 duplicate start ...
Benchmarking validate with dedup with input 100 duplicate start ...
Benchmarking validate with dedup with input 10000 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10 duplicate start ...
Benchmarking validate with dedup (freq map) with input 100 duplicate start ...
Benchmarking validate with dedup (freq map) with input 10000 duplicate start ...
Benchmarking validate without dedup with input 10 duplicate start ...
Benchmarking validate without dedup with input 100 duplicate start ...
Benchmarking validate without dedup with input 10000 duplicate start ...
Calculating statistics...
Formatting results...

##### With input 10 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup                6.44 M      155.26 ns ±23763.10%          83 ns         208 ns
validate with dedup                   1.53 M      654.71 ns  ±2965.51%         583 ns         750 ns
validate with dedup (freq map)        0.89 M     1127.21 ns  ±1322.90%        1042 ns        1250 ns

Comparison: 
validate without dedup                6.44 M
validate with dedup                   1.53 M - 4.22x slower +499.45 ns
validate with dedup (freq map)        0.89 M - 7.26x slower +971.95 ns

##### With input 100 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup             1795.30 K        0.56 μs  ±4502.22%        0.46 μs        0.67 μs
validate with dedup                 167.53 K        5.97 μs   ±270.78%        5.67 μs       12.38 μs
validate with dedup (freq map)       69.11 K       14.47 μs    ±21.05%       13.88 μs       24.97 μs

Comparison: 
validate without dedup             1795.30 K
validate with dedup                 167.53 K - 10.72x slower +5.41 μs
validate with dedup (freq map)       69.11 K - 25.98x slower +13.91 μs

##### With input 10000 duplicate start #####
Name                                     ips        average  deviation         median         99th %
validate without dedup               15.27 K       65.51 μs    ±29.19%       67.67 μs       97.05 μs
validate with dedup                   1.02 K      984.61 μs    ±12.41%      947.92 μs     1469.24 μs
validate with dedup (freq map)        0.48 K     2089.97 μs     ±8.77%     2044.79 μs     2709.85 μs

Comparison: 
validate without dedup               15.27 K
validate with dedup                   1.02 K - 15.03x slower +919.10 μs
validate with dedup (freq map)        0.48 K - 31.90x slower +2024.46 μs
:ok

polvalente avatar Jun 17 '25 16:06 polvalente

The benchmark has evaluation warnings. Can you make sure to move all benchmarking code to a module and then call it instead? Thanks!

josevalim avatar Jun 17 '25 17:06 josevalim

Here is the fastest version I could come up with:

defmodule New do
    def validate(keyword, values) when is_list(keyword) and is_list(values) do
      validate(keyword, values, [], [], [])
    end

    defp validate([{key, _} = pair | keyword], values1, values2, acc, bad_keys)
         when is_atom(key) do
      case find_key!(key, values1, values2) do
        {new_values1, new_values2} ->
          find_dup!(key, values2)
          find_dup!(key, new_values1)
          validate(keyword, new_values1, new_values2, [pair | acc], bad_keys)

        :error ->
          case find_key!(key, values2, values1) do
            {values1, values2} ->
              find_dup!(key, values1)
              validate(keyword, values1, values2, [pair | acc], bad_keys)

            :error ->
              validate(keyword, values1, values2, acc, [key | bad_keys])
          end
      end
    end

    defp validate([], values1, values2, acc, []) do
      list = move_pairs!(values1, move_pairs!(values2, acc))
      {:ok, list}
    end

    defp validate([], _values1, _values2, _acc, bad_keys) do
      {:error, bad_keys}
    end

    defp validate([pair | _], _values1, _values2, _acc, []) do
      raise ArgumentError,
            "expected a keyword list as first argument, got invalid entry: #{inspect(pair)}"
    end

    defp find_key!(key, [key | rest], acc), do: {rest, acc}
    defp find_key!(key, [{key, _} | rest], acc), do: {rest, acc}
    defp find_key!(key, [head | tail], acc), do: find_key!(key, tail, [head | acc])
    defp find_key!(_key, [], _acc), do: :error

    defp find_dup!(key, [key | _]), do: raise("omg")
    defp find_dup!(key, [{key, _} | _]), do: raise("omg")
    defp find_dup!(key, [_ | tail]), do: find_dup!(key, tail)
    defp find_dup!(_key, []), do: :ok

    defp move_pairs!([key | rest], acc) when is_atom(key) do
      find_dup!(key, rest)
      move_pairs!(rest, acc)
    end

    defp move_pairs!([{key, _} = pair | rest], acc) when is_atom(key) do
      find_dup!(key, rest)
      move_pairs!(rest, [pair | acc])
    end

    defp move_pairs!([], acc),
      do: acc

    defp move_pairs!([other | _], _) do
      raise ArgumentError,
            "expected the second argument to be a list of atoms or tuples, got: #{inspect(other)}"
    end
  end

It is 50% slower here for 10 entries but the issue is that it scales linearly (due to the nature of the problem).

josevalim avatar Jun 21 '25 11:06 josevalim

@polvalente any parting thoughts? Should we instead improve the docs?

josevalim avatar Jul 10 '25 15:07 josevalim

@josevalim how do you feel about us adding an extra opts argument and adding the validation as an opt-in?

Then, even if it's %-wise slow, for small lists that's still fast enough

polvalente avatar Jul 10 '25 15:07 polvalente

Honestly, unsure. Nobody will use it if it is opt-in but it is not fast enough to be opt-out.

josevalim avatar Jul 10 '25 15:07 josevalim

(Sorry just thinking aloud here)

Since the right argument is typically a literal known at compile-time, this feels like a good candidate for static analysis (a credo check, or maybe compiler warning but this feels tricky to do "without cheating").

This could be even be broader than this particular use case, since while there might be some legit cases for keywords/proplists with duplicated keys, these feel more the exception than the rule. Arguably most calls with literals opts that have duplicated keys are accidental and could be caught with a similar check.

The legit exceptions where duplicated keys are actually intended will make this tricky to implement though, maybe a list of functions to include/exclude in a credo check config?

sabiwara avatar Jul 10 '25 21:07 sabiwara