xtensor icon indicating copy to clipboard operation
xtensor copied to clipboard

xt::view overhead in loop

Open artofnothingness opened this issue 2 years ago • 2 comments

This code runs 5-10x times faster with raw unchecked indexing on all dimensions

  using T = float;
  using Tensor = xt::xarray<T>;
  
  auto start_line_points = xt::view(batch_of_lines, xt::all(), xt::range(_, -1));
  auto end_line_points = xt::view(batch_of_lines, xt::all(), xt::range(1, _));

  Tensor diff = end_line_points - start_line_points;
  Tensor sq_norm = xt::norm_sq(
      diff, {diff.dimension() - 1}, xt::evaluation_strategy::immediate);
   static constexpr double eps = 1e-3;

  for (size_t b = 0; b < closest_points.shape()[0]; ++b) {
    for (size_t t = 0; t < closest_points.shape()[1]; ++t) {
      if (abs(sq_norm.unchecked(b, t)) < eps) {
        xt::view(closest_points, b, t) = xt::view(start_line_points, b, t);
        continue;
      }

      for (size_t p = 0; p < closest_points.shape()[2]; ++p) {
        auto curr_closest_pt = xt::view(closest_points, b, t, p);
        auto curr_pt = xt::view(path_points, p);
        auto curr_start_pt = xt::view(start_line_points, b, t);
        auto curr_end_pt = xt::view(end_line_points, b, t);
        auto curr_line_diff = xt::view(diff, b, t);

        T u = ((curr_pt.unchecked(0) - curr_start_pt.unchecked(0)) * curr_line_diff.unchecked(0) +
               (curr_pt.unchecked(1) - curr_start_pt.unchecked(1)) * curr_line_diff.unchecked(1)) /
              sq_norm.unchecked(b, t);

        if (u <= 0)
          curr_closest_pt = curr_start_pt;
        else if (u >= 1)
          curr_closest_pt = curr_end_pt;
        else
          curr_closest_pt = curr_start_pt + u * curr_line_diff;
      }
    }
  }

With unchecked undexing on all dimensions:

  

  for (size_t b = 0; b < closest_points.shape()[0]; ++b) {
    for (size_t t = 0; t < closest_points.shape()[1]; ++t) {
      if (abs(sq_norm.unchecked(b, t)) < eps) {
        xt::view(closest_points, b, t) = xt::view(start_line_points, b, t);
        continue;
      }

      for (size_t p = 0; p < closest_points.shape()[2]; ++p) {
        T u = ((path_points.unchecked(p, 0) - start_line_points.unchecked(b, t, 0)) * diff.unchecked(b, t, 0) +
               (path_points.unchecked(p, 1) - start_line_points.unchecked(b, t, 1)) * diff.unchecked(b, t, 1)) /
              sq_norm.unchecked(b, t);

        if (u <= 0) {
          closest_points.unchecked(b, t, p, 0) = start_line_points.unchecked(b, t, 0);
          closest_points.unchecked(b, t, p, 1) = start_line_points.unchecked(b, t, 1);
        } else if (u >= 1) {
          closest_points.unchecked(b, t, p, 0) = end_line_points.unchecked(b, t, 0);
          closest_points.unchecked(b, t, p, 1) = end_line_points.unchecked(b, t, 1);
        }
        else {
          closest_points.unchecked(b, t, p, 0) = start_line_points.unchecked(b, t, 0) + u * diff.unchecked(b, t);
          closest_points.unchecked(b, t, p, 1) = start_line_points.unchecked(b, t, 1) + u * diff.unchecked(b, t);
        }
      }
    }
  }

Does xt::view brings much overhead ?

artofnothingness avatar Aug 28 '21 11:08 artofnothingness

Your example is not very minimal so a bit hard to judge and discuss.

That being said:

  • In general xt::view is currently unfortunately not very efficient.
  • Even if xt::view would be overhauled in some (near) future, using indices will always be faster: in xt::view you always need to create some overhead.
  • If you use a fixed-rank array I'm not sure that unchecked is still faster than operator() if you compile without assertions. But on this point I'm not entirely sure. Didn't we discuss this once @JohanMabille ?

tdegeus avatar Aug 30 '21 07:08 tdegeus

@tdegeus Sorry for verbosity. Overall here is looping by 3 dimensions, and in first chunk of code i used views to get xarray of points (x, y), and manipulate them, instead of indexing, and its came out much slower.

artofnothingness avatar Aug 30 '21 12:08 artofnothingness