datashader Optimize colorize using matmul and inplace operations

As the title says, this attempts to optimize the colorize part of the shade operation by avoiding temporary copies and performing a single matmul operation rather than multiple dot operations. In my testing this is about a 10% speedup. My guess is that this could result in even better performance for systems with MKL support.

Aug 26 '25 13:08 philippjfr

Codecov Report

:white_check_mark: All modified and coverable lines are covered by tests. :white_check_mark: Project coverage is 88.34%. Comparing base (f44670c) to head (36a703c). :warning: Report is 2 commits behind head on main.

Additional details and impacted files

@@           Coverage Diff           @@
##             main    #1437   +/-   ##
=======================================
  Coverage   88.33%   88.34%           
=======================================
  Files          96       96           
  Lines       18901    18908    +7     
=======================================
+ Hits        16696    16704    +8     
+ Misses       2205     2204    -1

:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.

:rocket: New features to boost your workflow:

:snowflake: Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Aug 26 '25 13:08 codecov[bot]

CodSpeed Instrumentation Performance Report

Merging #1437 will degrade performance by 17.88%

Comparing `optimize_colorize` (36a703c) with `main` (a4d57be)

Summary

❌ 1 regression
✅ 42 untouched

:warning: Please fix the performance issues or acknowledge them on CodSpeed.

Benchmarks breakdown

	Benchmark	`BASE`	`HEAD`	Change
❌	`test_layout[forceatlas2_layout]`	68.7 ms	83.6 ms	-17.88%

Sep 08 '25 10:09 codspeed-hq[bot]

My profiling code

import time

import numpy as np
import pandas as pd
import datashader as ds

# Benchmark scale constants: 10 million rows and 20 categories.
N = 10_000_000
C = 20

def gen_data(N=int(10e6), C=20):
    """Build a random point cloud with a categorical label column.

    Parameters
    ----------
    N : int or float
        Number of rows. Converted with ``int()`` once, so values written in
        float notation (e.g. ``1e6``) are accepted for every column.
    C : int
        Number of distinct labels; labels are the ``C`` consecutive
        characters starting at ``'A'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` (standard-normal samples) and ``c``
        (randomly chosen labels, stored as a pandas categorical).
    """
    # Convert once and reuse: the original cast N for randn() but passed the
    # raw value to choice(), which rejects a float size.
    n_rows = int(N)
    labels = [chr(65 + i) for i in range(C)]

    # Keep the randn -> choice draw order so seeded runs are unchanged.
    xy = np.random.randn(n_rows, 2)
    c = np.random.choice(labels, size=n_rows)

    df = pd.DataFrame(xy, columns=['x', 'y'])
    df['c'] = pd.Series(c).astype('category')
    return df

def profile(df, size=1000):
    """Return the elapsed time of one ``shade`` call on a square canvas.

    ``df`` is aggregated onto a ``size`` x ``size`` canvas with a per-category
    count over column ``c``; only the ``shade`` call itself is timed, using
    ``time.monotonic``.
    """
    canvas = ds.Canvas(plot_width=size, plot_height=size)
    aggregate = canvas.points(df, x='x', y='y', agg=ds.count_cat('c'))

    started = time.monotonic()
    ds.transfer_functions.shade(aggregate)
    finished = time.monotonic()
    return finished - started

# Warm-up: one throwaway run on a tiny canvas before anything is recorded.
df = gen_data(C=1)
profile(df, size=10)

# Sweep category count (outer loop) against canvas size (inner loop);
# the data frame is regenerated once per category count.
category_counts = (1, 5, 10, 20)
canvas_sizes = range(1000, 6000, 1000)

results = []
for c in category_counts:
    df = gen_data(C=c)
    for s in canvas_sizes:
        timing = profile(df, size=s)
        results.append((c, s, timing))

Sep 10 '25 15:09 philippjfr

Current status:

current = 6b0982b, before = fb2c16e, main = 184ef3c

Benchmark code


import time
import numpy as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf

# Benchmark scale constants: 10 million rows and 20 categories.
N = 10_000_000
C = 20


class Profile:
    """Context manager that records a ``cProfile`` run and dumps it to a file.

    Entering the context enables the profiler and yields it; leaving the
    context disables it and writes the collected stats to ``output_file``.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __init__(self, output_file):
        # Imported here so cProfile is only loaded when an instance is created.
        from cProfile import Profile as _CProfile

        self.profiler = _CProfile()
        self.output_file = output_file

    def __enter__(self):
        """Start profiling and hand the underlying profiler to the caller."""
        active = self.profiler
        active.enable()
        return active

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop profiling and write the stats to ``self.output_file``."""
        active = self.profiler
        active.disable()
        active.dump_stats(self.output_file)


class LineProfileContext:
    """Context manager around ``line_profiler.LineProfiler``.

    Functions are registered with ``add_function``. Entering the context
    calls ``enable_by_count``; leaving it calls ``disable_by_count``, dumps
    the stats to ``output_file`` and prints them.
    """

    def __init__(self, output_file):
        # Imported here so line_profiler is only required when this class is used.
        from line_profiler import LineProfiler as _LineProfiler

        self.profiler = _LineProfiler()
        self.output_file = output_file
        self.functions_to_profile = []

    def add_function(self, func):
        """Register ``func`` for line-by-line profiling and return it."""
        registered = self.functions_to_profile
        self.profiler.add_function(func)
        registered.append(func)
        return func

    def __enter__(self):
        """Enable the profiler and return this context object."""
        self.profiler.enable_by_count()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Disable the profiler, then dump and print the collected stats."""
        active = self.profiler
        active.disable_by_count()

        active.dump_stats(self.output_file)
        active.print_stats()


def gen_data(N=N, C=C):
    """Build a reproducible random point cloud with a categorical label column.

    The NumPy global RNG is re-seeded with ``1`` on every call, so repeated
    calls with the same arguments produce the same frame.

    Parameters
    ----------
    N : int or float
        Number of rows. Converted with ``int()`` once, so values written in
        float notation (e.g. ``1e6``) are accepted for every column.
    C : int
        Number of distinct labels; labels are the ``C`` consecutive
        characters starting at ``'A'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` and ``y`` (standard-normal samples) and ``c``
        (randomly chosen labels, stored as a pandas categorical).
    """
    np.random.seed(1)

    # Convert once and reuse: the original cast N for randn() but passed the
    # raw value to choice(), which rejects a float size.
    n_rows = int(N)
    labels = [chr(65 + i) for i in range(C)]

    # Keep the randn -> choice draw order so the seeded output is unchanged.
    xy = np.random.randn(n_rows, 2)
    c = np.random.choice(labels, size=n_rows)

    df = pd.DataFrame(xy, columns=["x", "y"])
    df["c"] = pd.Series(c).astype("category")
    return df


def profile(df, size=1000):
    """Return the elapsed time of one ``tf.shade`` call on a square canvas.

    ``df`` is aggregated onto a ``size`` x ``size`` canvas with a per-category
    count over column ``c``. ``tf.shade`` is called once untimed as a warm-up
    and then once more between two ``time.monotonic`` readings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x``, ``y`` and categorical ``c`` columns.
    size : int
        Canvas width and height.

    Returns
    -------
    float
        Difference of the two ``time.monotonic`` readings around ``tf.shade``.
    """
    W = H = size
    cvs = ds.Canvas(plot_width=W, plot_height=H)
    agg = cvs.points(df, x="x", y="y", agg=ds.count_cat("c"))

    tf.shade(agg)  # warm up
    pre = time.monotonic()

    # The call being measured. As posted, every shade call in the timed
    # region was commented out, so the function returned roughly zero.
    tf.shade(agg)

    # Optional profiler hooks: to collect a profile, swap the plain call
    # above for one of the blocks below.
    # with LineProfileContext("line_profile.lprof") as line_profiler:
    #     line_profiler.add_function(tf._colorize)
    #     tf.shade(agg)

    # with Profile(output_file="optional.perf"):
    #     ds.transfer_functions.shade(agg)
    return time.monotonic() - pre


# Warm-up: one throwaway run at the largest configuration of the sweep.
df = gen_data(C=20)
profile(df, size=5000)

# Sweep category count (outer loop) against canvas size (inner loop);
# the data frame is regenerated once per category count.
category_counts = (1, 5, 10, 20)
canvas_sizes = range(1000, 6000, 1000)

results = []
for c in category_counts:
    df = gen_data(C=c)
    for s in canvas_sizes:
        timing = profile(df, size=s)
        results.append((c, s, timing))
        print(f"{c=}, {s=}, {timing=}")

Plotting


# Timings for the "current" commit. Each record has:
#   c      - category count passed to gen_data(C=c)
#   s      - canvas size passed to profile(size=s)
#   timing - value returned by profile(), a time.monotonic() difference
current = [  # 6b0982b
    dict(c=1, s=1000, timing=0.07571580299918423),
    dict(c=1, s=2000, timing=0.295644159999938),
    dict(c=1, s=3000, timing=0.6464670440000191),
    dict(c=1, s=4000, timing=1.1230143669999961),
    dict(c=1, s=5000, timing=1.7188509200004773),
    dict(c=5, s=1000, timing=0.11476161499922455),
    dict(c=5, s=2000, timing=0.44561883799906354),
    dict(c=5, s=3000, timing=0.9790756620004686),
    dict(c=5, s=4000, timing=1.7118233849996614),
    dict(c=5, s=5000, timing=2.6856131889999233),
    dict(c=10, s=1000, timing=0.14612587800002075),
    dict(c=10, s=2000, timing=0.5675542549997772),
    dict(c=10, s=3000, timing=1.2379318599996623),
    dict(c=10, s=4000, timing=2.2251677369986282),
    dict(c=10, s=5000, timing=3.397321854999973),
    dict(c=20, s=1000, timing=0.1993868179997662),
    dict(c=20, s=2000, timing=0.8214870430001611),
    dict(c=20, s=3000, timing=1.7614306820014463),
    dict(c=20, s=4000, timing=3.0943053329992836),
    dict(c=20, s=5000, timing=4.7508491489988955),
]

# Timings for the "before" commit. Each record has:
#   c      - category count passed to gen_data(C=c)
#   s      - canvas size passed to profile(size=s)
#   timing - value returned by profile(), a time.monotonic() difference
before = [  # fb2c16e
    dict(c=1, s=1000, timing=0.07645769699956873),
    dict(c=1, s=2000, timing=0.3170905290007795),
    dict(c=1, s=3000, timing=0.7142776969994884),
    dict(c=1, s=4000, timing=1.2551025209995714),
    dict(c=1, s=5000, timing=1.9599227520011482),
    dict(c=5, s=1000, timing=0.16230177999932494),
    dict(c=5, s=2000, timing=0.5520949959991412),
    dict(c=5, s=3000, timing=1.2177506650004943),
    dict(c=5, s=4000, timing=2.171157504999428),
    dict(c=5, s=5000, timing=3.3560801679996075),
    dict(c=10, s=1000, timing=0.2009295749994635),
    dict(c=10, s=2000, timing=0.7160231019988714),
    dict(c=10, s=3000, timing=1.6094946339999296),
    dict(c=10, s=4000, timing=2.7828460880009516),
    dict(c=10, s=5000, timing=4.274540911001168),
    dict(c=20, s=1000, timing=0.2542700350004452),
    dict(c=20, s=2000, timing=0.9284682460001932),
    dict(c=20, s=3000, timing=2.0608999519990903),
    dict(c=20, s=4000, timing=3.6744658019997587),
    dict(c=20, s=5000, timing=5.747611536000477),
]
# Timings for the "main" commit. Each record has:
#   c      - category count passed to gen_data(C=c)
#   s      - canvas size passed to profile(size=s)
#   timing - value returned by profile(), a time.monotonic() difference
main = [  # 184ef3c
    dict(c=1, s=1000, timing=0.0718935530003364),
    dict(c=1, s=2000, timing=0.31208833799973945),
    dict(c=1, s=3000, timing=0.7055044320004527),
    dict(c=1, s=4000, timing=1.2214937410008133),
    dict(c=1, s=5000, timing=1.899291293000715),
    dict(c=5, s=1000, timing=0.1668667740013916),
    dict(c=5, s=2000, timing=0.6655240790005337),
    dict(c=5, s=3000, timing=1.5014597809986299),
    dict(c=5, s=4000, timing=2.5980365989998973),
    dict(c=5, s=5000, timing=4.086923677999948),
    dict(c=10, s=1000, timing=0.2160664650000399),
    dict(c=10, s=2000, timing=0.8515692499986471),
    dict(c=10, s=3000, timing=1.879354708999017),
    dict(c=10, s=4000, timing=3.3537094929997693),
    dict(c=10, s=5000, timing=5.179373872999349),
    dict(c=20, s=1000, timing=0.3006982959996094),
    dict(c=20, s=2000, timing=1.1935125410000182),
    dict(c=20, s=3000, timing=2.72798394000165),
    dict(c=20, s=4000, timing=4.852396742000565),
    dict(c=20, s=5000, timing=7.331704080999771),
]

import hvplot.pandas
import pandas as pd

def fn(x):
    """Return a grouped bar chart for the module-level timing list named ``x``.

    Parameters
    ----------
    x : str
        Name of a module-level list of ``dict(c=..., s=..., timing=...)``
        records; it is also used as the plot title.

    Returns
    -------
    The hvplot bar chart of ``timing`` against ``s``, grouped by ``c``, with
    the grid enabled.

    Raises
    ------
    KeyError
        If no module-level variable called ``x`` exists.
    """
    # Resolve the variable by name rather than eval()-ing the string: only a
    # plain name lookup is needed, and eval would execute arbitrary text.
    records = globals()[x]
    return (
        pd.DataFrame(records)
        .hvplot.bar(x="s", y="timing", by="c", title=x)
        .opts(show_grid=True)
    )


# Stack the three charts vertically, one per measured commit.
(fn("current") + fn("before") + fn("main")).cols(1)

Sep 12 '25 16:09 hoxbro