Faster almxfl?
optweight.alm_c_utils.lmul is allegedly 80% faster than pixell.curvedsky.almxfl
https://github.com/AdriJD/optweight/blob/0b286eba73d712b4b3a54ebcdd0371de495432d0/cython/alm_c_utils.pyx#L157
@amaurea @adrijd keeping this open until we understand this / incorporate Adri's version into pixell
Re lmul, yes:
Perhaps worth opening a pixell issue for this, I agree we should streamline all of these dependencies/duplicated functionality across our core codes.
Originally posted by @zatkins2 in https://github.com/simonsobs/mnms/issues/9#issuecomment-2397513461
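For context, the two calls being compared look roughly like this. This is only a minimal sketch, assuming optweight is importable alongside pixell; it shows the simple 1D-filter case, and the matrix case appears in the full benchmark below.

import numpy as np
from pixell import curvedsky
from optweight import alm_c_utils

lmax = 1000
ainfo = curvedsky.alm_info(lmax=lmax)
alm = curvedsky.rand_alm(np.ones(lmax + 1), lmax=lmax)
lfilter = np.linspace(1.0, 0.5, lmax + 1)

# pixell: multiply alm by an ell-dependent filter
out_pixell = curvedsky.almxfl(alm, lfilter)

# optweight: same operation, but lmul expects an explicit component axis
out_optweight = np.empty_like(alm)
alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out_optweight[None])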
What shape arrays is this for? Do you have a test case?
Sorry, I did not see this; I will make a test case:
from optweight import alm_c_utils
from pixell import curvedsky
import numpy as np
import time
lmax = 10800
ainfo = curvedsky.alm_info(lmax=lmax)
# double prec, matmul
ps = np.eye(6)[..., None]
ps = np.tile(ps, (1, 1, lmax+1))
pixell_times = []
optweight_times = []
lfilter = np.random.randn(*(6, 6, lmax+1)).astype(np.float64)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex128)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    optweight_times.append(time.time() - t0)
print('double prec, matmul')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
# single prec, matmul
pixell_times = []
optweight_times = []
lfilter = np.random.randn(*(6, 6, lmax+1)).astype(np.float32)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex64)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    optweight_times.append(time.time() - t0)
print('single prec, matmul')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
lmax = 21600
ainfo = curvedsky.alm_info(lmax=lmax)
# double prec, scalar
ps = np.ones(lmax+1)
pixell_times = []
optweight_times = []
lfilter = np.random.randn(lmax+1).astype(np.float64)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex128)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    # optweight's lmul requires an explicit component axis, hence the [None]
    alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    optweight_times.append(time.time() - t0)
print('double prec, scalar')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
# single prec, scalar
pixell_times = []
optweight_times = []
lfilter = np.random.randn(lmax+1).astype(np.float32)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex64)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    # optweight's lmul requires an explicit component axis, hence the [None]
    alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    optweight_times.append(time.time() - t0)
print('single prec, scalar')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
This gives:
double prec, matmul
pixell time = 1.4922 +/- 0.0477
optweight time = 1.5899 +/- 0.0140
single prec, matmul
pixell time = 1.4856 +/- 0.0236
optweight time = 1.2219 +/- 0.0291
double prec, scalar
pixell time = 0.8013 +/- 0.1647
optweight time = 0.3169 +/- 0.0128
single prec, scalar
pixell time = 0.4011 +/- 0.0816
optweight time = 0.2053 +/- 0.0065
Obviously, if the speedup comes from optweight being compiled against MKL, reproducing it in pixell would present a packaging challenge. Also, the matmul case is probably the more common one, and there the speedup is less dramatic (about 20% in single precision, and none in double precision, where pixell is actually slightly faster).
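In the meantime, if the idea is to use optweight opportunistically rather than hard-depend on it, something like the following wrapper in downstream code would work. This is a hypothetical sketch (the function name is made up); the calls mirror the benchmark above, falling back to pixell's ainfo.lmul when optweight is not installed.

import numpy as np

try:
    from optweight import alm_c_utils
except ImportError:
    alm_c_utils = None

def lmul_dispatch(alm, lfilter, ainfo, out=None):
    """Apply an ell-dependent filter to alm, preferring optweight if available."""
    if out is None:
        out = np.empty_like(alm)
    if alm_c_utils is None:
        # fall back to pixell
        ainfo.lmul(alm, lfilter, out=out)
    elif alm.ndim == 1:
        # optweight's lmul requires an explicit component axis
        alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    else:
        alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    return out

A longer-term option, as suggested above, would be to incorporate the faster kernel into pixell itself so that no dispatch layer is needed.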