Faster almxfl?
optweight.alm_c_utils.lmul is allegedly 80% faster than pixell.curvedsky.almxfl
https://github.com/AdriJD/optweight/blob/0b286eba73d712b4b3a54ebcdd0371de495432d0/cython/alm_c_utils.pyx#L157
@amaurea @adrijd keeping this open until we understand this / incorporate Adri's version into pixell
Re lmul, yes:
Perhaps worth opening a pixell issue for this, I agree we should streamline all of these dependencies/duplicated functionality across our core codes.
Originally posted by @zatkins2 in https://github.com/simonsobs/mnms/issues/9#issuecomment-2397513461
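For context, the two calls being compared look roughly like this. This is only a minimal sketch, assuming optweight is importable alongside pixell; it shows the simple 1D-filter case, and the matrix case appears in the full benchmark below.

import numpy as np
from pixell import curvedsky
from optweight import alm_c_utils

lmax = 1000
ainfo = curvedsky.alm_info(lmax=lmax)
alm = curvedsky.rand_alm(np.ones(lmax + 1), lmax=lmax)
lfilter = np.linspace(1.0, 0.5, lmax + 1)

# pixell: multiply alm by an ell-dependent filter
out_pixell = curvedsky.almxfl(alm, lfilter)

# optweight: same operation, but lmul expects an explicit component axis
out_optweight = np.empty_like(alm)
alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out_optweight[None])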
What shape arrays is this for? Do you have a test case?
Sorry, I did not see this; I will make a test case:
from optweight import alm_c_utils
from pixell import curvedsky
import numpy as np
import time
lmax = 10800
ainfo = curvedsky.alm_info(lmax=lmax)
# double prec, matmul
ps = np.eye(6)[..., None]
ps = np.tile(ps, (1, 1, lmax+1))
pixell_times = []
optweight_times = []
lfilter = np.random.randn(*(6, 6, lmax+1)).astype(np.float64)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex128)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    optweight_times.append(time.time() - t0)
print('double prec, matmul')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
# single prec, matmul
pixell_times = []
optweight_times = []
lfilter = np.random.randn(*(6, 6, lmax+1)).astype(np.float32)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex64)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    optweight_times.append(time.time() - t0)
print('single prec, matmul')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
lmax = 21600
ainfo = curvedsky.alm_info(lmax=lmax)
# double prec, scalar
ps = np.ones(lmax+1)
pixell_times = []
optweight_times = []
lfilter = np.random.randn(lmax+1).astype(np.float64)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex128)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    # optweight's lmul requires an explicit component axis, hence the [None]
    alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    optweight_times.append(time.time() - t0)
print('double prec, scalar')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
# single prec, scalar
pixell_times = []
optweight_times = []
lfilter = np.random.randn(lmax+1).astype(np.float32)
alm = curvedsky.rand_alm(ps, lmax=lmax).astype(np.complex64)
out = np.empty_like(alm)
for i in range(10):
    t0 = time.time()
    ainfo.lmul(alm, lfilter, out=out)
    pixell_times.append(time.time() - t0)
    t0 = time.time()
    # optweight's lmul requires an explicit component axis, hence the [None]
    alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    optweight_times.append(time.time() - t0)
print('single prec, scalar')
print(f'pixell time = {np.mean(pixell_times):0.4f} +/- {np.std(pixell_times):0.4f}')
print(f'optweight time = {np.mean(optweight_times):0.4f} +/- {np.std(optweight_times):0.4f}')
This gives:
double prec, matmul
pixell time = 1.4922 +/- 0.0477
optweight time = 1.5899 +/- 0.0140
single prec, matmul
pixell time = 1.4856 +/- 0.0236
optweight time = 1.2219 +/- 0.0291
double prec, scalar
pixell time = 0.8013 +/- 0.1647
optweight time = 0.3169 +/- 0.0128
single prec, scalar
pixell time = 0.4011 +/- 0.0816
optweight time = 0.2053 +/- 0.0065
Obviously, if the speedup comes from optweight being compiled against MKL, reproducing it in pixell would present a packaging challenge. Also, the matmul case is probably the more common one, and there the speedup is less dramatic (about 20% in single precision, and none in double precision, where pixell is actually slightly faster).
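In the meantime, if the idea is to use optweight opportunistically rather than hard-depend on it, something like the following wrapper in downstream code would work. This is a hypothetical sketch (the function name is made up); the calls mirror the benchmark above, falling back to pixell's ainfo.lmul when optweight is not installed.

import numpy as np

try:
    from optweight import alm_c_utils
except ImportError:
    alm_c_utils = None

def lmul_dispatch(alm, lfilter, ainfo, out=None):
    """Apply an ell-dependent filter to alm, preferring optweight if available."""
    if out is None:
        out = np.empty_like(alm)
    if alm_c_utils is None:
        # fall back to pixell
        ainfo.lmul(alm, lfilter, out=out)
    elif alm.ndim == 1:
        # optweight's lmul requires an explicit component axis
        alm_c_utils.lmul(alm[None], lfilter[None], ainfo, alm_out=out[None])
    else:
        alm_c_utils.lmul(alm, lfilter, ainfo, alm_out=out)
    return out

A longer-term option, as suggested above, would be to incorporate the faster kernel into pixell itself so that no dispatch layer is needed.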