parlaylib
parlaylib copied to clipboard
Long compile time
Is there a way to reduce the compile time when using Parlay? I know it's a template-heavy library, and there's no simple solution, but I'm curious if there's anything I might be missing. In larger projects, the long compile times can make iterations quite painful. For instance, even a simple code snippet of less than 150 lines takes 25 seconds to compile:
#include <atomic>
#include <iostream>
// #include "../code/cpp/infra/infra.hpp"
#include "parlay/primitives.hpp"
// Local shorthand so `vec<T>` expands to `parlay::sequence<T>`; the macro is
// removed again by the `#undef vec` at the end of the file.
#define vec parlay::sequence
/// Computes 0.5 * (p*log2(2p/(p+q)) + q*log2(2q/(p+q))) for a pair of masses.
///
/// @tparam checked  When false (the default) both inputs are first clamped to
///                  be non-negative via std::max(0.0, x); when true they are
///                  used as given.
/// @param p  First mass.
/// @param q  Second mass.
/// @return   Half the sum of the two log terms; a term whose mass compares
///           equal to zero contributes exactly 0 (so p == q == 0 yields 0).
template <bool checked = false>
double computeJS(double p, double q) {
    if constexpr (!checked) {
        p = std::max(0.0, p);
        q = std::max(0.0, q);
    }

    // 2 / (p + q): reciprocal of the midpoint of the two masses.
    const double scale = 2.0 / (p + q);

    // One summand x * log2(x / midpoint); zero mass is short-circuited so the
    // logarithm is never evaluated for it.
    const auto term = [scale](double x) -> double {
        if (x == 0) return 0;
        return x * std::log2(scale * x);
    };

    return (term(p) + term(q)) * 0.5;
}
/// One histogram bin holding four values of type `Float`.
///
/// Members: `p`, `dp`, `q` are stored as given; `js` is derived from `p` and
/// `q` through `computeJS`.
template <typename Float = double>
struct Bin {
    Float p, dp, js, q;

    /// All four fields start at zero.
    Bin() : p(0), dp(0), js(0), q(0) {}

    /// Stores `p_`, `dp_`, `q_` and sets `js` to `computeJS(p_, q_)`.
    /// `js` is initialised from the constructor arguments (not the members)
    /// because it is declared before `q` and is therefore initialised first.
    Bin(Float p_, Float dp_, Float q_)
        : p(p_), dp(dp_), js(computeJS(p_, q_)), q(q_) {}
};
template <typename Float = double>
auto makeHists(auto && zbYear, auto && zbDim,
auto && gbid,
//auto && freq,
auto && subcandYears, // A vector of vectors
int NsubYear,
auto && targetYears,
auto && dimw)
{
auto inv_w = 1.0 / std::accumulate(dimw.begin(), dimw.end(), 0.0);
for (auto && u: dimw) u *= inv_w;
auto nyear = *parlay::max_element(zbYear) + 1;
auto ndim = *parlay::max_element(zbDim) + 1;
auto ngbid = *parlay::max_element(gbid) + 1;
auto dimTotalFreq = parlay::reduce_by_index(parlay::depzip(
parlay::slice(zbDim.begin(), zbDim.end()),
parlay::delayed_tabulate(zbDim.size(), [](auto i)->unsigned { return 1; })),
ndim);
double targetScaleRatio = nyear / double(targetYears.size());
double subScaleRatio = nyear / double(NsubYear);
auto rst = parlay::tabulate(subcandYears.size(), [ngbid](auto i)->auto {
return vec<Bin<Float>> (ngbid); });
auto & rst0 = rst.front();
auto whichIn = [](auto && x, auto && y, unsigned n)->auto {
vec<bool> indi(n, false);
parlay::for_each(y, [&](auto && u)->void { indi[u] = true; });
return parlay::filter(parlay::iota(unsigned(x.size())),
[&](auto i)->bool {return indi[x[i]]; });
};
auto targetYearsInd = whichIn(zbYear, targetYears, nyear);
parlay::for_each(targetYearsInd, [&](auto i)->void {
auto & b = rst0[gbid[i]];
auto tfreq = dimTotalFreq[zbDim[i]];
auto q = 1 * targetScaleRatio / tfreq * dimw[zbDim[i]];
auto ptr = (std::atomic<Float>*)(&b.q);
ptr->fetch_add(q);
});
auto subYearsInd = whichIn (zbYear, parlay::slice(
subcandYears.front().begin(), subcandYears.front().end()), nyear);
parlay::for_each(subYearsInd, [&](auto i)->void {
auto & b = rst0[gbid[i]];
auto ptr = (std::atomic<Float>*)(&b.p);
auto tfreq = dimTotalFreq[zbDim[i]];
auto p = 1 * subScaleRatio / tfreq * dimw[zbDim[i]];
ptr->fetch_add(p);
});
parlay::parFor(0, zbYear.size(), [&](size_t i)->void {
auto & b = rst0[gbid[i]];
auto tfreq = dimTotalFreq[zbDim[i]];
b.dp = 1 * subScaleRatio / tfreq * dimw[zbDim[i]];
});
parlay::parFor(1, rst.size(), [&](int t)->void {
auto & v = rst[t];
auto subcat = parlay::slice(
subcandYears[t].begin(), subcandYears[t].begin() + NsubYear);
vec<bool> indi(nyear, false);
for (auto && u: subcat) indi[u] = true;
for (size_t i = 0, iend = zbYear.size(); i < iend; ++i) {
if (!indi[zbYear[i]]) continue;
auto & b = v[gbid[i]];
auto tfreq = dimTotalFreq[zbDim[i]];
b.p += 1 * subScaleRatio / tfreq * dimw[zbDim[i]];
}
for (size_t i = 0, iend = rst0.size(); i < iend; ++i) {
v[i].q = rst0[i].q;
v[i].dp = rst0[i].dp;
}
});
parlay::for_each(rst, [&](auto & x)->void {
for (auto & u: x) u.js = computeJS (u.p, u.q);
});
auto yearImpact = parlay::group_by_index(
parlay::depzip(parlay::slice(zbYear.begin(), zbYear.end()),
parlay::slice( gbid.begin(), gbid.end())), nyear);
return std::pair(std::move(yearImpact), std::move(rst));
}
// Driver for the compile-time reproduction: feeds `makeHists` with delayed
// (lazily evaluated) sequences of pseudo-random indices and prints one element
// of each half of the result.
int main() {
    unsigned n = 1e6;
    // The index parameter of each generator is intentionally unnamed: the
    // produced value does not depend on the position.
    auto rst = makeHists(
        parlay::delayed_tabulate(n, [n](auto)->auto { return unsigned(rand() % n); }),
        parlay::delayed_tabulate(n, [n](auto)->auto { return unsigned(rand() % n); }),
        parlay::delayed_tabulate(n, [n](auto)->auto { return unsigned(rand() % n); }),
        // Candidate year sets: each element is itself a sequence of 1000 values in [0, 100).
        parlay::delayed_tabulate(n, [](auto)->auto {
            return parlay::detab(1000, [](auto)->auto {
                return unsigned(rand() % 100); });
        }), 10000,
        parlay::delayed_tabulate(n, [n](auto)->auto { return unsigned(rand() % n); }),
        parlay::delayed_tabulate(n, [](auto)->auto { return 0.1; }));
    std::cout << rst.first[0][0] << ", " << rst.second[0].size() << "\n";
}
#undef vec
The compile command is `g++ -std=c++20 tempFiles/longcomptime.cpp -ftree-vectorize -march=native -O2 -pthread -o tempFiles/longcomptime`.
Any suggestion would be greatly appreciated!