snmalloc icon indicating copy to clipboard operation
snmalloc copied to clipboard

add LoongArch aal

Open SchrodingerZhu opened this issue 4 years ago • 11 comments

Signed-off-by: SchrodingerZhu [email protected] Just to show the portability of snmalloc to loongarch. https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN

$/home/schrodinger/Documents/qemu/build/qemu-loongarch64 -L $LA_PATH/../target/usr/ perf-singlethread-1
Count:  32768, Size:     16, ZeroMem: 0, Write: 0:     25995159 ns
Count:  32768, Size:     16, ZeroMem: 0, Write: 1:     23428680 ns
Count:  32768, Size:     16, ZeroMem: 1, Write: 0:     25952950 ns
Count:  32768, Size:     16, ZeroMem: 1, Write: 1:     25649431 ns
Count:  32768, Size:     32, ZeroMem: 0, Write: 0:     23379167 ns
Count:  32768, Size:     32, ZeroMem: 0, Write: 1:     23449238 ns
Count:  32768, Size:     32, ZeroMem: 1, Write: 0:     26265615 ns
Count:  32768, Size:     32, ZeroMem: 1, Write: 1:     26149518 ns
Count:  32768, Size:     64, ZeroMem: 0, Write: 0:     24059021 ns
Count:  32768, Size:     64, ZeroMem: 0, Write: 1:     23810705 ns
Count:  32768, Size:     64, ZeroMem: 1, Write: 0:     26568643 ns
Count:  32768, Size:     64, ZeroMem: 1, Write: 1:     26486249 ns
Count:  32768, Size:    128, ZeroMem: 0, Write: 0:     25350150 ns
Count:  32768, Size:    128, ZeroMem: 0, Write: 1:     25165214 ns
Count:  32768, Size:    128, ZeroMem: 1, Write: 0:     28383734 ns
Count:  32768, Size:    128, ZeroMem: 1, Write: 1:     28094443 ns
Count:   1024, Size:   4096, ZeroMem: 0, Write: 0:      2374569 ns
Count:   1024, Size:   4096, ZeroMem: 0, Write: 1:      2036175 ns
Count:   1024, Size:   4096, ZeroMem: 1, Write: 0:      2870639 ns
Count:   1024, Size:   4096, ZeroMem: 1, Write: 1:      2554837 ns
Count:   1024, Size:   8192, ZeroMem: 0, Write: 0:      5612747 ns
Count:   1024, Size:   8192, ZeroMem: 0, Write: 1:      4376410 ns
Count:   1024, Size:   8192, ZeroMem: 1, Write: 0:      5351678 ns
Count:   1024, Size:   8192, ZeroMem: 1, Write: 1:      5368780 ns
Count:   1024, Size:  16384, ZeroMem: 0, Write: 0:     11227398 ns
Count:   1024, Size:  16384, ZeroMem: 0, Write: 1:      9032074 ns
Count:   1024, Size:  16384, ZeroMem: 1, Write: 0:     11434806 ns
Count:   1024, Size:  16384, ZeroMem: 1, Write: 1:     11662212 ns
Count:   1024, Size:  32768, ZeroMem: 0, Write: 0:     21082663 ns
Count:   1024, Size:  32768, ZeroMem: 0, Write: 1:     17188197 ns
Count:   1024, Size:  32768, ZeroMem: 1, Write: 0:     24380463 ns
Count:   1024, Size:  32768, ZeroMem: 1, Write: 1:     24069601 ns
Count:   1024, Size:  65536, ZeroMem: 0, Write: 0:     40695674 ns
Count:   1024, Size:  65536, ZeroMem: 0, Write: 1:     32872525 ns
Count:   1024, Size:  65536, ZeroMem: 1, Write: 0:     48841939 ns
Count:   1024, Size:  65536, ZeroMem: 1, Write: 1:     48756758 ns
Count:   1024, Size: 131072, ZeroMem: 0, Write: 0:     84693715 ns
Count:   1024, Size: 131072, ZeroMem: 0, Write: 1:     66228266 ns
Count:   1024, Size: 131072, ZeroMem: 1, Write: 0:    105022838 ns
Count:   1024, Size: 131072, ZeroMem: 1, Write: 1:    104210255 ns

All multi-thread tests currently fail with a segfault, but it seems that the process quits on thread creation. I may investigate it later.

SchrodingerZhu avatar Oct 13 '21 12:10 SchrodingerZhu

🕙[ 20:27:32 ] ❯ /home/schrodinger/Documents/qemu/build/qemu-loongarch64 -L $LA_PATH/../target/usr/ -strace func-first_operation-malloc
236957 brk(NULL) = 0x00000001201a4000
236957 brk(0x00000001201a4ba0) = 0x00000001201a4ba0
236957 set_tid_address(4833558736,4833463728,4833560480,4833560592,4833463736,4833463624) = 236957
236957 set_robust_list(4833558752,24,4833560480,1,4833463736,4833463624) = -1 errno=38 (Function not implemented)
236957 uname(0x4000802fd8) = 0
236957 prlimit64(0,3,0,274886308168,274886309184,274886309176) = 0
236957 readlinkat(AT_FDCWD,"/proc/self/exe",0x0000004000802090,4096) = 70
236957 getrandom(4833540248,8,1,4096,-4096,274886309176) = 8
236957 brk(0x00000001201c8ba0) = 0x00000001201c8ba0
236957 brk(0x00000001201cc000) = 0x00000001201cc000
236957 mprotect(0x0000000120158000,245760,PROT_READ) = 0
236957 futex(0x0000000120197bb0,FUTEX_PRIVATE_FLAG|FUTEX_WAKE,2147483647,NULL,0x000000012019f840,538572864) = 0
236957 futex(0x0000000120197bc0,FUTEX_PRIVATE_FLAG|FUTEX_WAKE,2147483647,NULL,0x0000000120130604,538117636) = 0
236957 newfstatat(1,"",0x0000004000802ee8,0x1000) = 0
.236957 write(1,0x201b6b10,1) = 1
236957 rt_sigaction(33,0x0000004000802cf8,NULL) = 0
236957 rt_sigprocmask(SIG_UNBLOCK,0x0000004000802ee8,NULL) = 0
236957 mmap(NULL,8404992,PROT_NONE,MAP_PRIVATE|MAP_ANONYMOUS|0x20000,-1,0) = 0x0000004000804000
236957 mprotect(0x0000004000808000,8388608,PROT_READ|PROT_WRITE) = 0
236957 rt_sigprocmask(SIG_BLOCK,0x0000000120137f30,0x0000004000802ee0) = 0
236957 clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID,child_stack=0x0000004001006cd0,parent_tidptr=0x0000004001007540,tls=0x0000004001007c10,child_tidptr=0x0000004001007540) = 236959
236957 rt_sigprocmask(SIG_SETMASK,0x0000004000802ee0,NULL) = 0
236957 rt_sigprocmask(SIG_SETMASK,0x0000004001007b60,NULL) = 0
--- SIGSEGV {si_signo=SIGSEGV, si_code=1, si_addr=NULL} ---
fish: Job 1, '/home/schrodinger/Documents/qem…' terminated by signal SIGSEGV (Address boundary error)

SchrodingerZhu avatar Oct 13 '21 13:10 SchrodingerZhu

Is it possible to get a stack trace? My guess is that something about how thread local state is being initialised is interacting badly with our thread local state.

mjp41 avatar Oct 13 '21 13:10 mjp41

Could we expand this to a general MIPS AAL? (We previously had an attempt at such a thing in https://github.com/microsoft/snmalloc/pull/203/files#diff-7503afa2c6fdec12f6118f90f82c0134f49d1e5817cfa301f37972b6c0a5dca5 but I was not planning to renew it as part of the CHERI effort because the CL has all but abandoned MIPS.)

NoCpuCycleCounters is probably Loongson specific. Are dbar 0 (for pause) and preld 0, %0, 0 (for prefetch) also specific to Loongson? A quick glance at https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00087-2B-MIPS64BIS-AFP-6.06.pdf suggests that there are standard opcodes for pause and prefetch, but perhaps these are different?

nwf avatar Oct 13 '21 13:10 nwf

@nwf loongarch seems to be a new arch released this year. (Though it is closely related to previous MIPS ISA variant maintained by Loongson.)

SchrodingerZhu avatar Oct 13 '21 13:10 SchrodingerZhu

Great to see more architectures being added! We now have CI testing via QEMU, but I don't think LoongArch is currently supported by the Ubuntu 20.04 QEMU user-mode package. Do you know when it's likely to be, or if there's a PPA with a newer QEMU that we can use?

davidchisnall avatar Oct 13 '21 14:10 davidchisnall

Some update: Loongson guys ran the tests on a 3A5000 and all tests passed. I guess it was a QEMU bug.

SchrodingerZhu avatar Oct 17 '21 03:10 SchrodingerZhu

I see from https://wiki.qemu.org/ChangeLog/5.1#MIPS that at least one Loongson family CPU is supported as of 5.1. https://packages.ubuntu.com/search?keywords=qemu makes it look like Ubuntu just skipped over 5.1, but 5.2 is available in 21.04.

nwf avatar Oct 18 '21 12:10 nwf

Those are the old MIPS Loongson CPUs, not the Loongarch ones (Loongson 3 5000 and later). It looks as if the LoongArch support started to land in QEMU last month, so probably isn't in a release yet?

davidchisnall avatar Oct 18 '21 13:10 davidchisnall

while we may need to wait a bit longer for a qemu release with loongarch support, I would like to call for a deeper investigation on the asm I have used here.

According to the manual, dbar is the memory fence hint that prevents eager reordering of instructions. I used it because I saw that Loongson submitted their Linux kernel patches using dbar to relax the spin loops.

preld seems to fetch a single cache line of data, while LoongArch also provides preldx, where the user can specify the stride size, gap, and count of data prefetching. I don't see that snmalloc previously suggested any optimal size for its internal prefetching, so I just used the default one sized by cache line.

What do you think of the above settings?

SchrodingerZhu avatar Oct 18 '21 14:10 SchrodingerZhu

> According to the manual, dbar is the memory fence hint that prevents eager reordering of instructions. I used it because I saw that Loongson submitted their Linux kernel patches using dbar to relax the spin loops.

That sounds as if it's what we want. We expect pause to let the CPU avoid pushing a pile of loads of the same memory address into the load pipeline and then having them all retire as fast as possible (heating up the CPU) while spinning.

> preld seems to fetch a single cache line of data, while LoongArch also provides preldx, where the user can specify the stride size, gap, and count of data prefetching. I don't see that snmalloc previously suggested any optimal size for its internal prefetching, so I just used the default one sized by cache line.

That's a great question. Currently, the only use of the prefetch is in the MPSCQ, which uses it to prefetch the next queue element while processing a linked list. As such, one cache line is fine (I believe the entries here will never span a cache line - they're two pointers and at least two-pointer aligned). We should probably document that properly though.

davidchisnall avatar Oct 18 '21 14:10 davidchisnall

Just tested with snmalloc main branch and the latest cross-tools at https://github.com/loongson/build-tools. Still a lot of mess.

  • The ar tool gives FPE.
  • many operations still fail with min_page_size= 0x1000 due to madvise returning EINVAL.

@xen0n is there any recent progress on improving the situation?

#pragma once

// Record the pointer width as a snmalloc macro: SNMALLOC_VA_BITS_64 when the
// compiler reports 8-byte pointers through __SIZEOF_POINTER__, and
// SNMALLOC_VA_BITS_32 in every other case.
#if __SIZEOF_POINTER__ == 8
#  define SNMALLOC_VA_BITS_64
#else
#  define SNMALLOC_VA_BITS_32
#endif

#include <cstddef>
namespace snmalloc
{
  /**
   * LoongArch-specific architecture abstraction layer.
   *
   * Provides the feature bitmap, architecture name, smallest page size and
   * the `pause` / `prefetch` primitives for LoongArch targets.
   */
  class AAL_LoongArch
  {
  public:
    /**
     * Bitmap of AalFeature flags.
     */
    static constexpr uint64_t aal_features =
      IntegerPointers | NoCpuCycleCounters;

    /**
     * Architecture identifier reported by this AAL.
     */
    static constexpr enum AalName aal_name = LoongArch;

    /**
     * Smallest page size assumed for this architecture: 0x1000 bytes (4 KiB).
     */
    static constexpr size_t smallest_page_size = 0x1000;

    /**
     * On pipelined processors, notify the core that we are in a spin loop and
     * that speculative execution past this point may not be a performance
     * gain.
     *
     * Implemented with `dbar 0`; the "memory" clobber additionally stops the
     * compiler from reordering memory accesses across this point.
     */
    static inline void pause()
    {
      __asm__ __volatile__("dbar 0" : : : "memory");
    }

    /**
     * Prefetch the cache line containing `ptr`.
     *
     * PRELD reads a cache line of data from memory into the cache in advance.
     * The access address is the value of general register rj plus a
     * sign-extended 12-bit immediate (0 here).
     *
     * The hint operand tells the processor which kind of access is expected
     * and which cache level the data should be filled into. It has 32
     * possible values (0 to 31); 0 means "load to the level 1 cache".
     *
     * If the cache attribute of the accessed address is "not cached", the
     * instruction generates no memory access and is treated as a NOP. PRELD
     * does not trigger any MMU- or address-related exceptions.
     *
     * @param ptr Address to prefetch; it is only read, never modified.
     */
    static inline void prefetch(void* ptr)
    {
      // `ptr` must be an *input* operand ("r"). Declaring it as an output
      // ("=r") would leave the register uninitialised, so PRELD would
      // prefetch from an arbitrary address, and would tell the compiler that
      // `ptr` is overwritten by the instruction.
      __asm__ __volatile__("preld 0, %0, 0" : : "r"(ptr));
    }
  };

  // Make the LoongArch AAL available under the generic name AAL_Arch.
  using AAL_Arch = AAL_LoongArch;
} // namespace snmalloc

SchrodingerZhu avatar Jun 29 '22 23:06 SchrodingerZhu

close due to #553

SchrodingerZhu avatar Aug 22 '22 15:08 SchrodingerZhu