tantivy
tantivy copied to clipboard
Investigate the number of sync effectively happening.
Upon commit, how many fsync are happening? We sync explicitly in order to detect errors... But I think rust also sync_all upon drop.
See #1221 in which the performance are especially bad on MacOS
Here is a slightly curated output of strace -fT for a process that simply add a doc and commits.
mmap(NULL, 1523712, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
getcwd( <unfinished ...>
<... mmap resumed>) = 0x7f9de3a8a000 <0.000020>
<... getcwd resumed>"/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000022>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmpXhgFST", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 4 <0.000088>
close(4) = 0 <0.000021>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmpXhgFST", "./idx/.managed.json") = 0 <0.000147>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 4 <0.000019>
fsync(4 <unfinished ...> // tmp file for managed json
<... fsync resumed>) = 0 <0.000814>
close(4) = 0 <0.000017>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.store", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 4 <0.000034>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 5 <0.000018>
fsync(5) = 0 <0.000720>
close(5) = 0 <0.000106>
getcwd("/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000018>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmprc4vM9", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 5 <0.000048>
close(5) = 0 <0.000018>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmprc4vM9", "./idx/.managed.json") = 0 <0.000118>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 5 <0.000033>
fsync(5) = 0 <0.000764>
close(5) = 0 <0.000020>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.fast", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 5 <0.000048>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 6 <0.000020>
fsync(6) = 0 <0.000718>
close(6) = 0 <0.000017>
getcwd("/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000018>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmpxeRI1X", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 6 <0.000034>
close(6) = 0 <0.000017>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmpxeRI1X", "./idx/.managed.json") = 0 <0.000094>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 6 <0.000035>
fsync(6) = 0 <0.000845>
close(6) = 0 <0.000022>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.fieldnorm", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 6 <0.000050>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 7 <0.000018>
fsync(7) = 0 <0.000713>
close(7) = 0 <0.000017>
getcwd("/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000018>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmpIc8Zxr", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 7 <0.000036>
close(7) = 0 <0.000017>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmpIc8Zxr", "./idx/.managed.json") = 0 <0.000117>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 7 <0.000019>
fsync(7) = 0 <0.000766>
close(7) = 0 <0.000018>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.term", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 7 <0.000055>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 8 <0.000021>
fsync(8) = 0 <0.000878>
close(8) = 0 <0.000017>
getcwd("/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000019>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmpNvXfj3", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 8 <0.000034>
close(8) = 0 <0.000018>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmpNvXfj3", "./idx/.managed.json") = 0 <0.000114>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 8 <0.000019>
fsync(8) = 0 <0.000721>
close(8) = 0 <0.000018>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.idx", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 8 <0.000033>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 9 <0.000022>
fsync(9) = 0 <0.000642>
close(9) = 0 <0.000017>
getcwd("/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000016>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmp6V6wGu", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600) = 9 <0.000032>
close(9) = 0 <0.000017>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmp6V6wGu", "./idx/.managed.json") = 0 <0.000075>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 9 <0.000041>
fsync(9) = 0 <0.000958>
close(9) = 0 <0.000078>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.pos", O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666) = 9 <0.000071>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC) = 10 <0.000020>
fsync(10) = 0 <0.000735>
close(10) = 0 <0.000019>
mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9de3989000 <0.000030>
mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9de3888000 <0.000018>
mmap(NULL, 266240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9de801d000 <0.000021>
mmap(NULL, 1052672, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9de3787000 <0.000017>
fsync(6) = 0 <0.000879>
fsync(6) = 0 <0.000772>
fsync(6) = 0 <0.000767>
close(6) = 0 <0.000024>
openat(AT_FDCWD, "./idx/9bb03e1f890b4c4b93c31707f84e236f.fieldnorm", O_RDONLY|O_CLOEXEC) = 6 <0.000044>
statx(6, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_ALL|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0664, stx_size=106, ...}) = 0 <0.000024>
newfstatat(6, "", {st_mode=S_IFREG|0664, st_size=106, ...}, AT_EMPTY_PATH) = 0 <0.000018>
mmap(NULL, 106, PROT_READ, MAP_SHARED, 6, 0) = 0x7f9de801c000 <0.000029>
close(6) = 0 <0.000018>
mmap(NULL, 962560, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9de369c000 <0.000019>
fsync(9) = 0 <0.000858>
fsync(7) = 0 <0.000923>
fsync(5) = 0 <0.000858>
fsync(5) = 0 <0.000634>
close(5) = 0 <0.000023>
fsync(7) = 0 <0.000619>
fsync(7) = 0 <0.000722>
close(7) = 0 <0.000019>
fsync(8) = 0 <0.000763>
fsync(8) = 0 <0.000627>
close(8) = 0 <0.000020>
fsync(9) = 0 <0.000612>
fsync(9) = 0 <0.000617>
close(9) = 0 <0.000020>
fsync(4) = 0 <0.000752>
fsync(4) = 0 <0.000620>
close(4) = 0 <0.000023>
clone(child_stack=0x7f9de3dfddf0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTIDstrace: Process 17765 attached
, parent_tid=[17765], tls=0x7f9de3dfe640, child_tidptr=0x7f9de3dfe910) = 17765 <0.000186>
mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...>
<... mmap resumed>) = 0x7f9de8b59000 <0.000028>
getcwd( <unfinished ...>
<... getcwd resumed>"/home/fulmicoton/git/number-of-flush", 512) = 37 <0.000075>
openat(AT_FDCWD, "/home/fulmicoton/git/number-of-flush/./idx/.tmpyYOgWo", O_RDWR|O_CREAT|O_EXCL|O_CLOEXEC, 0600 <unfinished ...>
<... openat resumed>) = 4 <0.000085>
close(4 <unfinished ...>
<... close resumed>) = 0 <0.000014>
rename("/home/fulmicoton/git/number-of-flush/./idx/.tmpyYOgWo", "./idx/meta.json" <unfinished ...>
<... rename resumed>) = 0 <0.000180>
openat(AT_FDCWD, "./idx", O_RDONLY|O_CLOEXEC <unfinished ...>
<... openat resumed>) = 4 <0.000042>
fsync(4 <unfinished ...>
<... fsync resumed>) = 0 <0.000756>
close(4) = 0 <0.000095>
openat(AT_FDCWD, "./idx/.tantivy-meta.lock", O_WRONLY|O_CREAT|O_CLOEXEC, 0666) = 4 <0.000115>
flock(4, LOCK_EX) = 0 <0.000027>
close(4) = 0 <0.000023>
) = 1 <0.000023>
close(3) = 0 <0.000031>
This confirms fsync is the most costly operation, and all fsync are "expensive".
Then we have 28 fsync calls... Which is waaaaay too much.
Here is some detail:
fsync(4 <unfinished ...> // idx directory
<... fsync resumed>) = 0 <0.000814>
fsync(5) = 0 <0.000720> // idx directory.
fsync(5) = 0 <0.000764> // idx directory
fsync(6) = 0 <0.000718> // idx directory
fsync(6) = 0 <0.000845> // idx directory
fsync(7) = 0 <0.000713> // idx directory
fsync(7) = 0 <0.000766> // idx directory
fsync(8) = 0 <0.000878> // term file
fsync(8) = 0 <0.000721> // idx directory
fsync(9) = 0 <0.000642> // idx directory
fsync(9) = 0 <0.000958> // idx directory
fsync(10) = 0 <0.000735> // idx directory
fsync(6) = 0 <0.000879> // fieldnorm
fsync(6) = 0 <0.000772> // fieldnorm
fsync(6) = 0 <0.000767> // fieldnorm
fsync(9) = 0 <0.000858> // pos
fsync(7) = 0 <0.000923> // term
fsync(5) = 0 <0.000858> // fast
fsync(5) = 0 <0.000634> // fast
fsync(7) = 0 <0.000619> // term
fsync(7) = 0 <0.000722> // term
fsync(8) = 0 <0.000763> // idx
fsync(8) = 0 <0.000627> // idx
fsync(9) = 0 <0.000612> // pos
fsync(9) = 0 <0.000617> // pos
fsync(4) = 0 <0.000752> // store
fsync(4) = 0 <0.000620> // store
fsync(4 <unfinished ...> // idx directory
<... fsync resumed>) = 0 <0.000756>
First surprise: Because we keep the list of files in managed.json, we end up doing way to much flush calls. Second surprise: The atomic write dance does not do any flush on the temporary file. I am not sure this is safe. Third surprise: Each component file triggers 3 flushes. (1 or 2 was expected).
It might be time to search for ways to fix this.
For the second surprise: tempfile does not do that for us. We need to call sync_all manually. I think we can consider that a bug in tantivy.
We do not really care about metadata. sync_data should be ok. Besides: it does update file metadata when needed (like len).
Some work was done in https://github.com/quickwit-inc/tantivy/commit/c3cc93406d9656a5674c2f62159f00516034710d. It:
- the fixes the bug by adding some sync
- replaces all fsync by fdatasync
- removes some fsyncs.
On linux on a in tiny commit, it does not make much difference, but maybe it helps on MacOS.
Reducing further more the number of syncs in #1228. Now we are down to 16.