Possible heap corruption when allocating after running multiple threads
Description
I apologize in advance for having a lot of code here, I've been trying to narrow this down to a simpler repro case, but I keep running into situations where removing seemingly unrelated parts of the code stops the bug from triggering.
I'm seeing inconsistent segfaults in system/alloc.nim rawAlloc after running some threaded code. All of the created threads should have been joined by the time the segfault triggers, it's like the heap is getting corrupted.
I've been able to reproduce by running the binary 100 times in a row to have higher confidence in whether a change I made affected it:
nim c main.nim
for i in {1..100}; do
./main || break
done
Repro code:
main.nim:
import md5
import std/atomics
import std/cpuinfo
import std/monotimes
import strformat
import strutils
import sugar
import terminal
import times
const batchSize = 8000
type Shared = ref object
prefix: string
done: Atomic[bool]
counter: Atomic[uint32]
sol: Atomic[uint32]
proc worker(arg: (Shared, int)) {.thread.} =
let (s, v) = arg
while not s.done.load(moRelaxed):
let offset = s.counter.fetchAdd(batchSize, moRelaxed)
for n in 0'u32 ..< batchSize:
let i = offset + n
let h = (s.prefix & $i).toMD5
if h[0] == 0 and h[1] == 0:
var val = s.sol.load(moRelaxed)
while i < val and not s.sol.compareExchangeWeak(val, i, moRelaxed):
discard
s.done.store(true, moRelaxed)
proc solve(input: string): uint32 =
let s = Shared(prefix: input)
s.sol.store(uint32.high)
var thr: array[20, Thread[(Shared, int)]]
doAssert countProcessors() < thr.len
for i in 0 ..< countProcessors():
# Needs to have tuple args to fail
createThread(thr[i], worker, (s, 15))
joinThreads(thr)
s.sol.load
proc part1(input: string): string =
$input.solve
proc colorizeTime(t: float): string =
# Crash happens here
let s = fmt"{t:.3f}"
if t < 0.5:
ansiForegroundColorCode(fgGreen) & s & ansiResetCode
elif t < 1.0:
ansiForegroundColorCode(fgYellow) & s & ansiResetCode
else:
ansiForegroundColorCode(fgRed) & s & ansiResetCode
proc timeit(f: (string) -> string, inp: string): (string, float) =
let startT = getMonoTime()
var ans = f(inp)
let endT = getMonoTime()
let t = endT - startT
return (ans, float(t.inMicroseconds) / 1_000_000)
proc run() =
# Needs the strip call to fail
let contents = readFile("input.txt").strip
let (ans, t) = timeit(part1, contents)
stdout.write fmt"Part 1: "
echo "$1 Elapsed time $2 seconds".format(align(ans, 54), colorizeTime(t))
run()
input.txt:
ckczppom
I tried inlining the contents of input.txt into the source, but that also stopped the crashes.
Nim Version
Nim Compiler Version 2.0.2 [MacOSX: arm64] Compiled at 2023-12-15 Copyright (c) 2006-2023 by Andreas Rumpf
active boot switches: -d:release -d:nimUseLinenoise
Current Output
Traceback (most recent call last)
/Users/dan/Code/nim_threading_test/main.nim(69) main
/Users/dan/Code/nim_threading_test/main.nim(67) run
/Users/dan/Code/nim_threading_test/main.nim(49) colorizeTime
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/system/alloc.nim(1052) alloc
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/system/alloc.nim(868) rawAlloc
SIGSEGV: Illegal storage access. (Attempt to read from nil?)
Expected Output
Part 1: 34311 Elapsed time 0.NNN seconds
Possible Solution
No response
Additional Information
I can also reproduce compiling with -d:useMalloc, which results in the following trace instead:
main(92449,0x1d6c2d000) malloc: Heap corruption detected, free list is damaged at 0x600002ef8360
*** Incorrect guard value: 221136115696464
main(92449,0x1d6c2d000) malloc: *** set a breakpoint in malloc_error_break to debug
Part 1: Traceback (most recent call last)
/Users/dan/Code/nim_threading_test/main.nim(69) main
/Users/dan/Code/nim_threading_test/main.nim(67) run
/Users/dan/Code/nim_threading_test/main.nim(47) colorizeTime
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/pure/strformat.nim(519) formatValue
/opt/homebrew/Cellar/nim/2.0.2/nim/lib/pure/strutils.nim(2485) formatBiggestFloat
SIGABRT: Abnormal termination.
I can also reproduce compiling with --mm:arc, but cannot reproduce running with --mm:refc (though refc seems to be giving an incorrect answer, that seems like a separate issue).
If anyone is having trouble reproducing and would like extra debugging information, let me know what I can do to supplement.
I'd like to blame it on sharing a ref across threads given the following works on my end.
import md5
import std/[atomics, cpuinfo, monotimes, strformat, strutils, sugar, terminal, times]
const batchSize = 8000
type
SharedObj = object
prefix: string
done: Atomic[bool]
counter: Atomic[uint32]
sol: Atomic[uint32]
Shared = ref SharedObj
proc worker(arg: (ptr SharedObj, int)) {.thread.} =
let (s, v) = arg
while not s.done.load(moRelaxed):
let offset = s.counter.fetchAdd(batchSize, moRelaxed)
for n in 0'u32 ..< batchSize:
let i = offset + n
let h = (s.prefix & $i).toMD5
if h[0] == 0 and h[1] == 0:
var val = s.sol.load(moRelaxed)
while i < val and not s.sol.compareExchangeWeak(val, i, moRelaxed):
discard
s.done.store(true, moRelaxed)
proc solve(input: string): uint32 =
let s = Shared(prefix: input)
s.sol.store(uint32.high)
var thr: array[20, Thread[(ptr SharedObj, int)]]
doAssert countProcessors() < thr.len
for thr in thr.toOpenArray(0, countProcessors() - 1).mitems:
createThread(thr, worker, (cast[ptr SharedObj](s), 15))
joinThreads(thr.toOpenArray(0, countProcessors() - 1))
s.sol.load
proc part1(input: string): string =
$input.solve
proc colorizeTime(t: float): string =
# Crash happens here
let s = fmt"{t:.3f}"
if t < 0.5:
ansiForegroundColorCode(fgGreen) & s & ansiResetCode
elif t < 1.0:
ansiForegroundColorCode(fgYellow) & s & ansiResetCode
else:
ansiForegroundColorCode(fgRed) & s & ansiResetCode
proc timeit(f: (string) -> string, inp: string): (string, float) =
let startT = getMonoTime()
var ans = f(inp)
let endT = getMonoTime()
let t = endT - startT
return (ans, float(t.inMicroseconds) / 1_000_000)
proc run() =
# Needs the strip call to fail
let contents = readFile("input.txt").strip
let (ans, t) = timeit(part1, contents)
stdout.write fmt"Part 1: "
echo "$1 Elapsed time $2 seconds".format(align(ans, 54), colorizeTime(t))
run()
Ah yeah that seems to be the problem. Switching to a SharedPtr in threading removes the crash. Perhaps this can be signaled more prominently in the threading docs? An example in https://nim-lang.org/docs/typedthreads.html for something that isn't copied by value would be helpful, I had to search around to find that library, and https://forum.nim-lang.org/t/9378 which I found more helpful than the docs.