[BUG]: algorithm.parallelize crashes program
Bug description
0.00033999278648190045
12.336449968250383 GFLOP/s
[114667:114667:20231209,143503.671761:ERROR file_io_posix.cc:144] open /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq: No such file or directory (2)
[114667:114667:20231209,143503.671869:ERROR file_io_posix.cc:144] open /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq: No such file or directory (2)
Stack dump:
0.  Program arguments: mojo main.mojo
The program started crashing when I added a for loop to `main`.
Steps to reproduce
from algorithm import parallelize, vectorize
from tensor import Tensor
from utils.index import Index
from sys.info import simdwidthof
from random import rand
import benchmark

alias type = DType.float32
alias nelts = simdwidthof[type]()
alias M = 128
alias N = 128
alias K = 128

@always_inline
fn bench[
    func: fn (inout Tensor[type], Tensor[type], Tensor[type]) -> None
]():
    var A = rand[type](M, K)
    var B = rand[type](K, N)
    var C = Tensor[type](M, N)

    @always_inline
    @parameter
    fn test_fn():
        _ = func(C, A, B)

    let secs = benchmark.run[test_fn](max_runtime_secs=1).mean()
    let gflops = ((2 * M * N * K) / secs) / 1e9
    print(secs)
    print(gflops, "GFLOP/s")

fn matmul_parallelized(inout C: Tensor[type], A: Tensor[type], B: Tensor[type]):
    @parameter
    fn calc_row(m: Int):
        for k in range(K):
            @parameter
            fn func[nelts: Int](n: Int):
                C.simd_store[nelts](
                    m * M + n,
                    C.simd_load[nelts](m * M + n) + A.simd_load[nelts](m * M + k) * B.simd_load[nelts](k * K + n)
                )

            vectorize[nelts, func](N)

    parallelize[calc_row](M, M)

fn main():
    for i in range(5):
        bench[matmul_parallelized]()
System information
- What OS did you install Mojo on?
* Windows 11
- Provide version information for Mojo by pasting the output of `mojo -v`
* mojo 0.6.0 (d55c0025)
- Provide Modular CLI version by pasting the output of `modular -v`
* modular 0.2.2 (95d42445)
I'm not able to reproduce the crash. Does it always crash for you or just sometimes? Do you get the correct output from running the program aside from the "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq" errors?
@ematejska I reproduced the crash on Docker, Intel Mac. I'm not sure if the results are correct; however, some results did get produced before the crash. The trace reads:
0.0014979107597069598
2.8001027249584229 GFLOP/s
0.0018947687330543932
2.2136231862126645 GFLOP/s
[45173:45173:20231213,024209.681814:ERROR file_io_posix.cc:144] open /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq: No such file or directory (2)
[45173:45173:20231213,024209.681932:ERROR file_io_posix.cc:144] open /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq: No such file or directory (2)
[45173:45174:20231213,024209.684584:ERROR directory_reader_posix.cc:42] opendir /home/vscode/.modular/crashdb/attachments/4220a3bb-b1e2-4b6a-b6aa-e715745514ce: No such file or directory (2)
Please submit a bug report to https://github.com/modularml/mojo/issues and include the crash backtrace along with all the relevant source codes.
Stack dump:
0. Program arguments: /home/vscode/.modular/pkg/packages.modular.com_mojo/bin/mojo /workspaces/ubuntu/issues/1450.mojo
^C
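(For what it's worth, the numbers that did get printed are self-consistent with the formula in `bench`: 2 * M * N * K = 2 * 128^3 = 4,194,304 floating-point ops, and 4,194,304 / 0.0014979 s ≈ 2.80e9, i.e. the 2.8001 GFLOP/s shown above. So the timing output looks plausible up to the point of the crash.)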
It's related to early destruction of `C` (inserting `_ = C` fixes the problem; see the sketch after the example below). Mojo destroys a value immediately after its last use, so `C` can be torn down while worker threads are still writing to it. It only crashes with certain tensor shapes.
from algorithm import parallelize
from tensor import Tensor
import benchmark

fn main():
    var t = Tensor[DType.float32](100, 100)

    @parameter
    fn test_fn():
        @parameter
        fn calc_row(m: Int):
            t.simd_store[1](m, 0.0)

        parallelize[calc_row](1, 10)

    let secs = benchmark.run[test_fn](max_runtime_secs=0.1).mean()
    print(secs)
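For reference, here is the same minimal example with the workaround applied; this is just a sketch, with `_ = t` playing the role of the `_ = C` fix described above (the trailing discard extends the tensor's lifetime past the parallelized work):

from algorithm import parallelize
from tensor import Tensor
import benchmark

fn main():
    var t = Tensor[DType.float32](100, 100)

    @parameter
    fn test_fn():
        @parameter
        fn calc_row(m: Int):
            t.simd_store[1](m, 0.0)

        parallelize[calc_row](1, 10)

    let secs = benchmark.run[test_fn](max_runtime_secs=0.1).mean()
    print(secs)
    _ = t  # keep t alive here; otherwise Mojo destroys it after its last use inside test_fn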
Less minimal example:
from runtime.asyncrt import DeviceContextPtr, TaskGroup, parallelism_level
from os.atomic import Atomic
from time import sleep

fn run[
    thread_a_part1: fn () capturing -> Int,
    thread_a_part2: fn () capturing -> Int,
    thread_b_part1: fn () capturing -> Int,
    thread_b_part2: fn () capturing -> Int,
]():
    """Alternate work on two threads.

    While Thread a is doing part1, Thread b is doing part2,
    and then flip.
    """
    var a_1_done = Atomic[DType.uint8](0)
    var a_2_done = Atomic[DType.uint8](0)
    var b_1_done = Atomic[DType.uint8](1)
    var b_2_done = Atomic[DType.uint8](1)
    var shutdown = Atomic[DType.uint8](0)
    alias SLEEP_TIME = 0.1

    @parameter
    @always_inline
    async fn thread_a():
        while True:
            if b_2_done.load() == 1:
                _ = b_2_done.fetch_sub(1)
                if thread_a_part2() < 0:
                    _ = shutdown.fetch_add(1)
                _ = a_2_done.fetch_add(1)
            if shutdown.load() > 0:
                print("shutting down a")
                break
            if b_1_done.load() == 1:
                _ = b_1_done.fetch_sub(1)
                if thread_a_part1() < 0:
                    _ = shutdown.fetch_add(1)
                _ = a_1_done.fetch_add(1)
            sleep(SLEEP_TIME)

    @parameter
    @always_inline
    async fn thread_b():
        while True:
            if a_2_done.load() == 1:
                _ = a_2_done.fetch_sub(1)
                if thread_b_part2() < 0:
                    _ = shutdown.fetch_add(1)
                _ = b_2_done.fetch_add(1)
            if shutdown.load() > 0:
                print("shutting down b")
                break
            if a_1_done.load() == 1:
                _ = a_1_done.fetch_sub(1)
                if thread_b_part1() < 0:
                    _ = shutdown.fetch_add(1)
                _ = b_1_done.fetch_add(1)
            sleep(SLEEP_TIME)

    var tg = TaskGroup()
    tg.create_task(thread_a())
    tg.create_task(thread_b())
    tg.wait()

def main():
    var from_values = List[Int](1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    var a_to_values = List[Int]()
    var b_to_values = List[Int]()

    @parameter
    fn a_part1() capturing -> Int:
        if len(from_values) > 0:
            a_to_values.append(from_values.pop())
            return 0
        else:
            return -1

    @parameter
    fn b_part1() capturing -> Int:
        if len(from_values) > 0:
            b_to_values.append(from_values.pop())
            return 0
        else:
            return -1

    @parameter
    fn a_part2() capturing -> Int:
        if len(a_to_values) > 0:
            print("a", a_to_values[0])
            a_to_values.clear()
        return 0

    @parameter
    fn b_part2() capturing -> Int:
        if len(b_to_values) > 0:
            print("b", b_to_values[0])
            b_to_values.clear()
        return 0

    run[a_part1, a_part2, b_part1, b_part2]()

    # Without extending these variables, I get the same opaque compiler error as reported above.
    _ = a_to_values
    _ = b_to_values
    _ = from_values