rt-thread icon indicating copy to clipboard operation
rt-thread copied to clipboard

rt_mutex_take函数BUG,多互斥锁嵌套导致 “thread == rt_thread_self()” 检查过不去

Open yuqingli05 opened this issue 3 years ago • 6 comments

使用rtthread 版本 SHA-1: aaf5462c6d4ae9466a0a8cd2a4bb00cfe1fef2f2

我自己测试百分之百复现的测试代码

/* Created in main(); taken by threadA and, nested inside mutex2, by threadB. */
static rt_mutex_t mutex1 = RT_NULL;
/* Created in main(); taken by threadB (outer lock) and by threadC. */
static rt_mutex_t mutex2 = RT_NULL;
/*
 * Entry of "threadA" (started from main()).
 *
 * After an initial delay of 300 it takes mutex1, keeps it across a delay of
 * 2000 and releases it again. A numbered trace line is printed before the
 * take and after every following step so the console log shows how far the
 * thread got.
 *
 * pvParameter is not used.
 */
static void threadA(void *pvParameter)
{
	const char *const tag = __func__;	/* prefix of every trace line */

	/* Step 1: let the initial delay pass, then announce the take attempt. */
	rt_thread_delay(300);
	rt_kprintf("%s 1\n", tag);

	/* Step 2: block without timeout until mutex1 is ours. */
	rt_mutex_take(mutex1, RT_WAITING_FOREVER);
	rt_kprintf("%s 2\n", tag);

	/* Step 3: keep mutex1 held while the other threads start competing. */
	rt_thread_delay(2000);
	rt_kprintf("%s 3\n", tag);

	/* Step 4: hand mutex1 back. */
	rt_mutex_release(mutex1);
	rt_kprintf("%s 4\n", tag);
}
/*
 * Entry of "threadB" (started from main()).
 *
 * After an initial delay of 600 it takes mutex2 and then, while still
 * holding mutex2, takes mutex1 (the nested acquisition the issue is about).
 * Both are kept across a delay of 2000 and released in reverse order.
 * Numbered trace lines mark the progress on the console.
 *
 * pvParameter is not used.
 */
static void threadB(void *pvParameter)
{
	const char *const tag = __func__;	/* prefix of every trace line */

	/* Step 1: let the initial delay pass, then announce the take attempts. */
	rt_thread_delay(600);
	rt_kprintf("%s 1\n", tag);

	/* Step 2: outer lock first, inner lock second; both wait forever. */
	rt_mutex_take(mutex2, RT_WAITING_FOREVER);
	rt_mutex_take(mutex1, RT_WAITING_FOREVER);
	rt_kprintf("%s 2\n", tag);

	/* Step 3: keep both mutexes held for a while. */
	rt_thread_delay(2000);
	rt_kprintf("%s 3\n", tag);

	/* Step 4: release inner lock, then outer lock. */
	rt_mutex_release(mutex1);
	rt_mutex_release(mutex2);
	rt_kprintf("%s 4\n", tag);
}
/*
 * Entry of "threadC" (started from main()).
 *
 * After an initial delay of 900 it takes mutex2, keeps it across a delay of
 * 2000 and releases it again. Numbered trace lines mark the progress on the
 * console; in the log attached to the issue the assertion fires right after
 * this thread prints its step 1.
 *
 * pvParameter is not used.
 */
static void threadC(void *pvParameter)
{
	const char *const tag = __func__;	/* prefix of every trace line */

	/* Step 1: let the initial delay pass, then announce the take attempt. */
	rt_thread_delay(900);
	rt_kprintf("%s 1\n", tag);

	/* Step 2: block without timeout until mutex2 is ours. */
	rt_mutex_take(mutex2, RT_WAITING_FOREVER);
	rt_kprintf("%s 2\n", tag);

	/* Step 3: keep mutex2 held for a while. */
	rt_thread_delay(2000);
	rt_kprintf("%s 3\n", tag);

	/* Step 4: hand mutex2 back. */
	rt_mutex_release(mutex2);
	rt_kprintf("%s 4\n", tag);
}

/*
 * Test driver: creates the two mutexes, starts threadA, threadB and threadC
 * in that order, prints a banner and then idles forever in a delay loop.
 *
 * Never returns.
 */
int main(void)
{
	/*
	 * One row per test thread, listed in start order. "level" is passed as
	 * the fifth rt_thread_create() argument (the thread priority according
	 * to the RT-Thread API - confirm against the kernel headers in use).
	 */
	const struct
	{
		const char *name;
		void (*entry)(void *);
		int level;
	} workers[] = {
		{"threadA", threadA, 10},
		{"threadB", threadB, 9},
		{"threadC", threadC, 8},
	};
	rt_thread_t tid;
	unsigned int idx;

	/* Both mutexes must exist before any worker can run. */
	mutex1 = rt_mutex_create("mutex1", RT_IPC_FLAG_FIFO);
	RT_ASSERT(mutex1);
	mutex2 = rt_mutex_create("mutex2", RT_IPC_FLAG_FIFO);
	RT_ASSERT(mutex2);

	/* Create and start every worker with identical stack size and last argument. */
	for (idx = 0; idx < sizeof(workers) / sizeof(workers[0]); idx++)
	{
		tid = rt_thread_create(workers[idx].name,
							   workers[idx].entry,
							   RT_NULL,
							   2048,
							   workers[idx].level,
							   5);
		RT_ASSERT(tid != RT_NULL);
		rt_thread_startup(tid);
	}

	printf("main test ......\n");

	/* Nothing left to do here; keep the main thread alive. */
	for (;;)
	{
		rt_thread_delay(1000);
	}
}

控制台问题点输出日志

threadA 1
threadA 2
threadB 1
Enter mutex->owner = threadA
Exit mutex->owner = threadA
threadC 1
Enter mutex->owner = threadB
(thread == rt_thread_self()) assertion failed at function:_rt_thread_suspend, line number:885

Enter mutex->owner = threadA Exit mutex->owner = threadA Enter mutex->owner = threadB

这三行是我自己在内核代码里加的输出,用于定位出问题的点。具体是在 rt_mutex_take 函数中添加了两行输出:

/*
 * Excerpt of the kernel's rt_mutex_take() as quoted by the issue reporter.
 * Every "..." stands for code the reporter left out, so this is not a
 * compilable function. The two rt_kprintf() calls are the reporter's own
 * debug output, added to locate the failure; they are not kernel code.
 */
rt_err_t rt_mutex_take(rt_mutex_t mutex, rt_int32_t timeout)
{
	...
    /* NOTE(review): "thread" is presumably the calling thread; its
     * assignment is in the elided part above - confirm. */
    if (mutex->owner == thread)
    {
		...
    }
    else
    {
        /* whether the mutex has owner thread. */
        if (mutex->owner == RT_NULL)
        {
			...
        }
        else
        {
            /* no waiting, return with timeout */
            if (timeout == 0)
            {
				...
            }
            else
            {
                ...
                /* update the priority level of mutex */
                if (priority < mutex->priority)
                {
                    mutex->priority = priority;
                    /* The owner's priority is only touched when the new
                     * mutex priority value is below the owner's current one. */
                    if (mutex->priority < mutex->owner->current_priority)
                    {
                        /* Reporter's debug trace: printed only when the
                         * owner is a thread other than the running one. */
                        if(mutex->owner != rt_thread_self())
                            rt_kprintf("Enter mutex->owner = %s\n",mutex->owner->name);
                        /* According to the issue's analysis this call can
                         * end up in rt_thread_suspend() for a thread that is
                         * not the caller, which trips the
                         * "thread == rt_thread_self()" assertion. */
                        _thread_update_priority(mutex->owner, priority);
                        /* Reporter's debug trace: in the failing log this
                         * line is never reached for owner threadB. */
                        if(mutex->owner != rt_thread_self())
                            rt_kprintf("Exit mutex->owner = %s\n",mutex->owner->name);
                    }
                }
                ...
            }
        }
    }
	...
    return RT_EOK;
}

问题分析:调用链为 rt_mutex_take >> _thread_update_priority(ipc.c 第 1202 行调用)>> _ipc_list_suspend(ipc.c 第 770 行调用)>> rt_thread_suspend(ipc.c 第 129 行调用)。到这里已经能看出问题了:rt_thread_suspend 不允许挂起其他线程,只能挂起当前线程自己。而经过这一串调用,最后传入的参数并不是当前线程,导致 “thread == rt_thread_self()” 检查过不去,程序死锁。

yuqingli05 avatar Dec 03 '22 22:12 yuqingli05

补一个问题说明:当我使用三个线程,两个互斥锁的时候,程序老是死锁,日志输出 “(thread == rt_thread_self()) assertion failed at function:_rt_thread_suspend, line number:885”。三个线程 和 两个互斥锁的用法 看上面100% 能复现问题的代码。

yuqingli05 avatar Dec 03 '22 22:12 yuqingli05

@BernardXiong

yuqingli05 avatar Dec 03 '22 22:12 yuqingli05

@mysterywolf

yuqingli05 avatar Dec 03 '22 22:12 yuqingli05

好的,感谢反馈,我们看一下~

mysterywolf avatar Dec 04 '22 00:12 mysterywolf

谢谢,这部分后续也加dlog分析下

BernardXiong avatar Dec 04 '22 16:12 BernardXiong

谢谢,这部分后续也加dlog分析下

我提交了一个PR来修复这个问题,不知道你们有更好的方法吗

yuqingli05 avatar Dec 04 '22 16:12 yuqingli05

我基于最新的代码测试了一下,好像没有复现。 image

Guozhanxin avatar Feb 02 '23 11:02 Guozhanxin

我基于最新的代码测试了一下,好像没有复现。 图像

我刚用qemu工程测试,问题依旧。测试代码是上面的。你的测试代码不够新,4.1.1是可以的。我现有项目已经退回到4.1.1 工程了。这个问题是后面加入新特性导致的 image

yuqingli05 avatar Feb 12 '23 16:02 yuqingli05

@BernardXiong

@mysterywolf

你好,这个问题还没有更新吗?我认为这个问题挺严重的,涉及到内核互斥锁的工程,嵌套互斥锁之后很容易出问题。问题只出现在 4.1.1 之后。经过测试4.1.1 是没问题的。

yuqingli05 avatar Feb 12 '23 16:02 yuqingli05

我用最新代码复现出来了

mysterywolf avatar Feb 12 '23 19:02 mysterywolf

我用最新代码复现出来了

什么时候能解决这个问题呢?我曾经提交过一个 PR 来解决这个问题,但是那个 PR 只是简单地针对这个问题,可能考虑得不是很全面。

yuqingli05 avatar Feb 13 '23 02:02 yuqingli05

感谢 已经有人在看了

mysterywolf avatar Feb 13 '23 02:02 mysterywolf

查看ipc有更新,master最新代码再次测试未复现: stm32f103-atk-nano,qemu image

yangjie11 avatar Mar 24 '23 07:03 yangjie11