Blog
Blog copied to clipboard
ARM64启动汇编代码分析
本次分析基于Linux 5.0. 从bootloader跳转到kernel,kernel对系统状态有一些约定。
- 停止(quiesce)所有具备DMA能力的设备,防止内存被残留的网络包或磁盘数据意外改写
- 通用寄存器传递的参数
- x0 设备树在内存中的物理地址
- x1 = 0 (保留,后面会用到)
- x2 = 0 (保留,后面会用到)
- x3 = 0 (保留,后面会用到)
- CPU模式
- 屏蔽CPU上的所有中断,比如PSTATE寄存器的DAIF域
- CPU必须处在EL2或者非安全模式的EL1
- MMU和高速缓存
- 关闭MMU
- 指令高速缓存可以关闭或者不关闭
- 从PoC角度观察,清除内核镜像加载的地址范围的高速缓存。最简单的办法是数据高速缓存关闭
- 架构时钟。配置CNTFRQ和CNTVOFF寄存器
- 内存一致性
- 系统寄存器。初始化寄存器
ARM64启动汇编代码位于arch/arm64/kernel/head.S
/*
* stext: primary kernel entry point.
*
* Calls the early setup helpers in order (save boot args, leave EL2 if
* needed, record the boot mode, build the early page tables, set up the
* CPU) and then branches to __primary_switch without returning.
* x23 is computed here and consumed later when the kernel image is mapped.
*/
ENTRY(stext)
bl preserve_boot_args
bl el2_setup // Drop to EL1, w0=cpu_boot_mode
adrp x23, __PHYS_OFFSET
and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag
bl __create_page_tables
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
bl __cpu_setup // initialise processor
b __primary_switch // plain branch: control does not come back to stext
ENDPROC(stext)
启动代码的主体框架就是上面这部分代码。
preserve_boot_args()
首先函数preserve_boot_args
保留bootloader传递过来的启动参数.
/*
* Preserve the arguments passed by the bootloader in x0 .. x3
*
* Stores the four entry registers into boot_args, keeps the FDT pointer
* in x21 for later use, and tail-calls __inval_dcache_area with
* x0 = boot_args and x1 = 0x20.
*/
preserve_boot_args:
mov x21, x0 // x21=FDT
adr_l x0, boot_args // record the contents of
stp x21, x1, [x0] // x0 .. x3 at kernel entry
stp x2, x3, [x0, #16]
dmb sy // needed before dc ivac with
// MMU off
mov x1, #0x20 // 4 x 8 bytes
b __inval_dcache_area // tail call
ENDPROC(preserve_boot_args)
首先将x0~x3保存进boot_args数组中,然后使用dmb sy
指令确保后面的data cache invalid指令能看到前面保存x0到x3的动作。
el2_setup()
el2_setup
判断当前是el2还是el1启动,同时w0保存了返回的启动级别。
/*
* If we're fortunate enough to boot at EL2, ensure that the world is
* sane before dropping to EL1.
*
* Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if
* booted in EL1 or EL2 respectively.
*
* Only the "booted in EL1" path is shown in this excerpt; the EL2 path
* starts at local label 1, which is not reproduced here.
*/
ENTRY(el2_setup)
msr SPsel, #1 // We want to use SP_EL{1,2}
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
b.eq 1f // booted at EL2: continue at label 1 (not shown)
mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
msr sctlr_el1, x0
mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1
isb // sctlr_el1 was written above; isb makes sure that system register update has completed
ret
这里为了简单,我们假定启动模式是el1. set_cpu_boot_mode_flag
把el2_setup返回的启动模式(w0)保存到全局变量__boot_cpu_mode中,并对该变量所在的cache line执行invalidate操作。
接下来就是本次要分析的重点函数__create_page_tables
__create_page_tables()
/*
* __create_page_tables: build the early page tables.
*
*  - invalidate, then zero, the range [init_pg_dir, init_pg_end)
*  - create the identity mapping of [__idmap_text_start, __idmap_text_end]
*    in idmap_pg_dir
*  - map the kernel image [_text, _end] in init_pg_dir
*  - invalidate the range [idmap_pg_dir, init_pg_end) again
*
* Reads x23 (KASLR displacement computed by the caller). The return
* address is kept in x28 because lr is overwritten by the bl calls here.
*/
__create_page_tables:
mov x28, lr // save the return address (used by "ret x28" at the end)
/*
* Invalidate the init page tables to avoid potential dirty cache lines
* being evicted. Other page tables are allocated in rodata as part of
* the kernel image, and thus are clean to the PoC per the boot
* protocol.
*/
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0 // x1 = size of the init page table area
bl __inval_dcache_area
/*
* Clear the init page tables.
*/
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
// Each iteration stores 4 x 16 = 64 zero bytes and subtracts 64 from the
// remaining size in x1; the loop ends when x1 reaches zero.
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
subs x1, x1, #64
b.ne 1b
mov x7, SWAPPER_MM_MMUFLAGS // page table entry attributes, passed as "flags" to map_memory
/*
* Create the identity mapping.
*/
adrp x0, idmap_pg_dir // start of the identity-mapping pgd; the linker script reserves 3 contiguous pages for pgd, pud and pmd
adrp x3, __idmap_text_start // __pa(__idmap_text_start)
mov x5, #VA_BITS
1:
adr_l x6, vabits_user
str x5, [x6] // vabits_user = VA_BITS
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
/*
* VA_BITS may be too small to allow for an ID mapping to be created
* that covers system RAM if that is located sufficiently high in the
* physical address space. So for the ID map, use an extended virtual
* range in that case, and configure an additional translation level
* if needed.
*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
* entire ID map region can be mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
* the physical address of __idmap_text_end.
*/
adrp x5, __idmap_text_end
clz x5, x5
cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough?
b.ge 1f // .. then skip VA range extension
adr_l x6, idmap_t0sz
str x5, [x6] // idmap_t0sz = leading-zero count computed above
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
#if (VA_BITS < 48)
...
#else
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
str_l x4, idmap_ptrs_per_pgd, x5
#endif
1:
ldr_l x4, idmap_ptrs_per_pgd // number of entries in the idmap pgd
mov x5, x3 // __pa(__idmap_text_start)
adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
// Identity map: x3 is passed both as vstart and as phys.
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
/*
* Map the kernel image (starting with PHYS_OFFSET).
*/
adrp x0, init_pg_dir
mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD
adrp x6, _end // runtime __pa(_end)
adrp x3, _text // runtime __pa(_text)
sub x6, x6, x3 // _end - _text
add x6, x6, x5 // runtime __va(_end)
// Kernel image: vstart = x5 (virtual), phys = x3 (physical).
map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
/*
* Since the page tables have been populated with non-cacheable
* accesses (MMU disabled), invalidate the idmap and swapper page
* tables again to remove any speculatively loaded cache lines.
*/
adrp x0, idmap_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
dmb sy
bl __inval_dcache_area
ret x28 // return through the address saved on entry
ENDPROC(__create_page_tables)
这里为了分析简便,假定VA_BITS是48位,页面大小是4KB. identity mapping pgd起始地址,链接脚本中分配了3个连续页面来存放pgd,pud,pmd
adrp x0, idmap_pg_dir
// linker script
. = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
这里IDMAP_DIR_SIZE
是3个页面的大小
首先创建的是idmap,为什么需要恒等映射?因为在打开MMU的那一刻,PC里仍然是物理地址,而打开MMU之后的取指都要经过地址翻译;只有把打开MMU前后执行的这段代码做虚拟地址等于物理地址的恒等映射,CPU才能继续正确取指执行,ARM也推荐这么做。
恒等映射过程如下图所示
对内核镜像的映射过程如下图。
这里将kernel image一股脑的映射到vmalloc区
KIMAGE_VADDR
开始的地方。
map_memory宏
宏map_memory
实现比较复杂,但过程还是比较简单。
/*
* Macro to populate page table entries, these entries can be pointers to the next level
* or last level entries pointing to physical memory.
*
* tbl: page table address
* rtbl: pointer to page table or physical memory
* index: start index to write
* eindex: end index to write - [index, eindex] written to
* flags: flags for pagetable entry to or in
* inc: increment to rtbl between each entry
* tmp1: temporary variable
*
* Preserves: tbl, eindex, flags, inc
* Corrupts: index, tmp1
* Returns: rtbl
*
* The loop body runs at least once and repeats while index <= eindex
* (unsigned compare), so both end points are written.
*/
.macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
.Lpe\@: phys_to_pte \tmp1, \rtbl
orr \tmp1, \tmp1, \flags // tmp1 = table entry
str \tmp1, [\tbl, \index, lsl #3] // entries are 8 bytes wide: store at tbl + index * 8
add \rtbl, \rtbl, \inc // rtbl = pa next level
add \index, \index, #1
cmp \index, \eindex
b.ls .Lpe\@ // loop while index <= eindex
.endm
/*
* Compute indices of table entries from virtual address range. If multiple entries
* were needed in the previous page table level then the next page table level is assumed
* to be composed of multiple pages. (This effectively scales the end index).
*
* vstart: virtual address of start of range
* vend: virtual address of end of range
* shift: shift used to transform virtual address into index
* ptrs: number of entries in page table
* istart: index in table corresponding to vstart
* iend: index in table corresponding to vend
* count: On entry: how many extra entries were required in previous level, scales
* our end index.
* On exit: returns how many extra entries required for next page table level
*
* Preserves: vstart, vend, shift, ptrs
* Returns: istart, iend, count
*
* istart and count double as scratch registers while iend is being
* computed, so the order of the instructions below matters.
*/
.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
lsr \iend, \vend, \shift
mov \istart, \ptrs
sub \istart, \istart, #1
and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1)
mov \istart, \ptrs
mul \istart, \istart, \count
add \iend, \iend, \istart // iend += count * ptrs
// our entries span multiple tables
lsr \istart, \vstart, \shift
mov \count, \ptrs
sub \count, \count, #1
and \istart, \istart, \count // istart = (vstart >> shift) & (ptrs - 1)
sub \count, \iend, \istart // count = iend - istart, handed to the next level
.endm
/*
* Map memory for specified virtual address range. Each level of page table needed supports
* multiple entries. If a level requires n entries the next page table level is assumed to be
* formed from n pages.
*
* tbl: location of page table
* rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE)
* vstart: start address to map
* vend: end address to map - we map [vstart, vend]
* flags: flags to use to map last level entries
* phys: physical address corresponding to vstart - physical memory is contiguous
* pgds: the number of pgd entries
*
* Temporaries: istart, iend, tmp, count, sv - these need to be different registers
* Preserves: vstart, vend, flags
* Corrupts: tbl, rtbl, istart, iend, tmp, count, sv
*
* Every level repeats the same two steps: compute_indices derives the
* [istart, iend] slot range for that level, then populate_entries fills
* those slots. sv remembers where the next level's tables begin so that
* tbl can be advanced to them afterwards.
*/
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
add \rtbl, \tbl, #PAGE_SIZE // next-level tables start one page after tbl
mov \sv, \rtbl
mov \count, #0 // top level: no extra entries carried over from a previous level
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv // descend: tbl = first table of the next level
mov \sv, \rtbl
#if SWAPPER_PGTABLE_LEVELS > 3
compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
#endif
compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 // count = phys rounded down to a block boundary, reused as rtbl below
populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
简单说下,compute_indices
其实就是计算虚拟地址中对应页表的Index,如下图。而populate_entries
是用来填充页表项
Virtual Address Physical Memory
+-----------------------------------------------------------------------+ +------------------+
| | PGD Index | PUD Index | PMD Index | PTE Index | Page offset | | |
+-----------------------------------------------------------------------+ | |
63 47 | 38 | 29 | 20 | 11 | 0 | Page N |
| | | | +--------------------+ +---->+------------------+
| | | +---------------------+ | | | |
+------+ | | | | | | |
| | +----------+ | | | |------------------|
+------+ | PGD | | | +---------------->| Physical address |
| ttbr |---->+-------------+ | PUD | | | |------------------|
+------+ | | | | +->+-------------+ | PMD | | | |
| +-------------+ | | | | | +->+-------------+ | PTE | +------------------+
+->| PUD address |----+ +-------------+ | | | | | +->+--------------+ | | |
+-------------+ +--->| PMD address |----+ +-------------+ | | | | | | |
| | +-------------+ +--->| PTE address |----+ +--------------+ | | |
+-------------+ | | +-------------+ +--->| Page address |----+ | |
+-------------+ | | +--------------+ | |
+-------------+ | | | |
+--------------+ +------------------+
Descriptor format
+------------------------------------------------------------------------------------------+
| Upper attributes | Address (bits 47:12) | Lower attributes | Block/table bit | Valid bit |
+------------------------------------------------------------------------------------------+
63 47 11 2 1 0
哪些函数被放进了恒等映射区域?可以通过System.map查看
ffff000011892000 T __idmap_text_start
ffff000011892000 T kimage_vaddr
ffff000011892008 T el2_setup
ffff00001189205c t set_hcr
ffff00001189212c t install_el2_stub
ffff000011892180 t set_cpu_boot_mode_flag
ffff0000118921a4 T secondary_holding_pen
ffff0000118921c8 t pen
ffff0000118921dc T secondary_entry
ffff0000118921e8 t secondary_startup
ffff000011892200 t __secondary_switched
ffff000011892234 T __enable_mmu
ffff00001189228c T __cpu_secondary_check52bitva
ffff000011892290 t __no_granule_support
ffff0000118922b4 t __relocate_kernel
ffff000011892300 t __primary_switch
ffff000011892378 T cpu_resume
ffff000011892398 T __cpu_soft_restart
ffff0000118923d8 T cpu_do_resume
ffff00001189244c T idmap_cpu_replace_ttbr1
ffff000011892480 t __idmap_kpti_flag
ffff000011892484 T idmap_kpti_install_ng_mappings
ffff0000118924c0 t do_pgd
ffff0000118924d8 t next_pgd
ffff0000118924e8 t skip_pgd
ffff00001189251c t walk_puds
ffff000011892524 t do_pud
ffff00001189253c t next_pud
ffff00001189254c t skip_pud
ffff00001189255c t walk_pmds
ffff000011892564 t do_pmd
ffff00001189257c t next_pmd
ffff00001189258c t skip_pmd
ffff00001189259c t walk_ptes
ffff0000118925a4 t do_pte
ffff0000118925c8 t skip_pte
ffff0000118925d8 t __idmap_kpti_secondary
ffff000011892620 T __cpu_setup
ffff0000118926b8 T __idmap_text_end
### __cpu_setup()
接下来就是一些CPU设置的一些初始化.
```asm
/*
* __cpu_setup
*
* Initialise the processor for turning the MMU on. Return in x0 the
* value of the SCTLR_EL1 register.
*
* Writes cpacr_el1, mdscr_el1, mair_el1 and tcr_el1. sctlr_el1 itself is
* not written here; only the value to be programmed is returned in x0.
*/
.pushsection ".idmap.text", "awx" // place this code in the .idmap.text section
ENTRY(__cpu_setup)
tlbi vmalle1 // Invalidate local TLB
dsb nsh
mov x0, #3 << 20
msr cpacr_el1, x0 // Enable FP/ASIMD
mov x0, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x0 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
reset_pmuserenr_el0 x0 // Disable PMU access from EL0
/*
* Memory region attributes for LPAE:
*
* n = AttrIndx[2:0]
* n MAIR
* DEVICE_nGnRnE 000 00000000
* DEVICE_nGnRE 001 00000100
* DEVICE_GRE 010 00001100
* NORMAL_NC 011 01000100
* NORMAL 100 11111111
* NORMAL_WT 101 10111011
*/
ldr x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \
MAIR(0x04, MT_DEVICE_nGnRE) | \
MAIR(0x0c, MT_DEVICE_GRE) | \
MAIR(0x44, MT_NORMAL_NC) | \
MAIR(0xff, MT_NORMAL) | \
MAIR(0xbb, MT_NORMAL_WT)
msr mair_el1, x5
/*
* Prepare SCTLR
*/
mov_q x0, SCTLR_EL1_SET // x0 is not modified again below: this is the return value
/*
* Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
* both user and kernel.
*
* NOTE(review): the "512GB (39-bit)" figure looks stale. The size field
* below is built from TCR_TxSZ(VA_BITS), so the range presumably follows
* VA_BITS (the surrounding article assumes VA_BITS == 48) - confirm.
*/
ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
#ifdef CONFIG_ARM64_USER_VA_BITS_52
// ...
#else
ldr_l x9, idmap_t0sz // value stored by __create_page_tables when the ID map needs an extended range
#endif
tcr_set_t0sz x10, x9
/*
* Set the IPS bits in TCR_EL1.
*/
tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable hardware update of the Access Flags bit.
* Hardware dirty bit management is enabled later,
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
and x9, x9, #0xf // keep only the low 4-bit field
cbz x9, 1f // field is zero: leave TCR_HA clear
orr x10, x10, #TCR_HA // hardware Access flag update
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr tcr_el1, x10
ret // return to head.S
ENDPROC(__cpu_setup)
首先可以看到这个函数被放入了idmap.text
段,然后是一些系统寄存器的设置,接下来是内存属性相关设置,主要是mair_el1
寄存器,再然后是SCTLR和TCR、TTBR相关寄存器设置,并把准备好的SCTLR_EL1的值保存在x0寄存器然后返回。
__primary_switch()
最后来看下__primary_switch
函数
/*
* __primary_switch: turn the MMU on and jump to __primary_switched.
*
* On entry x0 holds the SCTLR_EL1 value returned by __cpu_setup; it is
* passed on to __enable_mmu together with x1 = init_pg_dir.
* This excerpt is truncated ("// ..."), so the closing #endif lines and
* the rest of the routine are not shown.
*/
__primary_switch:
#ifdef CONFIG_RANDOMIZE_BASE
mov x19, x0 // preserve new SCTLR_EL1 value
mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value
#endif
adrp x1, init_pg_dir // x1 = page table that __enable_mmu loads into TTBR1_EL1
bl __enable_mmu
#ifdef CONFIG_RELOCATABLE
bl __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
ldr x8, =__primary_switched // literal-pool load of the symbol's address (not PC-relative)
adrp x0, __PHYS_OFFSET // x0 is used by __primary_switched to compute kimage_voffset
blr x8
// ...
ENDPROC(__primary_switch)
首先x0保存了sctlr_el1
,x1保存了内核镜像映射的pgd,然后作为参数传递给__enable_mmu.
__enable_mmu()
/*
* __enable_mmu: install the translation table bases and write SCTLR_EL1.
*
*  x0 = value to write to SCTLR_EL1
*  x1 = page table address to load into TTBR1_EL1
*
* TTBR0_EL1 is loaded with idmap_pg_dir. If the translation granule field
* read from ID_AA64MMFR0_EL1 is not ID_AA64MMFR0_TGRAN_SUPPORTED, control
* goes to __no_granule_support instead.
*/
ENTRY(__enable_mmu)
mrs x2, ID_AA64MMFR0_EL1
ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 // extract the 4-bit granule field
cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
b.ne __no_granule_support
update_early_cpu_boot_status 0, x2, x3
adrp x2, idmap_pg_dir
phys_to_ttbr x1, x1
phys_to_ttbr x2, x2
msr ttbr0_el1, x2 // load TTBR0
offset_ttbr1 x1
msr ttbr1_el1, x1 // load TTBR1
isb // both TTBR writes must complete before SCTLR_EL1 is changed
msr sctlr_el1, x0 // write the SCTLR_EL1 value supplied by the caller
isb
/*
* Invalidate the local I-cache so that any instructions fetched
* speculatively from the PoC are discarded, since they may have
* been dynamically patched at the PoU.
*/
ic iallu
dsb nsh
isb
ret
ENDPROC(__enable_mmu)
首先读取ID_AA64MMFR0_EL1
的bit28到bit31位(TGRAN4域),判断硬件是否支持4KB页面粒度,不支持则跳转到__no_granule_support。然后把当前cpu boot status保存进__early_cpu_boot_status
全局变量。接下来就是将前面设置好的页表基地址分别保存到ttbr0_el1
和ttbr1_el1
,最后就是打开mmu了。
__primary_switched()
注意,在__enable_mmu
函数返回后,现在mmu已经打开了,CPU发出的都是虚拟地址。
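这里为什么用ldr来加载=__primary_switched
的地址,然后使用blr
来跳转?因为此时PC仍然运行在恒等映射(物理地址)区域,b/bl和adrp都是PC相对寻址,得到的仍是物理地址;而ldr x8, =__primary_switched从literal pool中取出的是符号的虚拟地址,通过blr跳过去之后,CPU才真正开始在内核虚拟地址空间中运行。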
/*
* __primary_switched: first code executed at the kernel's virtual address.
*
* Expects:
*  x0  = address loaded by the caller with "adrp x0, __PHYS_OFFSET"
*  x21 = FDT pointer saved by preserve_boot_args
*  x23 = KASLR offset bits
*
* Sets up the initial stack and sp_el0, installs the vector table, saves
* __fdt_pointer and kimage_voffset, clears BSS and branches to
* start_kernel. With CONFIG_RANDOMIZE_BASE it may instead return to the
* caller when kaslr_early_init returns a non-zero offset.
*/
__primary_switched:
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE // sp = init_thread_union + THREAD_SIZE
adr_l x5, init_task
msr sp_el0, x5 // Save thread_info
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb
stp xzr, x30, [sp, #-16]! // push a 16-byte frame holding {0, lr}
mov x29, sp
str_l x21, __fdt_pointer, x5 // Save FDT pointer
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr // fill value 0
adr_l x2, __bss_stop
sub x2, x2, x0 // length = __bss_stop - __bss_start
bl __pi_memset
dsb ishst // Make zero page visible to PTW
#ifdef CONFIG_RANDOMIZE_BASE
tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
b.ne 0f
mov x0, x21 // pass FDT address in x0
bl kaslr_early_init // parse FDT for KASLR options
cbz x0, 0f // KASLR disabled? just proceed
orr x23, x23, x0 // record KASLR offset
ldp x29, x30, [sp], #16 // we must enable KASLR, return
ret // to __primary_switch()
0:
#endif
add sp, sp, #16 // drop the frame pushed above
mov x29, #0 // clear the frame pointer and
mov x30, #0 // link register before entering C code
b start_kernel // plain branch, no link
ENDPROC(__primary_switched)
__primary_switched
函数的流程比较简单,先设置sp指针指向init_thread_union的栈顶,把0号进程的task struct(init_task)保存到sp_el0,然后设置异常向量表,保存FDT指针和kimage_voffset,并清理BSS段,
最后就跳到著名的start_kernel
里面去执行了。
参考资料
ARM64 Kernel Image Mapping的变化 Documentation/arm64/booting.txt