Optional 386 support
I was wondering whether it would be of interest to start using conditional assembly directives to optionally enable 386 instructions in real mode. This would improve div64 and mul64 a lot, and also some other operations that transfer memory. Something like this:
; RepStos - copy CX bytes from DS:SI to ES:DI with the string-move
; instructions (despite its name the macro expands to MOVS, not STOS).
;
; In:    cx    = byte count (may be 0)
;        ds:si = source, es:di = destination, DF = copy direction
; Out:   cx    = 0, si/di advanced past the copied bytes
; Clobb: flags
;
; `cpu` is expected to be defined as a quoted string (e.g. '8086').
; Any value other than '8086' selects the 386+ path.
;
; The code runs in a 16-bit code segment, so REP uses CX (a16) as its
; counter even for MOVSD; the count is therefore kept in CX throughout
; and the upper half of ECX is never relied upon.
%macro RepStos 0
%ifidni cpu, '8086'
        ; 8086 path: word copy, then the odd byte if there is one.
        shr     cx, 1                   ; cx = word count, CF = odd byte
        rep movsw                       ; MOVS leaves flags alone; cx = 0 after
        adc     cx, cx                  ; cx = 0 + 0 + CF -> 0 or 1
        rep movsb
%else
        ; 386+ path: dword copy, then the 0..3 trailing bytes.
        push    cx                      ; keep the full byte count
        shr     cx, 2                   ; cx = dword count
        rep movsd
        pop     cx
        and     cx, 3                   ; cx = bytes left over
        rep movsb
%endif
%endmacro
(l)MS-DOS contains a function in its bios/msdisk.nas file that can use rep movsd if running on a 386, and is patched to fall back to rep movsw otherwise: https://hg.pushbx.org/ecm/msdos4/file/440e4eb392d0/src/BIOS/msdisk.nas#l2618
(l)MS-DOS contains a function in its bios/msdisk.nas file that can use
rep movsd if running on a 386, and is patched to fall back to rep movsw otherwise: https://hg.pushbx.org/ecm/msdos4/file/440e4eb392d0/src/BIOS/msdisk.nas#l2618
Sounds quite interesting, and maybe it is a better approach to select them at runtime instead of at assembly time as I proposed.
By the way, your example is flawed. In a 16-bit CS (including running in Real/Virtual 86 Mode) the default for movsd is a16, so it will use cx as a counter despite the 386+ dword operand size (o32). So you shouldn't operate on ecx as the counter but rather cx.
This would be useful for div32 and mul32 that can be further simplified on 386:
` %ifidni cpu, '8086'
; div32, 8086 variant: shift-and-subtract loop over 32 iterations that
; works on words addressed relative to bp.
; NOTE(review): the calling convention is not visible here; the
; argument/result offsets below are taken as written - confirm against
; the callers.
; NOTE(review): `word ptr` is MASM-style syntax while %ifidni is a NASM
; directive - confirm which assembler this is meant for.
; NOTE(review): only bx is saved; bp is overwritten with sp and never
; restored, and di is used as the loop counter without being saved.
div32: push bx mov bp, sp
; cache 32-bit divisor
mov cx, word ptr [bp+14]
mov bx, word ptr [bp+12]
; zero 32-bit quotient
mov word ptr [bp+ 6], 0
mov word ptr [bp+ 4], 0
; clear remainder in DX:AX
xor dx, dx
xor ax, ax
mov di, 32 ; 32 iterations
; NOTE(review): the shift chain written on the next line runs through
; [bp+8], [bp+10], [bp+12] and [bp+14], i.e. it also shifts the two
; words that were just read as the divisor.
div32_loop: ; shift dividend ←→ CF shl word ptr [bp+ 8], 1 rcl word ptr [bp+10], 1 rcl word ptr [bp+12], 1 rcl word ptr [bp+14], 1
; shift remainder ← CF
rcl ax, 1
rcl dx, 1
; if rem32 ≥ divisor32?
; Unsigned compare of dx:ax against cx:bx, high word first.
cmp dx, cx
jb div32_skip
ja div32_sub
cmp ax, bx
jb div32_skip
; NOTE(review): the quotient bit is ORed into bit 0 of [bp+4], but
; nothing in the loop shifts [bp+4]/[bp+6], so every iteration sets the
; same bit - TODO confirm intent.
div32_sub: sub ax, bx sbb dx, cx or word ptr [bp+ 4], 1 ; set quotient bit
div32_skip: dec di jnz div32_loop
; store remainder (low=AX, high=DX)
; NOTE(review): after `push bx` / `mov bp, sp`, [bp+0] is the saved bx
; and [bp+2] is presumably the return address (near call assumed -
; confirm); [bp+4] was also used above as a quotient word.
mov word ptr [bp+ 2], ax
mov word ptr [bp+ 4], dx
pop bx
ret
%else
; div32, 386 variant: one `div ebx` instead of the loop.
; NOTE(review): as in the 8086 variant, bp is overwritten without being
; saved; the upper halves of eax, ebx and edx are modified as well.
div32: push bx mov bp, sp
; Load 32-bit divisor (CX:BX) → EBX
; Builds (cx << 16) | bx in eax from the words at [bp+14] and [bp+12],
; then copies the result to ebx.
mov cx, [bp+14]
mov bx, [bp+12]
movzx eax, bx
movzx edx, cx
shl edx, 16
or eax, edx ; EAX = full divisor → store in EBX
mov ebx, eax
; Load 32-bit dividend (DX:AX) → EDX:EAX
; NOTE(review): each word is zero-extended on its own, so eax holds only
; the word from [bp+8] and edx holds the word from [bp+10]; `div ebx`
; then divides the 64-bit value edx:eax, which is not the 32-bit
; dividend formed by those two words.
mov ax, [bp+ 8]
mov dx, [bp+10]
movzx eax, ax
movzx edx, dx
; Perform 32-bit division → EAX = quotient, EDX = remainder
div ebx
; Store remainder and quotient in same stack locations
; Only the low 16 bits of each result (dx, ax) are written back.
; NOTE(review): these offsets differ from the ones the 8086 variant
; writes its results to - confirm which layout callers expect.
mov [bp+ 4], dx ; remainder.low
mov [bp+ 6], ax ; quotient.low
pop bx
ret
%endif
`
; Load 32-bit divisor (CX:BX) → EBX
; Reads the words at [bp+12] and [bp+14] separately, combines them as
; (cx << 16) | bx in eax, then copies the 32-bit value to ebx.
mov cx, [bp+14]
mov bx, [bp+12]
movzx eax, bx
movzx edx, cx
shl edx, 16
or eax, edx ; EAX = full divisor → store in EBX
mov ebx, eax
Just do mov ebx, [bp + 12]
; Load 32-bit dividend (DX:AX) → EDX:EAX
; Each word is zero-extended into its own 32-bit register: eax receives
; only the word from [bp+8] and edx only the word from [bp+10].
mov ax, [bp+ 8]
mov dx, [bp+10]
movzx eax, ax
movzx edx, dx
You want edx = 0 and eax = dword [bp + 8], not this.
I'm fairly sure this bit is also wrong:
; Store remainder and quotient in same stack locations
; Writes 16-bit values only: dx to [bp+4] and ax to [bp+6].
mov [bp+ 4], dx ; remainder.low
mov [bp+ 6], ax ; quotient.low