edrdos Optional 386 support

I was wondering whether it might be of interest to start using conditional assembly directives to optionally add support for real-mode 386 instructions. This would improve div64 and mul64 a lot, and also some other operations that transfer memory. Something like this:

%macro RepStos
%ifidni cpu, '8086'
    push ecx
    shr ecx, 2
    rep movsd
    pop ecx
    and ecx, 3
    rep movsb
%else
    shr cx, 1
    rep movsw
    adc cx, cx
    rep movsb
%endif
%endmacro

Feb 12 '25 09:02 javiergutierrezchamorro

(l)MS-DOS contains a function in its bios/msdisk.nas file that can use rep movsd if running on a 386, and is patched to fall back to rep movsw otherwise: https://hg.pushbx.org/ecm/msdos4/file/440e4eb392d0/src/BIOS/msdisk.nas#l2618

Feb 12 '25 10:02 ecm-pushbx

(l)MS-DOS contains a function in its bios/msdisk.nas file that can use rep movsd if running on a 386, and is patched to fall back to rep movsw otherwise: https://hg.pushbx.org/ecm/msdos4/file/440e4eb392d0/src/BIOS/msdisk.nas#l2618

Sounds quite interesting, and maybe it is a better approach to select them at runtime instead of at assembly time as I proposed.

Feb 12 '25 11:02 javiergutierrezchamorro

By the way, your example is flawed. In a 16-bit CS (including running in Real/Virtual 86 Mode) the default for movsd is a16, so it will use cx as a counter despite the 386+ dword operand size (o32). So you shouldn't operate on ecx as the counter but rather cx.

Feb 12 '25 11:02 ecm-pushbx

This would be useful for div32 and mul32, which can be further simplified on a 386:

` %ifidni cpu, '8086'

div32:
push    bx
mov     bp, sp

; cache 32-bit divisor
mov     cx, word ptr [bp+14]
mov     bx, word ptr [bp+12]

; zero 32-bit quotient
mov     word ptr [bp+ 6], 0
mov     word ptr [bp+ 4], 0

; clear remainder in DX:AX
xor     dx, dx
xor     ax, ax

mov     di, 32           ; 32 iterations

div32_loop:
; shift dividend ←→ CF
shl     word ptr [bp+ 8], 1
rcl     word ptr [bp+10], 1
rcl     word ptr [bp+12], 1
rcl     word ptr [bp+14], 1

; shift remainder ← CF
rcl     ax, 1
rcl     dx, 1

; if rem32 ≥ divisor32?
cmp     dx, cx
jb      div32_skip
ja      div32_sub
cmp     ax, bx
jb      div32_skip

div32_sub:
sub     ax, bx
sbb     dx, cx
or      word ptr [bp+ 4], 1  ; set quotient bit

div32_skip:
dec     di
jnz     div32_loop

; store remainder (low=AX, high=DX)
mov     word ptr [bp+ 2], ax
mov     word ptr [bp+ 4], dx

pop     bx
ret

%else

div32:
push    bx
mov     bp, sp

; Load 32-bit divisor (CX:BX) → EBX
mov     cx, [bp+14]
mov     bx, [bp+12]
movzx   eax, bx
movzx   edx, cx
shl     edx, 16
or      eax, edx             ; EAX = full divisor → store in EBX
mov     ebx, eax

; Load 32-bit dividend (DX:AX) → EDX:EAX
mov     ax, [bp+ 8]
mov     dx, [bp+10]
movzx   eax, ax
movzx   edx, dx

; Perform 32-bit division → EAX = quotient, EDX = remainder
div     ebx

; Store remainder and quotient in same stack locations
mov     [bp+ 4], dx          ; remainder.low
mov     [bp+ 6], ax          ; quotient.low

pop     bx
ret

%endif

`

Jul 10 '25 14:07 javiergutierrezchamorro

; Load 32-bit divisor (CX:BX) → EBX
mov     cx, [bp+14]
mov     bx, [bp+12]
movzx   eax, bx
movzx   edx, cx
shl     edx, 16
or      eax, edx             ; EAX = full divisor → store in EBX
mov     ebx, eax

Just do mov ebx, [bp + 12]

; Load 32-bit dividend (DX:AX) → EDX:EAX
mov     ax, [bp+ 8]
mov     dx, [bp+10]
movzx   eax, ax
movzx   edx, dx

You want edx = 0 and eax = dword [bp + 8], not this.

Jul 10 '25 14:07 ecm-pushbx

I'm fairly sure this bit is also wrong:

; Store remainder and quotient in same stack locations
mov     [bp+ 4], dx          ; remainder.low
mov     [bp+ 6], ax          ; quotient.low

Jul 10 '25 17:07 ecm-pushbx