;---------------------------Module-Header-------------------------------;
; Module Name: SQRT.ASM
;
; Contains FIXED and ULONG square root
;
; Created:  Sun 30-Aug-1987 19:28:30
;
; NOTE: the 64-bit sqrt code is 386 specific!
;
;-----------------------------------------------------------------------;
;
;     (C) Copyright Microsoft Corp. 1987-1991.  All rights reserved.
;
;     You have a royalty-free right to use, modify, reproduce and 
;     distribute the Sample Files (and/or any modified version) in 
;     any way you find useful, provided that you agree that 
;     Microsoft has no warranty obligations or liability for any 
;     Sample Application Files which are modified. 
;

?WIN	= 0
?PLM    = 0
?NODATA = 1

        .xlist
        include cmacros.inc
        .list

UQUAD   struc
uq0     dw      ?
uq1     dw      ?
uq2     dw      ?
uq3     dw      ?
UQUAD	ends

PTL	struc
ptl_x	dd	?
ptl_y	dd	?
PTL	ends

POINTFX struc
x       dd      ?
y       dd      ?
z       dd      ?
w       dd      ?
POINTFX ends


MAXINT equ 7FFFh
MININT equ 8000h


;       The following two equates are just used as shorthand
;       for the "word ptr" and "byte ptr" overrides.

wptr    equ     word ptr
bptr    equ     byte ptr

; The following structure should be used to access high and low
; words of a DWORD.  This means that "word ptr foo[2]" -> "foo.hi".

LONG    struc
lo      dw      ?
hi      dw      ?
LONG    ends

FARPOINTER      struc
off     dw      ?
sel     dw      ?
FARPOINTER      ends

assert macro   a,b,c,d
endm

EAXtoDXAX   macro
        shld    edx,eax,16      ; move HIWORD(eax) to dx
        endm

DXAXtoEAX   macro
        ror     eax,16          ; xchg HIWORD(eax) and LOWORD(eax)
        shrd    eax,edx,16      ; move LOWORD(edx) to HIWORD(eax)
        endm

.386
sBegin Code
        assumes cs,Code
        assumes es,nothing
        assumes ds,nothing

;---------------------------Private-Macro-------------------------------;
; set_ov
;
; Sets the overflow flag
;
; Entry:
;       reg = word register to use for scratch
; Returns:
;       OF set
; Error Returns:
;       none
; Registers Destroyed:
;       reg
;-----------------------------------------------------------------------;

set_ov  macro   reg
        mov     reg,8000h
        dec     reg
        endm

;---------------------------Public-Routine------------------------------;
; square_root
;
; Computes the Fixed square root of a Fixed.  This algorithm
; comes from Knuth D.E; Metafont:The Program (Addison-Weseley, 1986)
; Part 8:Algebraic and Transcendental Functions.
;
; Notation used below:
;
; Entry:
;       DX:AX = number to square root
; Returns:
;         DX:AX = square root of input
; Error Returns:
;       OF = 1
; Registers Destroyed:
;       BX,CX
; Registers Preserved:
;       DS,ES,SI,DI
;-----------------------------------------------------------------------;

        assumes ds,nothing
        assumes es,nothing

cProc   fxsqrt,<PUBLIC,NEAR,NODATA,NONWIN>,<>
        ParmD   fx
cBegin
	mov	ax, fx.lo
        mov     dx, fx.hi

        cCall   square_root

        mov     dh,dl           ; DX:AX *= sqrt(1000h)
        mov     dl,ah
        mov     ah,al
        xor     al,al
cEnd

cProc   ulsqrt,<PUBLIC,NEAR,NODATA,NONWIN>,<>
        ParmD   ul
cBegin
        mov     ax, ul.lo
        mov     dx, ul.hi

        cCall   square_root
cEnd

;
;   compute the square root of EDX:EAX
;
;   retult returned in EAX and DX:AX
;
;   We dont have a 64-bit square_root function
;   so approximate it by dividing by 4 until we can use our
;   32-bit function
;

cProc   sqrt64,<PUBLIC,NEAR,NODATA,NONWIN>,<>
cBegin
        xor     cx,cx

test_less_than_32:
        or      edx,edx             ; is it less than 32-bit?
        jz      less_than_32

        inc     cx

        shrd    eax,edx,2           ; divide by 4
        shr     edx,2

        jmp     short test_less_than_32

less_than_32:
        ;;
        ;;  EDX:EAX is now less than 32 bits, CX contains count of
        ;;  divisions by 4 we had to do to get here.  call square_root
        ;;  and then re-normalize
        ;;

        push    cx
        EAXtoDXAX
        call    square_root
        pop     cx
        jcxz    sqrt64_exit

@@:     shl     ax,1
        rcl     dx,1
        loop    @b

sqrt64_exit:
        DXAXtoEAX
cEnd

cProc   square_root,<PUBLIC,NEAR,NODATA,NONWIN>,<SI,DI>
;;        ParmD   fx
cBegin
;;        mov     ax, fx.lo
;;        mov     dx, fx.hi
; check for < 1 since this algorithm returns 0000:0001

        or      dx,dx
        jnz     non_zero_arg
        cmp     ax,1
        ja      non_zero_arg
        jmp     square_root_exit

negative_arg:                           ; can't have number negative
        xor     ax,ax
        cwd
        set_ov  bx
        jmp     square_root_exit

non_zero_arg:
        cCall   ulNormalize
;;;;;;;;jcxz    negative_arg
        jz      exit_relay
        shr     cx,1
        jc      add_shifts_only
        shr     dx,1
        rcr     ax,1
        dec     cx
add_shifts_only:

;;
;; use 23 for a FIXED square_root, and 15 for a ULONG square_root!
;;

if 0
        mov     ch,23                   ; FIXED square_root
else
        mov     ch,15                   ; ULONG square_root
endif

        sub     ch,cl
        mov     cl,8
        sub     ch,cl
        jg      more_than_8             ; CL = Min(count,8)
        add     cl,ch
        xor     ch,ch                   ; CH = Max(count-8,0)
more_than_8:


        xchg    ax,si                   ; DI:SI = x
        mov     di,dx

        xor     ax,ax                   ; AX = y = lower bound = 0

        mov     bx,2                    ; BX = q = estimate of root = 2

        add     si,si                   ; intiialize y = top bit of x
        adc     di,di
        adc     ax,ax

first_8_loop:
        add     si,si                   ; move 2 bits from top
        adc     di,di                   ; of x to bottom of y
        adc     ax,ax
        assert  NC

        add     si,si
        adc     di,di
        adc     ax,ax
        assert  NC

        sub     ax,bx                   ; AX = y - q
        jle     y_neg_or_zero_8

;!!!CR loops can be cleaned up some

        add     bx,bx                   ; BX = 2q
        sub     ax,bx                   ; AX = y - 3q
        jg      y_greater_q_8

        add     ax,bx                   ; AX = y - q

        dec     cl
        jnz     first_8_loop

        jcxz    first_8_was_enough
        jmp     short done_with_1st_8

y_greater_q_8:
        add     bx,2                    ; BX = 2q + 2 (or 2q if jg not taken)

        dec     cl
        jnz     first_8_loop

        or      ch,ch
        jnz     done_with_1st_8

first_8_was_enough:
        xchg    ax,bx                   ; DX:AX = q
        xor     dx,dx
        shr     ax,1                    ; root = q >> 1
        or      ax,ax                   ; clear overflow flag
exit_relay:
        jmp     square_root_exit

y_neg_or_zero_8:                       ; come here if y <= 0
        dec     bx
        add     bx,bx                   ; BX = 2q - 2
        add     ax,bx                   ; AX = y + q - 2

        dec     cl
        jnz     first_8_loop
        jcxz    first_8_was_enough

;si is now empty, di contains all signifigant bits
done_with_1st_8:

        mov     cl,4
        sub     ch,cl
        jg      second_4_loop           ; CL = Min(count,4)
        add     cl,ch
        xor     ch,ch                   ; CH = Max(count-8,0)

second_4_loop:
                                        ; move 2 bits from top
        add     di,di                   ; of x to bottom of y
        adc     ax,ax
        assert  NC

        add     di,di
        adc     ax,ax
        assert  NC

        sub     ax,bx                   ; AX = y - q
        jle     y_neg_or_zero_2nd_4

        add     bx,bx                   ; BX = 2q

;!!!CR use compare and jump rather than sub, branch, and restore
        sub     ax,bx                   ; AX = y - 3q
        jg      y_greater_q_2nd_4

        add     ax,bx                   ; AX = y - q

        dec     cl
        jnz     second_4_loop

        jcxz    second_4_was_enough
        jmp     short done_with_2nd_4

y_greater_q_2nd_4:
        add     bx,2                    ; BX = 2q + 2 (or 2q if jg not taken)

        dec     cl
        jnz     second_4_loop

        or      ch,ch
        jnz     done_with_2nd_4


second_4_was_enough:
        xchg    ax,bx                   ; DX:AX = q
        xor     dx,dx
        shr     ax,1                    ; root = q >> 1
        or      ax,ax                   ; clear overflow flag
        jmp     square_root_exit


y_neg_or_zero_2nd_4:                       ; come here if y <= 0
        dec     bx
        add     bx,bx                   ; BX = 2q - 2
        add     ax,bx                   ; AX = y + q - 2

        dec     cl
        jnz     second_4_loop
        jcxz    second_4_was_enough

done_with_2nd_4:

        xor     dx,dx                   ; DX:AX = y   SI:BX = q

        mov     cl,4
        sub     ch,cl
        jg      third_4_loop             ; CL = Min(count,4)
        add     cl,ch
        xor     ch,ch                   ; CH = Max(count-13,0)

third_4_loop:
;!!!CR Add comment noting that 32 bit math is being used now
                                        ; move 2 bits from top
        add     di,di                   ; of x to bottom of y
        adc     ax,ax
        adc     dx,dx
        assert  NC

        add     di,di
        adc     ax,ax
        adc     dx,dx
        assert  NC

        sub     ax,bx                   ; DX:AX = y - q
        sbb     dx,si
        js      y_negative_3rd_4
        jz      y_might_be_zero_3rd_4

sorry_y_not_zero_3rd_4:
        add     bx,bx
        adc     si,si                   ; SI:BX = 2q
        sub     ax,bx                   ; DX:AX = y - 3q
        sbb     dx,si
        jg      y_greater_q_3rd_4
        jl      y_less_or_equal_q_3rd_4
        or      ax,ax
        jnz     y_greater_q_3rd_4

y_less_or_equal_q_3rd_4:
        add     ax,bx                   ; DX:AX = y - q
        adc     dx,si

        dec     cl
        jnz     third_4_loop

        jcxz    third_4_was_enough
        jmp     short done_with_3rd_4

y_greater_q_3rd_4:
        add     bx,2
        adc     si,0

        dec     cl
        jnz     third_4_loop

        or      ch,ch
        jnz     done_with_3rd_4

third_4_was_enough:
        xchg    ax,bx                   ; DX:AX = q
        mov     dx,si
        shr     dx,1                    ; root = q >> 1
        rcr     ax,1
        or      ax,ax                   ; clear overflow flag
	jmp	square_root_exit ;;; short!

y_might_be_zero_3rd_4:
        or      ax,ax
        jnz     sorry_y_not_zero_3rd_4

y_negative_3rd_4:                       ; come here if y <= 0

        sub     bx,1
        sbb     si,0
        add     bx,bx                   ; DL:BX = 2q - 2
        adc     si,si

        add     ax,bx
        adc     dx,si                   ; DH:AX = y + q - 2

        dec     cl
        jnz     third_4_loop
        jcxz    third_4_was_enough


done_with_3rd_4:

        xchg    ch,cl

last_7_loop:
        add     ax,ax
        adc     dx,dx

        add     ax,ax
        adc     dx,dx

        sub     ax,bx                   ; DX:AX = y - q
        sbb     dx,si
        js      y_negative_last_7
        jz      y_might_be_zero_last_7

;!!!CR Clean up the exit so that a single exit point is used.

sorry_y_not_zero_last_7:
        add     bx,bx
        adc     si,si                   ; SI:BX = 2q
        sub     ax,bx                   ; DX:AX = y - 3q
        sbb     dx,si
        jg      y_greater_q_last_7
        jl      y_less_or_equal_q_last_7
        or      ax,ax
        jnz     y_greater_q_last_7

y_less_or_equal_q_last_7:
        add     ax,bx                   ; DX:AX = y - q
        adc     dx,si

        loop    last_7_loop

        xchg    ax,bx                   ; DX:AX = q
        mov     dx,si
        shr     dx,1                    ; root = q >> 1
        rcr     ax,1
        or      ax,ax                   ; clear overflow flag
        jmp     short square_root_exit

y_greater_q_last_7:
        add     bx,2
        adc     si,0

        loop    last_7_loop

        xchg    ax,bx                   ; DX:AX = q
        mov     dx,si
        shr     dx,1                    ; root = q >> 1
        rcr     ax,1
        or      ax,ax                   ; clear overflow flag
        jmp     short square_root_exit

y_might_be_zero_last_7:
        or      ax,ax
        jnz     sorry_y_not_zero_last_7

y_negative_last_7:                       ; come here if y <= 0

        sub     bx,1
        sbb     si,0
        add     bx,bx                   ; DL:BX = 2q - 2
        adc     si,si

        add     ax,bx
        adc     dx,si                   ; DH:AX = y + q - 2

        loop    last_7_loop

        xchg    ax,bx                   ; DX:AX = q
        mov     dx,si
        shr     dx,1                    ; root = q >> 1
        rcr     ax,1

        or      ax,ax                   ; clear overflow flag
square_root_exit:
cEnd

;---------------------------Public-Routine------------------------------;
; ulNormalize
;
; Normalizes a ULONG so that the highest order bit is 1.  Returns the
; number of shifts done.  Also returns ZF=1 if the ULONG was zero.
;
; Entry:
;       DX:AX = ULONG
; Returns:
;       DX:AX = normalized ULONG
;       CX    = shift count
;       ZF    = 1 if the ULONG is zero, 0 otherwise
; Registers Destroyed:
;       none
;-----------------------------------------------------------------------;

        assumes ds,nothing
        assumes es,nothing

cProc   ulNormalize,<PUBLIC,NEAR>
cBegin

; shift by words

        xor     cx,cx
        or      dx,dx
        js      ulNormalize_exit
        jnz     top_word_ok
        xchg    ax,dx
        or      dx,dx
        jz      ulNormalize_exit        ; the zero exit
        mov     cl,16
        js      ulNormalize_exit
top_word_ok:

; shift by bytes

        or      dh,dh
        jnz     top_byte_ok
        xchg    dh,dl
        xchg    dl,ah
        xchg    ah,al
        add     cl,8
        or      dh,dh
        js      ulNormalize_exit
top_byte_ok:

; do the rest by bits

        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
        js      ulNormalize_exit
        inc     cx
        add     ax,ax
        adc     dx,dx
ulNormalize_exit:
cEnd

sEnd    Code

        end
