This routine will take a 16-bit unsigned value in BC and produce the 32-bit square of that number in HL:DE.
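I don't know if this is the most efficient way to do it - I couldn't find any useful information online, so I had to come up with it myself. It builds the square one bit at a time, starting from the least significant bit, using the identity (a + 2^i)^2 = a^2 + 2*a*2^i + 4^i, where a is the value of the bits already processed: whenever bit i is set, the cross term 2*a*2^i and the 4^i term are added to the running result. The algorithm used is as follows: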
Code: Select all
/*
 * Square x using only shifts and additions.
 *
 * The bits of x are consumed from least to most significant.  Writing a for
 * the value of the bits already consumed, setting bit i turns a^2 into
 * (a + 2^i)^2 = a^2 + 2*a*2^i + 4^i, so both extra terms are added to the
 * result whenever bit i is set.
 *
 * Arithmetic is plain unsigned arithmetic, so the result is reduced modulo
 * UINT_MAX + 1.  With an unsigned type of at least 32 bits, every x up to
 * 65535 yields the exact square.
 */
unsigned square(unsigned x)
{
    unsigned result = 0;
    unsigned sqrbit = 1;    /* 4^i for the bit currently being examined */
    unsigned sqrsum = 0;    /* 2 * a * 2^i, the cross term for that bit */

    while (x != 0) {
        if ((x & 1u) != 0) {
            result += sqrsum + sqrbit;
            sqrsum += sqrbit << 1;      /* fold the new bit into the cross term */
        }
        /* Advance both terms to the next bit position. */
        sqrsum <<= 1;
        sqrbit <<= 2;
        x >>= 1;
    }

    return result;
}
And here is the Z80 version. It is a bit large (~150 bytes) because it splits the work into 8-, 16- and 32-bit loops, so that 32-bit arithmetic is only used when it is actually needed.
Code: Select all
;-------------------------------------------------------------------------------
; SquareBC - square a 16-bit unsigned value, giving a 32-bit result.
;
; The bits of BC are consumed from least to most significant. For each set
; bit i, the cross term (sqrsum) and 4^i (sqrbit) are added to the result.
; The work is split into three stages so that wider arithmetic is only used
; when needed: bits 0-3 in 8-bit registers, bits 4-7 in 16-bit registers and
; bits 8-15 in 32 bits. The routine returns early after a stage when no set
; bits remain.
;
; Bracketed numbers in the comments are T-state counts (taken/not taken for
; conditional relative jumps).
;
; INPUTS:
;
; * BC - Number to square
;
; OUTPUTS:
;
; * HL - High word of square(BC)
; * DE - Low word of square(BC)
;
; DESTROYED:
;
; * AF
;
; PRESERVED:
;
; * BC, IX (saved on the stack and restored before returning)
;
; NOTES:
;
; * Uses up to 6 bytes of stack in addition to the return address.
; * The 32-bit stage uses the IXH/IXL half-register forms of LD, which are
;   not part of the officially documented Z80 instruction set.
;-------------------------------------------------------------------------------
SquareBC:
;-------------------------------------------------------------------
; Process the low 4 bits of BC using 8-bit arithmetic.
;
; A = sqrsum
; D = sqrbit
; E = result
;
; The largest values reached here are result = 225 (15 * 15) and
; sqrsum = 240, so everything fits in 8 bits.
;-------------------------------------------------------------------
push bc ; [11] preserve the caller's BC
xor a ; [4] sqrsum = 0
ld de, $0100 ; [10] sqrbit (D) = 1, result (E) = 0
_sqrLoop8: add a, a ; [4] sqrsum * 2
; SRA keeps bit 7 of C, but only the carry-out and the low nibble left after
; four shifts are used later, so this behaves like a logical shift here.
sra c ; [8] next bit of C -> carry
jr nc, _nextBit8 ; [12/7]
;-------------------------------------------------------------------
; When the next bit of BC is set...
;-------------------------------------------------------------------
ld l, a ; [4] save sqrsum
add a, e ; [4] += result
add a, d ; [4] += sqrbit
ld e, a ; [4] save result
ld a, l ; [4] restore sqrsum
sla d ; [8] sqrbit * 2
add a, d ; [4] sqrsum + sqrbit * 2
sla d ; [8] sqrbit * 4 in total; carry set once sqrbit leaves 8 bits
jp nc, _sqrLoop8 ; [10] loop until sqrbit overflows (after 4 bits)
jp _sqrDone8 ; [10]
;-------------------------------------------------------------------
; When the next bit of BC is reset...
;-------------------------------------------------------------------
_nextBit8: sla d ; [8] sqrbit * 4
sla d ; [8]
jp nc, _sqrLoop8 ; [10] otherwise fall through into _sqrDone8
;-------------------------------------------------------------------
; Clean up from 8-bit mode and check if there is more to do
;-------------------------------------------------------------------
_sqrDone8: ld h, d ; [4] move sqrsuml into HL (D is zero).
ld l, a ; [4]
ld a, c ; [4] check if there is anything left
or b ; [4]
jp nz, _sqrGo16 ; [10]
ld l, d ; [4] zero HL and return (DE = 0:result already)
pop bc ; [10] restore the caller's BC
ret ; [10]
;-------------------------------------------------------------------
; Process the next 4 bits with 16-bit arithmetic
;
; HL = sqrsum
; DE = result
; BC = sqrbit
; A = remaining bits of the low byte (bits 4-7 of the input in bits 0-3)
;-------------------------------------------------------------------
_sqrGo16: ld a, c ; [4] A = what is left of the low byte
push bc ; [11] keep B (high byte of the input) for the check in _sqrDone16
ld b, 1 ; [7] sqrbit = $0100
ld c, d ; [4] (D is zero)
_sqrLoop16: add hl, hl ; [11] sqrsuml *= 2
rra ; [4] shift next bit
jr nc, _nextBit16 ; [12/7]
;-------------------------------------------------------------------
; When the next bit of BC is set...
;-------------------------------------------------------------------
ex de, hl ; [4] sqrsum <-> result
add hl, de ; [11] result += sqrsum
add hl, bc ; [11] result += sqrbit
ex de, hl ; [4] swap back: DE = result, HL = sqrsum
sla b ; [8] sqrbit * 2 (C is always zero here, so shifting B alone is enough)
add hl, bc ; [11] sqrsum += sqrbit * 2
sla b ; [8] sqrbit * 4 in total; carry set once sqrbit leaves 16 bits
jp nc, _sqrLoop16 ; [10]
jp _sqrDone16 ; [10]
;-------------------------------------------------------------------
; When the next bit of BC is reset...
;-------------------------------------------------------------------
_nextBit16: sla b ; [8] sqrbit * 4
sla b ; [8]
jp nc, _sqrLoop16 ; [10] otherwise fall through into _sqrDone16
;-------------------------------------------------------------------
; Check if there are more bits to process before continuing.
;-------------------------------------------------------------------
_sqrDone16: pop af ; [10] restore high byte of initial BC (into A; F receives the old C)
or a ; [4]
jp nz, _sqrGo32 ; [10]
ld h, b ; [4] B is zero
ld l, b ; [4]
pop bc ; [10] restore the caller's BC
ret ; [10]
;-------------------------------------------------------------------
; Process the remaining 8 bits (the high byte of BC) with 32-bit
; arithmetic
;
; DE:HL = sqrsum
; (SP):IX = result
; BC = high word of sqrbit (its low word is always zero from here on)
; A = high byte of the input
;-------------------------------------------------------------------
_sqrGo32: push ix ; [15] preserve IX
ld ixh, d ; [8] move resultl into IX
ld ixl, e ; [8]
ld d, b ; [4] zero sqrsumh
ld e, b ; [4]
push de ; [11] initial resulth = 0
ld c, $01 ; [7] initial sqrbit: BC = $0001, i.e. sqrbit = $00010000
_sqrLoop32: add hl, hl ; [11] sqrsum * 2
rl e ; [8]
rl d ; [8]
; SRA keeps bit 7 of A. If that bit is set, A never reaches zero, so the
; early exit at _nextBit32 is never taken and the loop ends when sqrbit
; overflows after the eighth bit; the bits shifted out are still correct.
sra a ; [8] next bit -> carry, Z set when no bits remain
jr nc, _nextBit32 ; [12/7]
;-------------------------------------------------------------------
; When the next bit of BC is set...
;-------------------------------------------------------------------
ex de, hl ; [4] sqrsuml <-> sqrsumh
add ix, de ; [15] resultl += sqrsuml
ex de, hl ; [4] sqrsumh <-> sqrsuml
ex (sp), hl ; [19] sqrsuml <-> resulth
adc hl, de ; [15] resulth += sqrsumh (plus the carry from the low-word add)
add hl, bc ; [11] resulth += sqrbit
ex (sp), hl ; [19] resulth <-> sqrsuml
ex de, hl ; [4] sqrsuml <-> sqrsumh
sla c ; [8] sqrbit *= 2
rl b ; [8]
add hl, bc ; [11] sqrsumh += sqrbit
ex de, hl ; [4] sqrsumh <-> sqrsuml
sla c ; [8] squarebit *= 2
rl b ; [8] carry set once sqrbit leaves 32 bits
jp nc, _sqrLoop32 ; [10]
jp _sqrDone32 ; [10]
;-------------------------------------------------------------------
; When the next bit of BC is reset...
;-------------------------------------------------------------------
_nextBit32: jr z, _sqrDone32 ; [12/7] Z still comes from sra a: no bits left, stop early
sla c ; [8] sqrbit * 4
rl b ; [8]
sla c ; [8]
rl b ; [8]
jp nc, _sqrLoop32 ; [10] otherwise fall through into _sqrDone32
_sqrDone32: ld d, ixh ; [8] move result into HL:DE
ld e, ixl ; [8]
pop hl ; [10] pop high result
pop ix ; [14] restore the caller's IX
pop bc ; [10] restore the caller's BC
ret ; [10]