George Spelvin
2016-12-23 22:11:21 UTC
I mentioned last time trying doing something like this, and now
I have it coded up (as utoa_bytes()).
Input Decimal
bits mem_toa mem_tod itoa nibbles bytes
8 269 220 217 141 143
16 664 462 451 321 273
24 1294 783 838 608 432
32 2059 1219 1167 948 666
40 3059 1775 1649 1395 941
48 4194 2373 2127 1895 1217
56 5477 3069 2733 2459 1551
64 6995 3822 3420 3130 1902
It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more
startup overhead, but is quite fast. I also have a 122-byte variant
which saves 1 cycle per pair of digits. (1895 cycles for 2^64-1.)
I'm interested in this both for speed, and because it's a good fit to my
suggestion of saving RAM space buffering the converted digits in BCD.
So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?
Personally, I'm not worried about 30 bytes of code on enhanced AVRs with
a multiplier, and although I haven't coded up the combined algorithm, I
believe the whole thing is smaller than the ultoa_invert code it replaces.
But this is a judgement call, and I'd appreciate some other voices.
#define __zero_reg__ r1
#define __tmp_reg__ r0
.macro DEFUN name
.section .text.\name,"ax",@progbits
.global \name
.func \name
\name:
.endm
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif
.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm
.text
#define pBuf 26
#define pNum 30
#define Length r20
#define Rem r19
#define Quot r21
#define Count r22
#define Num r23
#define Ten r16
#define Ox67 r18
;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
DEFUN u64toa_nibbles
wmov pBuf, 24
wmov pNum, 22
push Ten
ldi Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
;; also works, and that saves us one "lsr r1" per division.
ldi Ox67, 0x67
clr Count
.LoopDigits:
add pNum, Length
;; Count is 0.
adc pNum+1, Count
mov Count, Length
ldi Length, -1
clr Rem
.LoopBytes:
ld Num, -Z
;; Rem:Num <<= 4
;; "andi Rem, 0xf" not needed because Rem < 10.
eor Rem, Num
andi Num, 0x0f
eor Rem, Num
swap Rem
;; Quot := Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
mov Quot, r1
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
;; ...and (mostly) the same again for the low nibble.
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Rem
eor Rem, Num
;; Quot += Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
add Quot, r1
;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below.
breq 1f
sbrc Length, 7
;; Current Length not yet known: Set it according to highest non-zero byte.
mov Length, Count
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
1:
st Z, Quot
dec Count
brne .LoopBytes
subi Rem, -'0'
st X+, Rem
;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done then.
sbrs Length, 7
rjmp .LoopDigits
pop Ten
clr __zero_reg__
st X, __zero_reg__
;; pBuf survived in R25:R24
#if 0
# XJMP strrev
wmov ZL, 24
7: ld Count, -X
ld Num, Z
st Z+, Count
st X, Num
/* Loop while Z < X */
cp ZL, XL
cpc ZH, XH
brlo 7b
#endif
ret
ENDF u64toa_nibbles
#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This works simularly, but in base 100. Every step, we take a
;; remainder <= 99, multiply by 256, and add the next byte (<=255).
;; The result is in the range 0 <= x < 25600.
;;
;; To divide by 100 with multiply:
;; x * 0x29 >> 12 == x/100 for 0 <= x < 1099
;; x * 0x51f >> 17 == x/100 for 0 <= x < 4699
;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699
;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699
;;
;; Using a multiplier one bit larger than strictly necessary gives us
;; a more convenient shift amount. This is still small enough that we
;; can compute the high bits of the product we need in only two registers.
;;
;; The largest possible x = 0x63ff also has the largest possible
;; lsbyte, so it will give us the largest possible partial products.
;; Multiplying that by 0x28f6 requires four partial products:
;;
;; FF * F6 = F50A
;; FF * 28-- = 27D8--
;; 63-- * F6 = 5F22--
;; 63-- * 28-- = F78----
;;
;; The important thing is that the sum of the first three partial products
;; is 0x87ef0a, a 24-bit number. So there is no need to propagate
;; carries into the msbyte. We immediately discard the lsbyte of the
;; first partial product, and sum the others into a 2-byte register.
;; Then we throw away the lsbyte of that register and use it for the msbyte
;; of the final sum.
;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
#define ZH r31
#define ZL r30
#define XH r27
#define XL r26
#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
#define Count r21
#define Length r20 /* Argument */
#define Rem r19
#define Hundred r18 /* Constant 0x64 = 100 */
#define Khi r17 /* Constant 0x28 = 40 */
#define Klo r16 /* Constant 0xf6 = 246 */
#define Num r15
DEFUN utoa_bytes
movw XL, r24 ; pBuf (output) to X
movw ZL, r22 ; pNum (input) to Z
ldi Hundred, 100
push Khi
ldi Khi, 0x28
push Klo
ldi Klo, 0xf6
push Num
clr Count
.LoopDigits2:
add ZL, Length
adc ZH, Count
mov Count, Length
ldi Length, -1
clr Rem
.LoopBytes2:
ld Num, -Z
/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; We only need the carry bit out
adc Q2, Q1 ; This cannot overflow, either
mul Rem, Khi
add Q2, r0
adc Q1, r1
; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1
;; If Q1 == 0, don't set length (and multiply is redundant, too)
breq 1f
sbrc Length, 7 ;; Do we have a length yet?
mov Length, Count ;; Remember position of highest non-zero Q
mul Q1, Hundred
sub Num, r0 ; New remainder
1:
mov Rem, Num
dec Count
brne .LoopBytes2
;; We now have the final base-100 remiander in Rem.
;; Break it apart into two base-10 digits in Q1:Rem
;; (We could use the multiplier again, but that woud require
;; even more registers for the constants and more instructions,
;; and this isn't the inner loop.)
ldi Q1, '0'
3: subi Q1, -3
subi Rem, 30
brcc 3b
4: dec Q1
subi Rem, -10
brcs 4b
subi Rem, -'0'
st X+, Rem
st X+, Q1
;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done.
sbrs Length, 7
rjmp .LoopDigits2
;; Chop off redundant trailing zero
clr __zero_reg__
cpi Q1, '0'
brne 5f
st -X, __zero_reg__ ; Could sbiw, but this is same speed
5: st X, __zero_reg__
; Restore registers
pop Num
pop Klo
pop Khi
movw r24, r26 ; Return end of output
ret
ENDF utoa_bytes
#undef Num
#undef Klo
#undef Khi
#undef Hundred
#undef Rem
#undef Length
#undef Count
#undef Q1
#undef Q2
#undef XL
#undef XH
#undef ZL
#undef ZH
I have it coded up (as utoa_bytes()).
Input Decimal
bits mem_toa mem_tod itoa nibbles bytes
8 269 220 217 141 143
16 664 462 451 321 273
24 1294 783 838 608 432
32 2059 1219 1167 948 666
40 3059 1775 1649 1395 941
48 4194 2373 2127 1895 1217
56 5477 3069 2733 2459 1551
64 6995 3822 3420 3130 1902
It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more
startup overhead, but is quite fast. I also have a 122-byte variant
which saves 1 cycle per pair of digits. (1895 cycles for 2^64-1.)
I'm interested in this both for speed, and because it's a good fit to my
suggestion of saving RAM space buffering the converted digits in BCD.
So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?
Personally, I'm not worried about 30 bytes of code on enhanced AVRs with
a multiplier, and although I haven't coded up the combined algorithm, I
believe the whole thing is smaller than the ultoa_invert code it replaces.
But this is a judgement call, and I'd appreciate some other voices.
#define __zero_reg__ r1
#define __tmp_reg__ r0
.macro DEFUN name
.section .text.\name,"ax",@progbits
.global \name
.func \name
\name:
.endm
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif
.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm
.text
#define pBuf 26
#define pNum 30
#define Length r20
#define Rem r19
#define Quot r21
#define Count r22
#define Num r23
#define Ten r16
#define Ox67 r18
;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
DEFUN u64toa_nibbles
wmov pBuf, 24
wmov pNum, 22
push Ten
ldi Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
;; also works, and that saves us one "lsr r1" per division.
ldi Ox67, 0x67
clr Count
.LoopDigits:
add pNum, Length
;; Count is 0.
adc pNum+1, Count
mov Count, Length
ldi Length, -1
clr Rem
.LoopBytes:
ld Num, -Z
;; Rem:Num <<= 4
;; "andi Rem, 0xf" not needed because Rem < 10.
eor Rem, Num
andi Num, 0x0f
eor Rem, Num
swap Rem
;; Quot := Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
mov Quot, r1
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
;; ...and (mostly) the same again for the low nibble.
;; Quot <<= 4
swap Quot
;; Rem:Num <<= 4
swap Rem
eor Rem, Num
;; Quot += Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
add Quot, r1
;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below.
breq 1f
sbrc Length, 7
;; Current Length not yet known: Set it according to highest non-zero byte.
mov Length, Count
;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
1:
st Z, Quot
dec Count
brne .LoopBytes
subi Rem, -'0'
st X+, Rem
;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done then.
sbrs Length, 7
rjmp .LoopDigits
pop Ten
clr __zero_reg__
st X, __zero_reg__
;; pBuf survived in R25:R24
#if 0
# XJMP strrev
wmov ZL, 24
7: ld Count, -X
ld Num, Z
st Z+, Count
st X, Num
/* Loop while Z < X */
cp ZL, XL
cpc ZH, XH
brlo 7b
#endif
ret
ENDF u64toa_nibbles
#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This works simularly, but in base 100. Every step, we take a
;; remainder <= 99, multiply by 256, and add the next byte (<=255).
;; The result is in the range 0 <= x < 25600.
;;
;; To divide by 100 with multiply:
;; x * 0x29 >> 12 == x/100 for 0 <= x < 1099
;; x * 0x51f >> 17 == x/100 for 0 <= x < 4699
;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699
;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699
;;
;; Using a multiplier one bit larger than strictly necessary gives us
;; a more convenient shift amount. This is still small enough that we
;; can compute the high bits of the product we need in only two registers.
;;
;; The largest possible x = 0x63ff also has the largest possible
;; lsbyte, so it will give us the largest possible partial products.
;; Multiplying that by 0x28f6 requires four partial products:
;;
;; FF * F6 = F50A
;; FF * 28-- = 27D8--
;; 63-- * F6 = 5F22--
;; 63-- * 28-- = F78----
;;
;; The important thing is that the sum of the first three partial products
;; is 0x87ef0a, a 24-bit number. So there is no need to propagate
;; carries into the msbyte. We immediately discard the lsbyte of the
;; first partial product, and sum the others into a 2-byte register.
;; Then we throw away the lsbyte of that register and use it for the msbyte
;; of the final sum.
;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
#define ZH r31
#define ZL r30
#define XH r27
#define XL r26
#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
#define Count r21
#define Length r20 /* Argument */
#define Rem r19
#define Hundred r18 /* Constant 0x64 = 100 */
#define Khi r17 /* Constant 0x28 = 40 */
#define Klo r16 /* Constant 0xf6 = 246 */
#define Num r15
DEFUN utoa_bytes
movw XL, r24 ; pBuf (output) to X
movw ZL, r22 ; pNum (input) to Z
ldi Hundred, 100
push Khi
ldi Khi, 0x28
push Klo
ldi Klo, 0xf6
push Num
clr Count
.LoopDigits2:
add ZL, Length
adc ZH, Count
mov Count, Length
ldi Length, -1
clr Rem
.LoopBytes2:
ld Num, -Z
/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; We only need the carry bit out
adc Q2, Q1 ; This cannot overflow, either
mul Rem, Khi
add Q2, r0
adc Q1, r1
; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1
;; If Q1 == 0, don't set length (and multiply is redundant, too)
breq 1f
sbrc Length, 7 ;; Do we have a length yet?
mov Length, Count ;; Remember position of highest non-zero Q
mul Q1, Hundred
sub Num, r0 ; New remainder
1:
mov Rem, Num
dec Count
brne .LoopBytes2
;; We now have the final base-100 remiander in Rem.
;; Break it apart into two base-10 digits in Q1:Rem
;; (We could use the multiplier again, but that woud require
;; even more registers for the constants and more instructions,
;; and this isn't the inner loop.)
ldi Q1, '0'
3: subi Q1, -3
subi Rem, 30
brcc 3b
4: dec Q1
subi Rem, -10
brcs 4b
subi Rem, -'0'
st X+, Rem
st X+, Q1
;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done.
sbrs Length, 7
rjmp .LoopDigits2
;; Chop off redundant trailing zero
clr __zero_reg__
cpi Q1, '0'
brne 5f
st -X, __zero_reg__ ; Could sbiw, but this is same speed
5: st X, __zero_reg__
; Restore registers
pop Num
pop Klo
pop Khi
movw r24, r26 ; Return end of output
ret
ENDF utoa_bytes
#undef Num
#undef Klo
#undef Khi
#undef Hundred
#undef Rem
#undef Length
#undef Count
#undef Q1
#undef Q2
#undef XL
#undef XH
#undef ZL
#undef ZH