Discussion:
[avr-libc-dev] Even faster decimal code
George Spelvin
2016-12-23 22:11:21 UTC
Permalink
I mentioned last time trying doing something like this, and now
I have it coded up (as utoa_bytes()).

Input Decimal
bits mem_toa mem_tod itoa nibbles bytes
8 269 220 217 141 143
16 664 462 451 321 273
24 1294 783 838 608 432
32 2059 1219 1167 948 666
40 3059 1775 1649 1395 941
48 4194 2373 2127 1895 1217
56 5477 3069 2733 2459 1551
64 6995 3822 3420 3130 1902

It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more
startup overhead, but is quite fast. I also have a 122-byte variant
which saves 1 cycle per pair of digits. (1895 cycles for 2^64-1.)

I'm interested in this both for speed, and because it's a good fit to my
suggestion of saving RAM space buffering the converted digits in BCD.

So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?

Personally, I'm not worried about 30 bytes of code on enhanced AVRs with
a multiplier, and although I haven't coded up the combined algorithm, I
believe the whole thing is smaller than the ultoa_invert code it replaces.

But this is a judgement call, and I'd appreciate some other voices.



#define __zero_reg__ r1
#define __tmp_reg__ r0

.macro DEFUN name
.section .text.\name,"ax",@progbits
.global \name
.func \name
\name:
.endm

.macro ENDF name
.size \name, .-\name
.endfunc
.endm

#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif

.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm

.text

#define pBuf 26
#define pNum 30
#define Length r20

#define Rem r19
#define Quot r21
#define Count r22
#define Num r23

#define Ten r16
#define Ox67 r18

;; extern void u64toa_nibbles (char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]

DEFUN u64toa_nibbles

wmov pBuf, 24
wmov pNum, 22

push Ten
ldi Ten, 10
;; For / 10 by multiply we would usually use a value of 0xcd ~ 0x800/10,
;; however Rem below will be in [0, 0x9f] so that ~ 1/2 of that value
;; also works, and that saves us one "lsr r1" per division.
ldi Ox67, 0x67
clr Count

.LoopDigits:

add pNum, Length
;; Count is 0.
adc pNum+1, Count

mov Count, Length
ldi Length, -1
clr Rem

.LoopBytes:

ld Num, -Z

;; Rem:Num <<= 4
;; "andi Rem, 0xf" not needed because Rem < 10.
eor Rem, Num
andi Num, 0x0f
eor Rem, Num
swap Rem

;; Quot := Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
mov Quot, r1

;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0

;; ...and (mostly) the same again for the low nibble.

;; Quot <<= 4
swap Quot

;; Rem:Num <<= 4
swap Rem
eor Rem, Num

;; Quot += Rem / 10
mul Rem, Ox67
lsr r1
lsr r1
add Quot, r1

;; Quot = 0 ==> R1 = 0, hence we can also skip the MUL + SUB below.
breq 1f
sbrc Length, 7
;; Current Length not yet known: Set it according to highest non-zero byte.
mov Length, Count

;; Rem := Rem - 10 * (Rem / 10) = Rem % 10
mul r1, Ten
sub Rem, r0
1:

st Z, Quot

dec Count
brne .LoopBytes

subi Rem, -'0'
st X+, Rem

;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done then.
sbrs Length, 7
rjmp .LoopDigits

pop Ten
clr __zero_reg__

st X, __zero_reg__

;; pBuf survived in R25:R24
#if 0
# XJMP strrev
wmov ZL, 24
7: ld Count, -X
ld Num, Z
st Z+, Count
st X, Num
/* Loop while Z < X */
cp ZL, XL
cpc ZH, XH
brlo 7b
#endif
ret
ENDF u64toa_nibbles

#undef Count
#undef pNum
#undef pBuf
#undef Rem
#undef Num
#undef Quot
#undef Ten
#undef Ox67
#undef Length

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; This works simularly, but in base 100. Every step, we take a
;; remainder <= 99, multiply by 256, and add the next byte (<=255).
;; The result is in the range 0 <= x < 25600.
;;
;; To divide by 100 with multiply:
;; x * 0x29 >> 12 == x/100 for 0 <= x < 1099
;; x * 0x51f >> 17 == x/100 for 0 <= x < 4699
;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699
;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699
;;
;; Using a multiplier one bit larger than strictly necessary gives us
;; a more convenient shift amount. This is still small enough that we
;; can compute the high bits of the product we need in only two registers.
;;
;; The largest possible x = 0x63ff also has the largest possible
;; lsbyte, so it will give us the largest possible partial products.
;; Multiplying that by 0x28f6 requires four partial products:
;;
;; FF * F6 = F50A
;; FF * 28-- = 27D8--
;; 63-- * F6 = 5F22--
;; 63-- * 28-- = F78----
;;
;; The important thing is that the sum of the first three partial products
;; is 0x87ef0a, a 24-bit number. So there is no need to propagate
;; carries into the msbyte. We immediately discard the lsbyte of the
;; first partial product, and sum the others into a 2-byte register.
;; Then we throw away the lsbyte of that register and use it for the msbyte
;; of the final sum.

;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
#define ZH r31
#define ZL r30
#define XH r27
#define XL r26

#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
#define Count r21
#define Length r20 /* Argument */
#define Rem r19
#define Hundred r18 /* Constant 0x64 = 100 */
#define Khi r17 /* Constant 0x28 = 40 */
#define Klo r16 /* Constant 0xf6 = 246 */
#define Num r15


DEFUN utoa_bytes
movw XL, r24 ; pBuf (output) to X
movw ZL, r22 ; pNum (input) to Z
ldi Hundred, 100
push Khi
ldi Khi, 0x28
push Klo
ldi Klo, 0xf6
push Num
clr Count

.LoopDigits2:
add ZL, Length
adc ZH, Count

mov Count, Length
ldi Length, -1
clr Rem

.LoopBytes2:
ld Num, -Z

/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; We only need the carry bit out
adc Q2, Q1 ; This cannot overflow, either
mul Rem, Khi
add Q2, r0
adc Q1, r1

; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1

;; If Q1 == 0, don't set length (and multiply is redundant, too)
breq 1f
sbrc Length, 7 ;; Do we have a length yet?
mov Length, Count ;; Remember position of highest non-zero Q

mul Q1, Hundred
sub Num, r0 ; New remainder
1:
mov Rem, Num
dec Count
brne .LoopBytes2

;; We now have the final base-100 remiander in Rem.
;; Break it apart into two base-10 digits in Q1:Rem
;; (We could use the multiplier again, but that woud require
;; even more registers for the constants and more instructions,
;; and this isn't the inner loop.)
ldi Q1, '0'
3: subi Q1, -3
subi Rem, 30
brcc 3b
4: dec Q1
subi Rem, -10
brcs 4b

subi Rem, -'0'
st X+, Rem
st X+, Q1

;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done.
sbrs Length, 7
rjmp .LoopDigits2

;; Chop off redundant trailing zero
clr __zero_reg__
cpi Q1, '0'
brne 5f
st -X, __zero_reg__ ; Could sbiw, but this is same speed
5: st X, __zero_reg__

; Restore registers
pop Num
pop Klo
pop Khi

movw r24, r26 ; Return end of output
ret
ENDF utoa_bytes

#undef Num
#undef Klo
#undef Khi
#undef Hundred
#undef Rem
#undef Length
#undef Count
#undef Q1
#undef Q2
#undef XL
#undef XH
#undef ZL
#undef ZH
Georg-Johann Lay
2016-12-24 11:42:42 UTC
Permalink
Post by George Spelvin
I mentioned last time trying doing something like this, and now
I have it coded up (as utoa_bytes()).
Input Decimal
bits mem_toa mem_tod itoa nibbles bytes
8 269 220 217 141 143
16 664 462 451 321 273
24 1294 783 838 608 432
32 2059 1219 1167 948 666
40 3059 1775 1649 1395 941
48 4194 2373 2127 1895 1217
56 5477 3069 2733 2459 1551
64 6995 3822 3420 3130 1902
It's larger (120 bytes, vs, 90 for the nibbles code), and has a bit more
startup overhead, but is quite fast. I also have a 122-byte variant
which saves 1 cycle per pair of digits. (1895 cycles for 2^64-1.)
I'm interested in this both for speed, and because it's a good fit to my
suggestion of saving RAM space buffering the converted digits in BCD.
So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?
After all it's you who will provide the final implementation and
testing, hence the final decision of what's appropriate, how much
effort will be put into the implementation, and what the final code
will look like is your decision, IMO.
Post by George Spelvin
Personally, I'm not worried about 30 bytes of code on enhanced AVRs with
a multiplier, and although I haven't coded up the combined algorithm, I
believe the whole thing is smaller than the ultoa_invert code it replaces.
We only have multilib granularity, and there are not so many features
that are related to the flash size. One is __AVR_HAVE_JMP_CALL__ which
applies to devices with >= 16 KiB flash. The next size milestone is
__AVR_HAVE_ELPM__ which means >= 128 KiB. The JMP + CALL looks
reasonable to me; I used it for 64-bit divisions in libgcc (which leads
to the funny situation that 64-bit division might run faster than a
32-bit division for the same values).
Post by George Spelvin
But this is a judgement call, and I'd appreciate some other voices.
For smaller devices (e.g. no CALL but MUL some bytes can be squeezed
out by moving LDI of the constants to the inner loop which saves
PUSH + POP. But on smaller devices, where xprintf is already a
major code consumer, a programmer might prefer something like ulltoa
over the bloat from xprintf.
Post by George Spelvin
#define __zero_reg__ r1
#define __tmp_reg__ r0
.macro DEFUN name
.global \name
.func \name
.endm
.macro ENDF name
.size \name, .-\name
.endfunc
.endm
#if defined (__AVR_HAVE_JMP_CALL__)
#define XCALL call
#define XJMP jmp
#else
#define XCALL rcall
#define XJMP rjmp
#endif
.macro wmov r_dest, r_src
#if defined (__AVR_HAVE_MOVW__)
movw \r_dest, \r_src
#else
mov \r_dest, \r_src
mov \r_dest+1, \r_src+1
#endif
.endm
.text
[...u64toa_nibbles...]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; This works simularly, but in base 100. Every step, we take a
;; remainder <= 99, multiply by 256, and add the next byte (<=255).
;; The result is in the range 0 <= x < 25600.
;;
;; x * 0x29 >> 12 == x/100 for 0 <= x < 1099
;; x * 0x51f >> 17 == x/100 for 0 <= x < 4699
;; x * 0x147b >> 19 == x/100 for 0 <= x < 43699
;; x * 0x28f6 >> 20 == x/100 for 0 <= x < 43699
;;
;; Using a multiplier one bit larger than strictly necessary gives us
;; a more convenient shift amount. This is still small enough that we
;; can compute the high bits of the product we need in only two registers.
;;
;; The largest possible x = 0x63ff also has the largest possible
;; lsbyte, so it will give us the largest possible partial products.
;;
;; FF * F6 = F50A
;; FF * 28-- = 27D8--
;; 63-- * F6 = 5F22--
;; 63-- * 28-- = F78----
;;
;; The important thing is that the sum of the first three partial products
;; is 0x87ef0a, a 24-bit number. So there is no need to propagate
;; carries into the msbyte. We immediately discard the lsbyte of the
;; first partial product, and sum the others into a 2-byte register.
;; Then we throw away the lsbyte of that register and use it for the msbyte
;; of the final sum.
;; extern void utoa_bytes(char *pBuf, void *pNum, uint8_t Length)
;; Length in [1, 127]
#define ZH r31
#define ZL r30
#define XH r27
#define XL r26
#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
Maybe it's a bit easier to grasp if

#define Q3 Q1

and then use Q3 where byte #3 is computed (starting with "clr Q1")
Post by George Spelvin
#define Count r21
#define Length r20 /* Argument */
#define Rem r19
#define Hundred r18 /* Constant 0x64 = 100 */
#define Khi r17 /* Constant 0x28 = 40 */
#define Klo r16 /* Constant 0xf6 = 246 */
#define Num r15
DEFUN utoa_bytes
movw XL, r24 ; pBuf (output) to X
movw ZL, r22 ; pNum (input) to Z
ldi Hundred, 100
push Khi
ldi Khi, 0x28
push Klo
ldi Klo, 0xf6
push Num
clr Count
add ZL, Length
adc ZH, Count
mov Count, Length
ldi Length, -1
clr Rem
ld Num, -Z
/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
Can use "wmov Q1, r0"
Post by George Spelvin
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; We only need the carry bit out
adc Q2, Q1 ; This cannot overflow, either
mul Rem, Khi
add Q2, r0
adc Q1, r1
; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1
;; If Q1 == 0, don't set length (and multiply is redundant, too)
breq 1f
sbrc Length, 7 ;; Do we have a length yet?
mov Length, Count ;; Remember position of highest non-zero Q
mul Q1, Hundred
sub Num, r0 ; New remainder
mov Rem, Num
dec Count
brne .LoopBytes2
;; We now have the final base-100 remiander in Rem.
;; Break it apart into two base-10 digits in Q1:Rem
;; (We could use the multiplier again, but that woud require
;; even more registers for the constants and more instructions,
;; and this isn't the inner loop.)
ldi Q1, '0'
3: subi Q1, -3
subi Rem, 30
brcc 3b
4: dec Q1
subi Rem, -10
brcs 4b
subi Rem, -'0'
st X+, Rem
st X+, Q1
;; Only continue if we determined Length (Length > 0)
;; Length < 0 means all bytes are zero, hence we are done.
sbrs Length, 7
rjmp .LoopDigits2
;; Chop off redundant trailing zero
clr __zero_reg__
cpi Q1, '0'
brne 5f
st -X, __zero_reg__ ; Could sbiw, but this is same speed
5: st X, __zero_reg__
; Restore registers
pop Num
pop Klo
pop Khi
movw r24, r26 ; Return end of output
ret
ENDF utoa_bytes
#undef Num
#undef Klo
#undef Khi
#undef Hundred
#undef Rem
#undef Length
#undef Count
#undef Q1
#undef Q2
#undef XL
#undef XH
#undef ZL
#undef ZH
George Spelvin
2016-12-24 17:59:34 UTC
Permalink
Post by Georg-Johann Lay
Post by George Spelvin
So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?
After all it's you who will provide the final implementation and
testing, hence the final decision of what's appropriate, how much
effort will be put into the implementation, and what the final code
will look like is your decision, IMO.
Well, thank you very much, but after your "that's quite some size
increase" e-mail (and showing me better code than I'd been working on
for a couple of weeks), I'm feeling rather less confident.

(And, despite my asking, nobody's expressed any opinion at all about
my "save RAM by using BCD" suggestion. Brilliant or crackpot?)
Post by Georg-Johann Lay
We only have multilib granularity, and there are not so many features
that are related to the flash size. One is __AVR_HAVE_JMP_CALL__ which
applies to devices with >= 16 KiB flash. The next size milestone is
__AVR_HAVE_ELPM__ which means >= 128 KiB. The JMP + CALL looks
reasonable to me; I used it for 64-bit divisions in libgcc (which leads
to the funny situation that 64-bit division might run faster than a
32-bit division for the same values).
Interesting suggestion. I could just use the multiplierless base-100 code,
which is smaller and still reasonably fast.

And thank you very much! I knew that HAVE_JUMP_CALL meant that RJMP/RCALL
range wasn't enough, which means more than 12 bits of PC (2^13 bytes of
flash), but it had gotten lost in the forest of confusion.

I'm befuddled by all of the different architecture options and don't
understand the difference between most of them. I've been slowly
downloading data sheets for different examples from gcc's list and
looking for differences, but it's a laborious process. (That document
on avr-tiny started out with me documenting my realization that avr1
was something else.)

For example, does MUL support imply MOVW support? (I've been assuming
so, but that's an easy edit.)

And what's the relationship between MOVW support and ADIW/SBIW? Are they
the same feature, or are there processors with one and not the other?


(For aggressive size squeezing, I've realized that a lot of code is wasted
copying pointer return values from X or Z to r24:r25, only to have the
caller copy them right back to use the pointers. It would be lovely to
tell if there were a way to tell gcc "expect the resturn vaue for this
function in r30:r31". And, sometimes, "this function preserves r18-r20".)
Post by Georg-Johann Lay
For smaller devices (e.g. no CALL but MUL some bytes can be squeezed
out by moving LDI of the constants to the inner loop which saves
PUSH + POP. But on smaller devices, where xprintf is already a
major code consumer, a programmer might prefer something like ulltoa
over the bloat from xprintf.
Um...I see how I can swap the Hundred constant around, but Khi/Klo are
both used twice each, so loading them twice would not save as much.
(If I return the end pointer rather than returning r24:r25 to the end
for a call to strrev that's not in the current code, that avoids two
more push/pop anyway.)

The way I have the multiply organized, I have to do the two middle
partial products first, then the low, then the high. I can swap the
middle ones around, but I can't make both constants' uses adjacent.
Post by Georg-Johann Lay
Post by George Spelvin
#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
Maybe it's a bit easier to grasp if
#define Q3 Q1
and then use Q3 where byte #3 is computed (starting with "clr Q1")
I thought about that, but remembering that two names refer to the same
register (and thus you may not rearrange code to overlap their usage)
is also a pain. I originally called them "Qeven" and "Qodd".

Maybe I can just improve the comments...
Post by Georg-Johann Lay
Post by George Spelvin
/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
Can use "wmov Q1, r0"
Ooh, nice! I forgot that movw isn't limited to high registers.

Let me try to improve the comments... do you still think this would
be better with Q3? (It dawned on me that even if the product *isn't*
guaranteed to not overflow, the structure can compute the high half of
a 32-bit product in only two accumulator registers if we add one more
"ADC Q1,Q1".)
Post by Georg-Johann Lay
Post by George Spelvin
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; No longer need Q1; re-use register for Q3
adc Q2, Q1 ; Propagate carry to Q2
;adc Q1, Q1 ; (Omit: no carry possible due to input range)
Post by Georg-Johann Lay
Post by George Spelvin
mul Rem, Khi
add Q2, r0 ; Now byte 2 (hlo, Q2) of 32-bit product
adc Q1, r1 ; Now msbyte (hhi, Q3) of 32-bit product
Post by Georg-Johann Lay
Post by George Spelvin
; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1
Martin McKee
2016-12-28 16:42:04 UTC
Permalink
I find all this fascinating, but I'm really not the one to be commenting on
what the best approach here is. I will say, however, that in many of my
applications, I would be more likely to chose a speed increase over reduced
memory. I tend to live with mostly compute-bound, control applications
though.

I've not had enough time to look at the code to make fine-grained
suggestions (and I'm out of practice with AVR ASM), but I've not felt that
the comments were too bad. I've been able to follow the code as written
easily enough.

Cheers,
Martin Jay McKee
Post by George Spelvin
Post by Georg-Johann Lay
Post by George Spelvin
So now that we have several good candidates, how to proceed?
What size/speed tradeoff should be the final choice?
After all it's you who will provide the final implementation and
testing, hence the final decision of what's appropriate, how much
effort will be put into the implementation, and what the final code
will look like is your decision, IMO.
Well, thank you very much, but after your "that's quite some size
increase" e-mail (and showing me better code than I'd been working on
for a couple of weeks), I'm feeling rather less confident.
(And, despite my asking, nobody's expressed any opinion at all about
my "save RAM by using BCD" suggestion. Brilliant or crackpot?)
Post by Georg-Johann Lay
We only have multilib granularity, and there are not so many features
that are related to the flash size. One is __AVR_HAVE_JMP_CALL__ which
applies to devices with >= 16 KiB flash. The next size milestone is
__AVR_HAVE_ELPM__ which means >= 128 KiB. The JMP + CALL looks
reasonable to me; I used it for 64-bit divisions in libgcc (which leads
to the funny situation that 64-bit division might run faster than a
32-bit division for the same values).
Interesting suggestion. I could just use the multiplierless base-100 code,
which is smaller and still reasonably fast.
And thank you very much! I knew that HAVE_JUMP_CALL meant that RJMP/RCALL
range wasn't enough, which means more than 12 bits of PC (2^13 bytes of
flash), but it had gotten lost in the forest of confusion.
I'm befuddled by all of the different architecture options and don't
understand the difference between most of them. I've been slowly
downloading data sheets for different examples from gcc's list and
looking for differences, but it's a laborious process. (That document
on avr-tiny started out with me documenting my realization that avr1
was something else.)
For example, does MUL support imply MOVW support? (I've been assuming
so, but that's an easy edit.)
And what's the relationship between MOVW support and ADIW/SBIW? Are they
the same feature, or are there processors with one and not the other?
(For aggressive size squeezing, I've realized that a lot of code is wasted
copying pointer return values from X or Z to r24:r25, only to have the
caller copy them right back to use the pointers. It would be lovely to
tell if there were a way to tell gcc "expect the resturn vaue for this
function in r30:r31". And, sometimes, "this function preserves r18-r20".)
Post by Georg-Johann Lay
For smaller devices (e.g. no CALL but MUL some bytes can be squeezed
out by moving LDI of the constants to the inner loop which saves
PUSH + POP. But on smaller devices, where xprintf is already a
major code consumer, a programmer might prefer something like ulltoa
over the bloat from xprintf.
Um...I see how I can swap the Hundred constant around, but Khi/Klo are
both used twice each, so loading them twice would not save as much.
(If I return the end pointer rather than returning r24:r25 to the end
for a call to strrev that's not in the current code, that avoids two
more push/pop anyway.)
The way I have the multiply organized, I have to do the two middle
partial products first, then the low, then the high. I can swap the
middle ones around, but I can't make both constants' uses adjacent.
Post by Georg-Johann Lay
Post by George Spelvin
#define Q2 r23 /* Byte 2 of 4-byte product */
#define Q1 r22 /* Byte 1 (and 3) of 4-byte product */
Maybe it's a bit easier to grasp if
#define Q3 Q1
and then use Q3 where byte #3 is computed (starting with "clr Q1")
I thought about that, but remembering that two names refer to the same
register (and thus you may not rearrange code to overlap their usage)
is also a pain. I originally called them "Qeven" and "Qodd".
Maybe I can just improve the comments...
Post by Georg-Johann Lay
Post by George Spelvin
/* Multiply Rem:Num by Khi:Klo */
mul Num, Khi
mov Q1, r0
mov Q2, r1
Can use "wmov Q1, r0"
Ooh, nice! I forgot that movw isn't limited to high registers.
Let me try to improve the comments... do you still think this would
be better with Q3? (It dawned on me that even if the product *isn't*
guaranteed to not overflow, the structure can compute the high half of
a 32-bit product in only two accumulator registers if we add one more
"ADC Q1,Q1".)
Post by Georg-Johann Lay
Post by George Spelvin
mul Rem, Klo
add Q1, r0
adc Q2, r1 ; Cannot overflow
mul Num, Klo
add Q1, r1
clr Q1 ; No longer need Q1; re-use register for Q3
adc Q2, Q1 ; Propagate carry to Q2
;adc Q1, Q1 ; (Omit: no carry possible due to input
range)
Post by Georg-Johann Lay
Post by George Spelvin
mul Rem, Khi
add Q2, r0 ; Now byte 2 (hlo, Q2) of 32-bit product
adc Q1, r1 ; Now msbyte (hhi, Q3) of 32-bit product
Post by Georg-Johann Lay
Post by George Spelvin
; We now have the high 12 bits of the 28-bit product in Q1:Q2.
; Shift down by 4 bits
andi Q2, 0xf0
or Q1, Q2
swap Q1
;; We now have the new quotient in "Q1".
st Z, Q1
_______________________________________________
AVR-libc-dev mailing list
https://lists.nongnu.org/mailman/listinfo/avr-libc-dev
Georg-Johann Lay
2017-01-01 21:11:27 UTC
Permalink
Post by George Spelvin
Post by Georg-Johann Lay
We only have multilib granularity, and there are not so many features
that are related to the flash size. One is __AVR_HAVE_JMP_CALL__ which
applies to devices with >= 16 KiB flash. The next size milestone is
__AVR_HAVE_ELPM__ which means >= 128 KiB. The JMP + CALL looks
reasonable to me; I used it for 64-bit divisions in libgcc (which leads
to the funny situation that 64-bit division might run faster than a
32-bit division for the same values).
Interesting suggestion. I could just use the multiplierless base-100 code,
which is smaller and still reasonably fast.
And thank you very much! I knew that HAVE_JUMP_CALL meant that RJMP/RCALL
range wasn't enough, which means more than 12 bits of PC (2^13 bytes of
flash), but it had gotten lost in the forest of confusion.
I'm befuddled by all of the different architecture options and don't
understand the difference between most of them. I've been slowly
downloading data sheets for different examples from gcc's list and
looking for differences, but it's a laborious process. (That document
on avr-tiny started out with me documenting my realization that avr1
was something else.)
For example, does MUL support imply MOVW support? (I've been assuming
so, but that's an easy edit.)
gcc sources provide a good overview of features per core, cf.

http://gcc.gnu.org/viewcvs/gcc/trunk/gcc/config/avr/avr-devices.c?revision=243994&view=markup

A bit more comments for the structure which is initialized with
that data:

http://gcc.gnu.org/viewcvs/gcc/trunk/gcc/config/avr/avr-arch.h?revision=243994&view=markup#l51

The devices are here:

http://gcc.gnu.org/viewcvs/gcc/trunk/gcc/config/avr/avr-mcus.def?revision=243994&view=markup

Johann
George Spelvin
2016-12-28 18:40:22 UTC
Permalink
Post by Martin McKee
I find all this fascinating, but I'm really not the one to be commenting on
what the best approach here is. I will say, however, that in many of my
applications, I would be more likely to chose a speed increase over reduced
memory. I tend to live with mostly compute-bound, control applications
though.
Thank you very much, this is very useful! Even if it's just one person's
experience, at least it's *a* data point.

May I ask, whay do you think of my RAM-for-ROM tradeoff idea?
Is there one or the other that you more commonly run out of?
Post by Martin McKee
I've not had enough time to look at the code to make fine-grained
suggestions (and I'm out of practice with AVR ASM), but I've not felt that
the comments were too bad. I've been able to follow the code as written
easily enough.
Yes, my original algorithm was excessively tricky. The last (and
fastest) one is actually a lot simpler.
Martin McKee
2016-12-29 18:58:14 UTC
Permalink
When it comes to RAM vs. ROM, I usually have issues with running out of
RAM. Then again, my applications tend to be very RAM hungry. I am always
keeping large blocks of data in memory. When I do more standard interface
applications, I find that I run out of ROM first. I would say that I
generally find working around lack of RAM easier in my applications. So,
if I have to hit the wall with one, I prefer it to be RAM.

Martin Jay McKee
Post by Martin McKee
Post by Martin McKee
I find all this fascinating, but I'm really not the one to be commenting
on
Post by Martin McKee
what the best approach here is. I will say, however, that in many of my
applications, I would be more likely to chose a speed increase over
reduced
Post by Martin McKee
memory. I tend to live with mostly compute-bound, control applications
though.
Thank you very much, this is very useful! Even if it's just one person's
experience, at least it's *a* data point.
May I ask, whay do you think of my RAM-for-ROM tradeoff idea?
Is there one or the other that you more commonly run out of?
Post by Martin McKee
I've not had enough time to look at the code to make fine-grained
suggestions (and I'm out of practice with AVR ASM), but I've not felt
that
Post by Martin McKee
the comments were too bad. I've been able to follow the code as written
easily enough.
Yes, my original algorithm was excessively tricky. The last (and
fastest) one is actually a lot simpler.
George Spelvin
2016-12-29 20:39:03 UTC
Permalink
Post by Martin McKee
When it comes to RAM vs. ROM, I usually have issues with running out of
RAM. Then again, my applications tend to be very RAM hungry. I am always
keeping large blocks of data in memory. When I do more standard interface
applications, I find that I run out of ROM first. I would say that I
generally find working around lack of RAM easier in my applications. So,
if I have to hit the wall with one, I prefer it to be RAM.
Thank you again! Since nobody else has chimed it, and your preferences
(speed > ROM > RAM, within reason) are compatible with my original
"customer", I'll punt on my RAM-saving ideas, and get the highest-speed
code integrated.
George Spelvin
2017-01-01 22:14:31 UTC
Permalink
Post by Georg-Johann Lay
gcc sources provide a good overview of features per core, cf.
Yes, I found that and have turned it into some notes about the
differences. I intend to combine them with my earlier avr-tiny/avr1
notes into something useful.

But basically, there are two dimensions of memory size and "features".

Memory size is somewhere along the following axis. I'm conflating ROM
and RAM sizes because it's almost a combined order (with xmega5 being
the exception):

- If a device has <= 128 bytes of RAM, then the pointer registers
are only 8 bits wide. SPH is not implemented, and the high bytes of
X, Y and Z are ignored. (This is flagged with __AVR_SP8__, or
its compllement __AVR_HAVE_SPH__.)
- (This is not the same as -mtiny-stack, which is purely a software
convention that the stack will be restricted to a single 256-byte
page, meaning that stack pointer manipulation need not propagate
carries.)
- Devices with <= 8K of ROM only have the RJMP and RCALL instructions.
Devices with >= 16K of ROM implement 2-word JMP and CALL.
(And define __AVR_HAVE_JMP_CALL__)
(Add verbiage about skip bug here.)
- Devices with <= 64K of ROM only have LPM. Devices with >= 128K
of ROM implement the RAMPZ register and ELPM instruction.
(__AVR_HAVE_ELPM__, and maybe __AVR_HAVE_ELPMX__)
- Devices with <= 128K of ROM have 2-byte PCs and only have IJMP/ICALL.
Devices with >= 256K of ROM have 3-byte PC registers (use more stack
and take more time during CALL), and implement the EIND register and
EIJMP/EICALL instructions.
(__AVR_3_BYTE_PC__, __AVR_HAVE_EIJMP_EICALL__)
- Devices with <= 64K of RAM (almost all of them) do not implement
RAMPX/RAMPY/RAMPD. Devices with > 64K of RAM (only xmega5 and xmega7,
using their external memory interfaces) implement those registers,
which provide the high address bits to load-store instructions.

Features go along the second axis. Each level includes all of the
features of the levels before.

- The "classic" models (avr2 and avr3) do not have MOVW and only have
the zero-argument form of LPM (= LPM r0,Z). Note that classic models
with ELPM do exist (avr31).
- The first enhancement is __AVR_HAVE_LPMX__ and __AVR_HAVE_MOVW__
(which are actually synonyms). The MOVW and LPM Rd,Z(+) instructions.
And, if >64K of ROM, the corresponding ELPM instructions.
(These are avr25, avr35, and avr4+).
- The second enhancement is multiply support. (Called __AVR_HAVE_MUL__
and __AVR_ENHANCED__.) Multiply support implies MOVW/LPMX support.
- The third enhancement is the XMEGA features. For software, the
main issues are:
- Register file is no longer memory-mapped.
- Greatly enlarged I/O port range
- Stack pointer writes don't need to disable interrupts; writing the
low half disables interrupts for 4 cycles.
- CCP register to lock important configuration from unwanted change.
- Some instructions take one cycle less.
- The fourth is the XMEGA RMW instructions. Not all XMEGA models have
these, so they're indicated with a separate -mrmw flag.

Here's the chart that makes it all clear:

Classic +MOVW +MUL +XMEGA +RMW
<= 128 RAM (-msp8)
<= 8K ROM avr2 avr25 avr4
16K..64K ROM avr3 avr35 avr5 xmega2
128K ROM avr31 avr51 xmega4
Post by Georg-Johann Lay
= 256K ROM avr6 xmega6
64K RAM xmega7 (-mrmw)
xmega5 is xmega4 plus the >64K RAM features, but not the >= 256K ROM
features.

(And attiny40 is its own special brand of perverse.)

Loading...