[avr-libc-dev] Interested in 64-bit printf support?

Discussion:

George Spelvin

2016-12-04 22:53:06 UTC

The developers of the TICC time interval counter are having
a problem printing 64-bit integers:
https://github.com/TAPR/TICC/blob/master/TO-DO.txt

I went to work figuring out how to do that conversion, and after some
false starts, came up with some quite efficient arbitrary-precision
printing code. It's small, fast, and works for any number of bytes.
(Implementation currently limited to 255 bytes.)

The code, in its current state, is appended; it's been tested extensively
on x86 and more lightly on simulavr. Any comments from experienced AVR
programmers is definitely appreciated. Like the current __ultoa_invert,
it formats the number backward in a caller-provided buffer.

It's (currently) slightly larger, but slightly faster than __ultoa_invert.
Timing in cycles for a simulated atmega1280:

Input genprint __ultoa_invert
0 160
0xff 316 480
0xffff 584 792
0xffffff 1005 1260
0xffffffff 1434 1572
0xffffffffff 2024
48 ones 2626
56 ones 3286
64 ones 4103

I could just pass this over to the TICC project, but is there any interest
in me making the necessary overhauls to vfprintf to incorporate this?

I'd have to change vfprintf to pass around a pointer and length internally
rather than copying the value. The printing code requires the input
in a modifiable little-endian buffer, but that's how varargs works on
AVR anyway. (Adding the hex/octal and negative number handling is quite
simple.)

If I get really sneaky, I could eliminate the output buffer by having
it push the formatted number on the stack and call down to an output
routine that supplies the necessary prefix padding once the length is
known. But that might not be doable in C unless there's a way to force
allocation of a stack frame.

(If anyone cares, I have some other code for 16- and 32-bit numbers based
on the ideas at http://homepage.divms.uiowa.edu/~jones/bcd/decimal.html.
It converts 2 digits at a time, using a 100-byte binary-to-BCD lookup
table, but is a lot larger, extending it to 64 bits is a mess, and doing
without a hardware multiplier bloats it with shift-and-add sequences.)

#if __AVR_ARCH__
typedef _Bool bool;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
#define false (bool)0
#define true (bool)1
#else
#include <stdbool.h>
#include <stdint.h>
#endif
/*
* Print an arbitrary buffer of bytes. This uses only byte operations
* and only multiplies by the constant 0x33, so is suitable for an
* 8-bit microcontroller.
*
* The basic operation involves three steps:
* 1) Sum the bytes mod 255 to find the remainder mod 5. Combine
* this with the lsbit to compute the least significant digit.
* 2) Subtract this from the number
* 3) Then do a 1-bit shift and multiply by 0xcc..cccd to accomplish a
* divide by 10.
*
* Because of the simple pattern of that multiplier (it's -0x33333...),
* it can be managed in O(n) time and O(1) additional space.
*/

/* Reduce a byte mod 5. */
static uint8_t __attribute__((const))
mod5(uint8_t x)
{
#if __AVR_ARCH__
uint8_t tmp;
/* Reduce x mod 15. We add the high halves, to get a carry. */
asm( "mov __tmp_reg__,%0"
"\n swap __tmp_reg__"
"\n cbr %0,15"
"\n add %0,__tmp_reg__"
"\n cbr %0,15"
"\n swap %0" /* Swap and add carry bit to low */
"\n adc %0,__zero_reg__" /* End-around carry */
: "+d" (x) :: "r0");
#else
x = (x >> 4) + (x & 0xf); /* 1 <= x <= 30 */
x = (x >> 4) + (x & 0xf); /* 1 <= x <= 15 */
#endif
/* 1 <= x <= 15; now for final reduction mod 5 */
if (x >= 10)
x -= 10; /* 0 <= x <= 9 */
if (x >= 5)
x -= 5; /* 0 <= x <= 4 */
return x;
}

/*
* Convert the arbitrary-precision number in bin[0..len-1] from
* little-endian binary to to decimal. The length can be up to 127 bytes
* (but the algorithm can go forever).
* The digits are written little-endian, so they must be reversed
* for output. A pointer to the end of the formatted number is returned.
* (The caller is responsible for ensuring the output is big enough.)
*/
char *
genprint(char *out, uint8_t *bin, uint8_t len)
{
uint8_t digit;

/* This shouldn't happen, but don't crash */
if (!len)
return out;
/*
* Pre-pass: strip trailing zero bytes, but not the last.
* Also leave "bin" pointing to the end of the input.
*/
bin += len;
while (!*--bin) {
if (--len)
continue;
len++;
break;
}
bin++; /* Leave bin pointing just past end of input */

do {
uint16_t accum;
uint8_t i = len;
bool lsbit = false;

/*
* Pass 1, msb-to-lsb: divide bin[] by 2, and return the
* value mod 10 (the new value mod 5, combined with the
* previous lsbit).
*/
digit = 255; /* Could also be zero */
#if __AVR_ARCH__
/*
* The one implementation hassle is the need for two
* carry bits. One for the shift, and the other for
* the end-around carries modulo 255.
*
* Unfamiliar AVR inline asm feature: given a memory
* constraint, %2 prints X, Y or Z. %E2 and %F2 are the
* low and high registers of the pointer, respectively.
* So if %2 = Z, then %E2 = r30 and %F2 = r31.
*/
asm(""
"\n1: ld __tmp_reg__,-%4"
"\n lsr %1" /* lsbit to carry bit */
"\n ror __tmp_reg__"
"\n st %4,__tmp_reg__"
"\n rol %1" /* carry bit to lsbit */
"\n add %0,__tmp_reg__"
"\n adc %0,__zero_reg__" /* End-around carry */
"\n dec %2"
"\n brne 1b"
"\n.ifnc %A3,%E4"
"\n .error \"GCC constraints not working as expected.\""
"\n.endif"
"\n.ifnc %B3,%F4"
"\n .error \"GCC constraints not working as expected.\""
"\n.endif"
: "+&r" (digit), "+&r" (lsbit), "+&r" (i), "+e" (bin)
: "m" (*bin)
: "r0");
#else
do {
uint8_t byte = *--bin;
uint16_t t = digit;

t += *bin = lsbit << 7 | byte >> 1;
lsbit = byte & 1;
digit = (t >> 8) + (t & 0xff);
} while (--i);
#endif
digit = mod5(digit);
*out++ = digit + digit + lsbit + '0';

/*
* Pass 2, lsb-to-msb: the exciting part!
*
* We have to subtract the digit, then multiply by
* 0xCCCC...CD to achieve a divide by 5. We actually
* multiply by 0x333...33, and negate. To do the negate,
* we don't complement and add 1, but rather subtract
* 1 and then complement. That works because then the
* subtraction can be combined with the accumulation
* for the multiply.
*
* For each byte, we multiply it by 0x33, and add that
* to the "value to be stored in all higher bytes"
* accumulator. The accumulator needs to be 16 bits, but
* we can prevent overflow by adding the msbyte to the
* lsbyte after each iteration.
*
* We can also use this to do the necessary subtracts.
* To subtract 1, we just initialize the accumulator
* to 0xff. To subtract the digit without having a
* separate carry-propagation pass through bin[], we
* further subtract 0x33 times the digit.
* (Since digit <= 4, this fits into 1 byte.)
*/
if (digit > 4)
__builtin_unreachable();
accum = 255 - (uint8_t)(digit * 0x33);

i = len;
do {
accum += 0x33 * *bin;
*bin++ = digit = ~(uint8_t)accum;
accum = (accum >> 8) + (accum & 0xff);
} while (--i);

/* Decrease len as high bytes become zero */
} while (digit || (--bin, --len));
return out;
}

#if __AVR_ARCH__
#define READ_PORT 0x38
#define WRITE_PORT 0x39
#define EXIT_PORT 0x3a

static uint8_t inline __attribute__((always_inline))
inb(uint8_t port)
{
uint8_t x;
asm("in %0,%1" : "=r" (x) : "I" (port));
return x;
}

static void inline __attribute__((always_inline))
outb(uint8_t port, uint8_t x)
{
asm("out %0,%1" :: "I" (port), "r" (x));
}

static void
putch(char c)
{
outb(WRITE_PORT, c);
}

static void
revwrite(char const *p, uint8_t len)
{
while (len--)
putch(*--p);
}

static void
revput(char *buf, uint8_t *bin, uint8_t len)
{
char const *p = genprint(buf, bin, len);
revwrite(p, p-buf);
putch('\n');
}

static void
putst(char const __flash *p)
{
char c;
while ((c = *p++) != 0)
putch(c);
putch('\n');
}

static void __attribute__((noreturn))
my_exit(uint8_t status)
{
outb(EXIT_PORT, status);
__builtin_unreachable();
}

int
main(void)
{
unsigned i;
char buf[21];
union {
unsigned i;
uint8_t b[2];
} icopy;
uint8_t bin[8];

static char const __flash hello[] = "Hello, world!";

putst(hello);

for (i = 0; i < 8; i++) {
int j;
for (j = 0; j < 8; j++)
bin[j] = 0xff;

revput(buf, bin, i+1);
}

for (i = 0; i < 100; i++) {
icopy.i = i;
revput(buf, icopy.b, 2);
}
my_exit(0);
}

#else
static bool __attribute__((pure))
revcmp(char const *a, char const *b, unsigned len)
{
while (len--)
if (*a++ != b[len])
return false;
return true;
}

#include <stdio.h>

static bool
test(uint64_t x)
{
char buf1[21], buf2[21];
int len = sprintf(buf1, "%llu", x);
union {
uint64_t x;
uint8_t buf[8];
} u = { x };
char *p = genprint(buf2, u.buf, 8);

*p = '\0';
//printf("Result: %u -> '%s' vs '%s'\n", x, buf1, buf2);
if (p - buf2 == len && revcmp(buf1, buf2, len))
return true;
printf("Mismatch: %llu (%#llx) -> '%s' vs '%s'\n", x, x, buf1, buf2);
return false;
}

int
main(void)
{
unsigned i;

for (i = 0; i < 10000000; i++) {
//printf("Testing %u\n", i);
if (!test(i))
return 1;
if (!test(~i))
return 1;
if (!test(~(uint64_t)i))
return 1;
}
printf("All succeeded, up to %u\n", i);
return 0;
}

#endif /* !__AVR_ARCH__ */

George Spelvin

2016-12-05 19:26:28 UTC