Discussion:
[avr-libc-dev] [untested PATCH] Save 11 instructions in vfprintf_flt.o
George Spelvin
2016-12-07 22:08:33 UTC
Permalink
As part of poking around vfprintf.c, I came across this low-hanging fruit.

I'm waiting to test all of my printf changes together, but I thought
I'd throw it out for comment early. I presume this sort of thing is okay?

Basically, by reversing the sense of the FL_FLTUPP flag so it has the same polarity
as bit 5 of the ASCII character set, you can save some code. Both getting the
flag from the format letter, and copying the flag to the output.

The difference is 16 bytes on <= avr31, 20 bytes on avr35, and 22 on the rest:

/----- Before -----\ /------ After -----\
text dec hex text dec hex filename
1876 1876 754 1860 1860 744 ./avr/lib/avr2/vfprintf_flt.o
1876 1876 754 1860 1860 744 ./avr/lib/avr2/tiny-stack/vfprintf_flt.o
1702 1702 6a6 1682 1682 692 ./avr/lib/avr25/vfprintf_flt.o
1702 1702 6a6 1682 1682 692 ./avr/lib/avr25/tiny-stack/vfprintf_flt.o
1942 1942 796 1926 1926 786 ./avr/lib/avr3/vfprintf_flt.o
1948 1948 79c 1932 1932 78c ./avr/lib/avr31/vfprintf_flt.o
1770 1770 6ea 1750 1750 6d6 ./avr/lib/avr35/vfprintf_flt.o
1704 1704 6a8 1682 1682 692 ./avr/lib/avr4/vfprintf_flt.o
1768 1768 6e8 1746 1746 6d2 ./avr/lib/avr5/vfprintf_flt.o
1850 1850 73a 1828 1828 724 ./avr/lib/avr51/vfprintf_flt.o
1850 1850 73a 1828 1828 724 ./avr/lib/avr6/vfprintf_flt.o
1768 1768 6e8 1746 1746 6d2 ./avr/lib/avrxmega2/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega4/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega5/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega6/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega7/vfprintf_flt.o

Here's the diff. (The changes of "/* no break */" to "/* FALLTHROUGH */"
are to silence GCC 7's new fallthrough warning.)

diff --git a/avr-libc/libc/stdio/vfprintf.c b/avr-libc/libc/stdio/vfprintf.c
index 3ba6f9a9..83849432 100644
--- a/avr-libc/libc/stdio/vfprintf.c
+++ b/avr-libc/libc/stdio/vfprintf.c
@@ -114,6 +114,22 @@
})
#endif

+/*
+ * Copy bit (src & smask) to (dst & dmask).
+ *
+ * Unlike "if (src & smask) dst |= dmask", which is also two instructions
+ * and two cycles, this overwrites the destination bit (clearing it
+ * if necessary), and has fewer constraints; it can operate on the low
+ * 16 registers.
+ */
+#define COPYBIT(dst, dmask, src, smask) \
+ asm( "bst %2,%3" \
+ "\n bld %0,%1" \
+ : "=r" (dst) \
+ : "I" (ntz(dmask)), \
+ "r" (src), \
+ "I" (ntz(smask)))
+
/* -------------------------------------------------------------------- */
#if PRINTF_LEVEL <= PRINTF_MIN

@@ -219,7 +235,7 @@ vfprintf (FILE * stream, const char *fmt, va_list ap)
goto ultoa;
case 'p':
flags |= FL_ALT;
- /* no break */
+ /* FALLTHROUGH */
case 'x':
flags |= (FL_ALTHEX | FL_ALTLWR);
base = 16;
@@ -278,7 +294,7 @@ vfprintf (FILE * stream, const char *fmt, va_list ap)
#define FL_ALTUPP FL_PLUS
#define FL_ALTHEX FL_SPACE

-#define FL_FLTUPP FL_ALT
+#define FL_FLTLWR FL_ALT
#define FL_FLTEXP FL_PREC
#define FL_FLTFIX FL_LONG

@@ -367,23 +383,22 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
# error
#endif

-#if PRINTF_LEVEL >= PRINTF_FLT
- if (c >= 'E' && c <= 'G') {
- flags |= FL_FLTUPP;
- c += 'e' - 'E';
- goto flt_oper;
-
- } else if (c >= 'e' && c <= 'g') {
-
+ if ((c >= 'E' && c <= 'G') || (c >= 'e' && c <= 'g')) {
+#if PRINTF_LEVEL < PRINTF_FLT
+ /* Float printf not supported; stub */
+ (void) va_arg (ap, double);
+ buf[0] = '?';
+ goto buf_addr;
+#else
int exp; /* exponent of master decimal digit */
int n;
unsigned char vtype; /* result of float value parse */
unsigned char sign; /* sign character (or 0) */
# define ndigs c /* only for this block, undef is below */

- flags &= ~FL_FLTUPP;
+ COPYBIT(flags, FL_FLTLWR, c, 'e'-'E');
+ c |= 'e' - 'E';

- flt_oper:
if (!(flags & FL_PREC))
prec = 6;
flags &= ~(FL_FLTEXP | FL_FLTFIX);
@@ -434,11 +449,9 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
# if ('I'-'i' != 'N'-'n') || ('I'-'i' != 'F'-'f') || ('I'-'i' != 'A'-'a')
# error
# endif
- while ( (ndigs = pgm_read_byte(p)) != 0) {
- if (flags & FL_FLTUPP)
- ndigs += 'I' - 'i';
+ while ( (ndigs = pgm_read_byte(p++)) != 0) {
+ COPYBIT(ndigs, 'i'-'I', flags, FL_FLTLWR);
putc (ndigs, stream);
- p++;
}
goto tail;
}
@@ -523,7 +536,9 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
}

/* exponent */
- putc (flags & FL_FLTUPP ? 'E' : 'e', stream);
+ ndigs = 'E';
+ COPYBIT(ndigs, 'e'-'E', flags, FL_FLTLWR);
+ putc (ndigs, stream);
ndigs = '+';
if (exp < 0 || (exp == 0 && (vtype & FTOA_CARRY) != 0)) {
exp = -exp;
@@ -538,16 +553,8 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)

goto tail;
# undef ndigs
- }
-
-#else /* to: PRINTF_LEVEL >= PRINTF_FLT */
- if ((c >= 'E' && c <= 'G') || (c >= 'e' && c <= 'g')) {
- (void) va_arg (ap, double);
- buf[0] = '?';
- goto buf_addr;
- }
-
#endif
+ }

{
const char * pnt;
@@ -618,7 +625,7 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
goto ultoa;
case 'p':
flags |= FL_ALT;
- /* no break */
+ /* FALLTHROUGH */
case 'x':
if (flags & FL_ALT)
flags |= FL_ALTHEX;
Georg-Johann Lay
2016-12-08 17:46:34 UTC
Permalink
Post by George Spelvin
As part of poking around vfprintf.c, I came across this low-hanging fruit.
I'm waiting to test all of my printf changes together, but I thought
I'd throw it out for comment early. I presume this sort of thing is okay?
Basically, by reversing the sense of the FL_FLTUPP flag so it has the same polarity
as bit 5 of the ASCII character set, you can save some code. Both getting the
flag from the format letter, and copying the flag to the output.
/----- Before -----\ /------ After -----\
text dec hex text dec hex filename
1876 1876 754 1860 1860 744 ./avr/lib/avr2/vfprintf_flt.o
1876 1876 754 1860 1860 744 ./avr/lib/avr2/tiny-stack/vfprintf_flt.o
1702 1702 6a6 1682 1682 692 ./avr/lib/avr25/vfprintf_flt.o
1702 1702 6a6 1682 1682 692 ./avr/lib/avr25/tiny-stack/vfprintf_flt.o
1942 1942 796 1926 1926 786 ./avr/lib/avr3/vfprintf_flt.o
1948 1948 79c 1932 1932 78c ./avr/lib/avr31/vfprintf_flt.o
1770 1770 6ea 1750 1750 6d6 ./avr/lib/avr35/vfprintf_flt.o
1704 1704 6a8 1682 1682 692 ./avr/lib/avr4/vfprintf_flt.o
1768 1768 6e8 1746 1746 6d2 ./avr/lib/avr5/vfprintf_flt.o
1850 1850 73a 1828 1828 724 ./avr/lib/avr51/vfprintf_flt.o
1850 1850 73a 1828 1828 724 ./avr/lib/avr6/vfprintf_flt.o
1768 1768 6e8 1746 1746 6d2 ./avr/lib/avrxmega2/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega4/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega5/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega6/vfprintf_flt.o
1838 1838 72e 1816 1816 718 ./avr/lib/avrxmega7/vfprintf_flt.o
Here's the diff. (The changes of "/* no break */" to "/* FALLTHROUGH */"
are to silence GCC 7's new fallthrough warning.)
diff --git a/avr-libc/libc/stdio/vfprintf.c b/avr-libc/libc/stdio/vfprintf.c
index 3ba6f9a9..83849432 100644
--- a/avr-libc/libc/stdio/vfprintf.c
+++ b/avr-libc/libc/stdio/vfprintf.c
@@ -114,6 +114,22 @@
})
#endif
+/*
+ * Copy bit (src & smask) to (dst & dmask).
+ *
+ * Unlike "if (src & smask) dst |= dmask", which is also two instructions
This is confusing because the BST + BLD code below is not a replacement
for what the C code is indicating. For example the C code never clears
the bit as opposed to BLD.
Post by George Spelvin
+ * and two cycles, this overwrites the destination bit (clearing it
+ * if necessary), and has fewer constraints; it can operate on the low
+ * 16 registers.
+ */
+#define COPYBIT(dst, dmask, src, smask) \
+ asm( "bst %2,%3" \
+ "\n bld %0,%1" \
+ : "=r" (dst) \
This is wrong because the old value of dst does not die here:
all bits except %1 are surviving. Correct constraint is "+r".
Post by George Spelvin
+ : "I" (ntz(dmask)), \
+ "r" (src), \
+ "I" (ntz(smask)))
+
/* -------------------------------------------------------------------- */
#if PRINTF_LEVEL <= PRINTF_MIN
@@ -219,7 +235,7 @@ vfprintf (FILE * stream, const char *fmt, va_list ap)
goto ultoa;
flags |= FL_ALT;
- /* no break */
+ /* FALLTHROUGH */
flags |= (FL_ALTHEX | FL_ALTLWR);
base = 16;
@@ -278,7 +294,7 @@ vfprintf (FILE * stream, const char *fmt, va_list ap)
#define FL_ALTUPP FL_PLUS
#define FL_ALTHEX FL_SPACE
-#define FL_FLTUPP FL_ALT
+#define FL_FLTLWR FL_ALT
#define FL_FLTEXP FL_PREC
#define FL_FLTFIX FL_LONG
@@ -367,23 +383,22 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
# error
#endif
-#if PRINTF_LEVEL >= PRINTF_FLT
- if (c >= 'E' && c <= 'G') {
- flags |= FL_FLTUPP;
- c += 'e' - 'E';
- goto flt_oper;
-
- } else if (c >= 'e' && c <= 'g') {
-
+ if ((c >= 'E' && c <= 'G') || (c >= 'e' && c <= 'g')) {
+#if PRINTF_LEVEL < PRINTF_FLT
+ /* Float printf not supported; stub */
+ (void) va_arg (ap, double);
+ buf[0] = '?';
+ goto buf_addr;
+#else
int exp; /* exponent of master decimal digit */
int n;
unsigned char vtype; /* result of float value parse */
unsigned char sign; /* sign character (or 0) */
# define ndigs c /* only for this block, undef is below */
- flags &= ~FL_FLTUPP;
+ COPYBIT(flags, FL_FLTLWR, c, 'e'-'E');
+ c |= 'e' - 'E';
if (!(flags & FL_PREC))
prec = 6;
flags &= ~(FL_FLTEXP | FL_FLTFIX);
@@ -434,11 +449,9 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
# if ('I'-'i' != 'N'-'n') || ('I'-'i' != 'F'-'f') || ('I'-'i' != 'A'-'a')
# error
# endif
- while ( (ndigs = pgm_read_byte(p)) != 0) {
- if (flags & FL_FLTUPP)
- ndigs += 'I' - 'i';
+ while ( (ndigs = pgm_read_byte(p++)) != 0) {
+ COPYBIT(ndigs, 'i'-'I', flags, FL_FLTLWR);
putc (ndigs, stream);
- p++;
}
goto tail;
}
@@ -523,7 +536,9 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
}
/* exponent */
- putc (flags & FL_FLTUPP ? 'E' : 'e', stream);
+ ndigs = 'E';
+ COPYBIT(ndigs, 'e'-'E', flags, FL_FLTLWR);
+ putc (ndigs, stream);
ndigs = '+';
if (exp < 0 || (exp == 0 && (vtype & FTOA_CARRY) != 0)) {
exp = -exp;
@@ -538,16 +553,8 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
goto tail;
# undef ndigs
- }
-
-#else /* to: PRINTF_LEVEL >= PRINTF_FLT */
- if ((c >= 'E' && c <= 'G') || (c >= 'e' && c <= 'g')) {
- (void) va_arg (ap, double);
- buf[0] = '?';
- goto buf_addr;
- }
-
#endif
+ }
{
const char * pnt;
@@ -618,7 +625,7 @@ int vfprintf (FILE * stream, const char *fmt, va_list ap)
goto ultoa;
flags |= FL_ALT;
- /* no break */
+ /* FALLTHROUGH */
if (flags & FL_ALT)
flags |= FL_ALTHEX;
_______________________________________________
AVR-libc-dev mailing list
https://lists.nongnu.org/mailman/listinfo/avr-libc-dev
George Spelvin
2016-12-08 18:04:47 UTC
Permalink
Post by Georg-Johann Lay
Post by George Spelvin
+ * Unlike "if (src & smask) dst |= dmask", which is also two instructions
This is confusing because the BST + BLD code below is not a replacement
for what the C code is indicating. For example the C code never clears
the bit as opposed to BLD.
Post by George Spelvin
+ * and two cycles, this overwrites the destination bit (clearing it
+ * if necessary), and has fewer constraints; it can operate on the low
+ * 16 registers.
That's *exactly* what I was trying to say in the rest of the sentence you
quoted back to me! Perhaps I should just give the equivalent C code.
Post by Georg-Johann Lay
+ */
+#define COPYBIT(dst, dmask, src, smask) \
+ asm( "bst %2,%3" \
+ "\n bld %0,%1" \
+ : "=r" (dst) \
all bits except %1 are surviving. Correct constraint is "+r".
Yes, I notice that myself a few minutes after posting. It didn't
seem earth-shaking enough to warrant a followup.

It's now:

/*
* Copy bit (src & smask) to (dst & dmask). This expands to a pair of
* bst/bld instructions which transfer the bit via the T register.
*
* Equivalent to "dst &= ~dmask; if (src & smask) dst |= dmask;", but is
* only two instructions and has fewer constraints; it can operate on
* the low 16 registers.
*/
#ifdef __BUILTIN_AVR_INSERT_BITS
/*
* Using a GCC builtin is preferable; it gives the optimizer more
* information and saves a few more bytes.
*
* The first argument to __builtin_avr_insert_bits is an array of 8
* nibbles, each of which indicates the source of the corresponding
* bit of the resilt. All are 0xf (return the corresponding dst bit
* unchanged) except the one corresponding to dmask, which specifies
* the bit position in src to copy from.
*/
#define COPYBIT(dst, dmask, src, smask) \
((dst) = __builtin_avr_insert_bits( \
~((15ul-ntz(smask))*(dmask)*(dmask)*(dmask)*(dmask)), \
src, dst))
#else
#define COPYBIT(dst, dmask, src, smask) \
asm( "bst %2,%3" \
"\n bld %0,%1" \
: "+r" (dst) \
: "n" (ntz(dmask)), \
"r" (src), \
"n" (ntz(smask)))
#endif

Loading...