diff --git a/NEWS b/NEWS index 9b749df0d..6e4150ed2 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,10 @@ Any uppercase BUG_* names are modernish shell bug IDs. Note that to use these options the operating system must support the corresponding resource limit. +- Ported performance optimizations from illumos to improve the performance + of the cksum builtin. (Note that the cksum builtin is not enabled by + default.) + 2021-12-27: - Two bash-like flags for 'whence' were backported from ksh 93v-: diff --git a/src/lib/libsum/Mamfile b/src/lib/libsum/Mamfile index 729507d9e..dd8eeaa50 100644 --- a/src/lib/libsum/Mamfile +++ b/src/lib/libsum/Mamfile @@ -88,14 +88,10 @@ make install prev ${PACKAGE_ast_INCLUDE}/ast_common.h implicit done ${PACKAGE_ast_INCLUDE}/fnv.h done sum-prng.c - make sum-crc.c implicit - done sum-crc.c make sum-bsd.c implicit done sum-bsd.c make sum-ast4.c implicit done sum-ast4.c - make sum-att.c implicit - done sum-att.c make FEATURE/sum implicit meta FEATURE/sum features/%>FEATURE/% features/sum sum make features/sum @@ -171,6 +167,12 @@ make install prev ${PACKAGE_ast_INCLUDE}/ast_std.h implicit done ${PACKAGE_ast_INCLUDE}/ast.h dontcare done sum.h + make sum-att.c implicit + prev ${PACKAGE_ast_INCLUDE}/ast.h implicit + done sum-att.c + make sum-crc.c implicit + prev ${PACKAGE_ast_INCLUDE}/ast.h implicit + done sum-crc.c done sumlib.c meta sumlib.o %.c>%.o sumlib.c sumlib prev sumlib.c diff --git a/src/lib/libsum/sum-att.c b/src/lib/libsum/sum-att.c index 6a2c56165..0a2ca85f9 100644 --- a/src/lib/libsum/sum-att.c +++ b/src/lib/libsum/sum-att.c @@ -23,6 +23,8 @@ * att */ +#include <ast.h> + #define att_description \ "The system 5 release 4 checksum. This is the default for \bsum\b \ when \bgetconf UNIVERSE\b is \batt\b. 
This is the only true sum; \
@@ -35,6 +37,79 @@
 #define att_data	long_data
 #define att_scale	512
 
+#if !(defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))) && \
+    (defined(__SUNPRO_C) || (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))))
+
+#if defined(__SUNPRO_C)
+# include <sun_prefetch.h>
+# define sum_prefetch(addr) sun_prefetch_read_many((void *)(addr))
+#elif defined(__GNUC__)
+# define sum_prefetch(addr) __builtin_prefetch((addr), 0, 3)
+#endif
+
+#define CBLOCK_SIZE (64)
+#if !defined(__clang__)
+#pragma unroll(16)
+#endif
+
+/*
+ * Add the n bytes at s to the running byte sum stored in p.
+ * While more than CBLOCK_SIZE bytes remain, the bytes are summed through
+ * eight independent accumulators and the next block is prefetched;
+ * whatever is left is then added one byte at a time. Always returns 0.
+ */
+/* Inmos transputer would love this algorithm */
+static int
+att_block(register Sum_t* p, const void* s, size_t n)
+{
+	register uint32_t c = ((Integral_t*)p)->sum;
+	register const unsigned char* b = (const unsigned char*)s;
+	register const unsigned char* e = b + n;
+	register uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
+	register unsigned int i;
+
+	s0=s1=s2=s3=s4=s5=s6=s7=0U;
+
+	sum_prefetch((void *)b);
+
+	while (n > CBLOCK_SIZE)
+	{
+		sum_prefetch((b+CBLOCK_SIZE));
+
+		/* Compiler will unroll for() loops per #pragma unroll */
+#if defined(__clang__)
+		#pragma clang loop unroll_count(16)
+#endif
+		for (i=0 ; i < (CBLOCK_SIZE/8) ; i++)
+		{
+			/*
+			 * use s0-s7 to decouple calculations (this improves pipelining)
+			 * because each operation is completely independent from its
+			 * siblings
+			 */
+			s0+=b[0];
+			s1+=b[1];
+			s2+=b[2];
+			s3+=b[3];
+			s4+=b[4];
+			s5+=b[5];
+			s6+=b[6];
+			s7+=b[7];
+
+			b+=8;
+			n-=8;
+		}
+	}
+
+	c+=s0+s1+s2+s3+s4+s5+s6+s7;
+
+	while (b < e)
+		c += *b++;
+	((Integral_t*)p)->sum = c;
+	return 0;
+}
+
+#else
 static int
 att_block(register Sum_t* p, const void* s, size_t n)
 {
@@ -47,6 +122,7 @@ att_block(register Sum_t* p, const void* s, size_t n)
 	((Integral_t*)p)->sum = c;
 	return 0;
 }
+#endif
 
 static int
 att_done(Sum_t* p)
diff --git a/src/lib/libsum/sum-crc.c b/src/lib/libsum/sum-crc.c
index 9dfa3ad70..97fe30c2e 100644
--- a/src/lib/libsum/sum-crc.c
+++ 
b/src/lib/libsum/sum-crc.c
@@ -23,6 +23,8 @@
  * crc
  */
 
+#include <ast.h>
+
 #define crc_description \
 	"32 bit CRC (cyclic redundancy check)."
 #define crc_options	"\
@@ -48,7 +50,8 @@ typedef struct Crc_s
 	Crcnum_t		init;
 	Crcnum_t		done;
 	Crcnum_t		xorsize;
-	Crcnum_t		tab[256];
+	const Crcnum_t		*tab; /* use |const| to give the compiler a hint that the data won't change */
+	Crcnum_t		tabdata[256]; /* table storage filled in by crc_open() when the precomputed table is not used */
 	unsigned int		addsize;
 	unsigned int		rotate;
 } Crc_t;
@@ -56,6 +59,62 @@ typedef struct Crc_s
 #define CRC(p,s,c)		(s = (s >> 8) ^ (p)->tab[(s ^ (c)) & 0xff])
 #define CRCROTATE(p,s,c)	(s = (s << 8) ^ (p)->tab[((s >> 24) ^ (c)) & 0xff])
 
+static const
+Crcnum_t posix_cksum_tab[256] = { /* precomputed table crc_open() installs for "crc-0x04c11db7-rotate-done-size" */
+	0x00000000U,
+	0x04c11db7U, 0x09823b6eU, 0x0d4326d9U, 0x130476dcU, 0x17c56b6bU,
+	0x1a864db2U, 0x1e475005U, 0x2608edb8U, 0x22c9f00fU, 0x2f8ad6d6U,
+	0x2b4bcb61U, 0x350c9b64U, 0x31cd86d3U, 0x3c8ea00aU, 0x384fbdbdU,
+	0x4c11db70U, 0x48d0c6c7U, 0x4593e01eU, 0x4152fda9U, 0x5f15adacU,
+	0x5bd4b01bU, 0x569796c2U, 0x52568b75U, 0x6a1936c8U, 0x6ed82b7fU,
+	0x639b0da6U, 0x675a1011U, 0x791d4014U, 0x7ddc5da3U, 0x709f7b7aU,
+	0x745e66cdU, 0x9823b6e0U, 0x9ce2ab57U, 0x91a18d8eU, 0x95609039U,
+	0x8b27c03cU, 0x8fe6dd8bU, 0x82a5fb52U, 0x8664e6e5U, 0xbe2b5b58U,
+	0xbaea46efU, 0xb7a96036U, 0xb3687d81U, 0xad2f2d84U, 0xa9ee3033U,
+	0xa4ad16eaU, 0xa06c0b5dU, 0xd4326d90U, 0xd0f37027U, 0xddb056feU,
+	0xd9714b49U, 0xc7361b4cU, 0xc3f706fbU, 0xceb42022U, 0xca753d95U,
+	0xf23a8028U, 0xf6fb9d9fU, 0xfbb8bb46U, 0xff79a6f1U, 0xe13ef6f4U,
+	0xe5ffeb43U, 0xe8bccd9aU, 0xec7dd02dU, 0x34867077U, 0x30476dc0U,
+	0x3d044b19U, 0x39c556aeU, 0x278206abU, 0x23431b1cU, 0x2e003dc5U,
+	0x2ac12072U, 0x128e9dcfU, 0x164f8078U, 0x1b0ca6a1U, 0x1fcdbb16U,
+	0x018aeb13U, 0x054bf6a4U, 0x0808d07dU, 0x0cc9cdcaU, 0x7897ab07U,
+	0x7c56b6b0U, 0x71159069U, 0x75d48ddeU, 0x6b93dddbU, 0x6f52c06cU,
+	0x6211e6b5U, 0x66d0fb02U, 0x5e9f46bfU, 0x5a5e5b08U, 0x571d7dd1U,
+	0x53dc6066U, 0x4d9b3063U, 0x495a2dd4U, 0x44190b0dU, 0x40d816baU,
+	0xaca5c697U, 0xa864db20U, 0xa527fdf9U, 0xa1e6e04eU, 0xbfa1b04bU,
+	0xbb60adfcU, 0xb6238b25U, 0xb2e29692U, 0x8aad2b2fU, 0x8e6c3698U,
+	0x832f1041U, 0x87ee0df6U, 0x99a95df3U, 0x9d684044U, 0x902b669dU,
+	0x94ea7b2aU, 0xe0b41de7U, 0xe4750050U, 0xe9362689U, 0xedf73b3eU,
+	0xf3b06b3bU, 0xf771768cU, 0xfa325055U, 0xfef34de2U, 0xc6bcf05fU,
+	0xc27dede8U, 0xcf3ecb31U, 0xcbffd686U, 0xd5b88683U, 0xd1799b34U,
+	0xdc3abdedU, 0xd8fba05aU, 0x690ce0eeU, 0x6dcdfd59U, 0x608edb80U,
+	0x644fc637U, 0x7a089632U, 0x7ec98b85U, 0x738aad5cU, 0x774bb0ebU,
+	0x4f040d56U, 0x4bc510e1U, 0x46863638U, 0x42472b8fU, 0x5c007b8aU,
+	0x58c1663dU, 0x558240e4U, 0x51435d53U, 0x251d3b9eU, 0x21dc2629U,
+	0x2c9f00f0U, 0x285e1d47U, 0x36194d42U, 0x32d850f5U, 0x3f9b762cU,
+	0x3b5a6b9bU, 0x0315d626U, 0x07d4cb91U, 0x0a97ed48U, 0x0e56f0ffU,
+	0x1011a0faU, 0x14d0bd4dU, 0x19939b94U, 0x1d528623U, 0xf12f560eU,
+	0xf5ee4bb9U, 0xf8ad6d60U, 0xfc6c70d7U, 0xe22b20d2U, 0xe6ea3d65U,
+	0xeba91bbcU, 0xef68060bU, 0xd727bbb6U, 0xd3e6a601U, 0xdea580d8U,
+	0xda649d6fU, 0xc423cd6aU, 0xc0e2d0ddU, 0xcda1f604U, 0xc960ebb3U,
+	0xbd3e8d7eU, 0xb9ff90c9U, 0xb4bcb610U, 0xb07daba7U, 0xae3afba2U,
+	0xaafbe615U, 0xa7b8c0ccU, 0xa379dd7bU, 0x9b3660c6U, 0x9ff77d71U,
+	0x92b45ba8U, 0x9675461fU, 0x8832161aU, 0x8cf30badU, 0x81b02d74U,
+	0x857130c3U, 0x5d8a9099U, 0x594b8d2eU, 0x5408abf7U, 0x50c9b640U,
+	0x4e8ee645U, 0x4a4ffbf2U, 0x470cdd2bU, 0x43cdc09cU, 0x7b827d21U,
+	0x7f436096U, 0x7200464fU, 0x76c15bf8U, 0x68860bfdU, 0x6c47164aU,
+	0x61043093U, 0x65c52d24U, 0x119b4be9U, 0x155a565eU, 0x18197087U,
+	0x1cd86d30U, 0x029f3d35U, 0x065e2082U, 0x0b1d065bU, 0x0fdc1becU,
+	0x3793a651U, 0x3352bbe6U, 0x3e119d3fU, 0x3ad08088U, 0x2497d08dU,
+	0x2056cd3aU, 0x2d15ebe3U, 0x29d4f654U, 0xc5a92679U, 0xc1683bceU,
+	0xcc2b1d17U, 0xc8ea00a0U, 0xd6ad50a5U, 0xd26c4d12U, 0xdf2f6bcbU,
+	0xdbee767cU, 0xe3a1cbc1U, 0xe760d676U, 0xea23f0afU, 0xeee2ed18U,
+	0xf0a5bd1dU, 0xf464a0aaU, 0xf9278673U, 0xfde69bc4U, 0x89b8fd09U,
+	0x8d79e0beU, 0x803ac667U, 0x84fbdbd0U, 0x9abc8bd5U, 0x9e7d9662U,
+	0x933eb0bbU, 0x97ffad0cU, 0xafb010b1U, 0xab710d06U, 0xa6322bdfU,
+	0xa2f33668U, 0xbcb4666dU, 0xb8757bdaU, 0xb5365d03U, 0xb1f740b4U
+};
+
 static Sum_t*
 crc_open(const Method_t* method, const char* name)
 {
@@ -73,6 +132,20 @@ crc_open(const Method_t* method, const char* name)
 		sum->method = (Method_t*)method;
 		sum->name = name;
 	}
+
+	if(!strcmp(name, "crc-0x04c11db7-rotate-done-size"))
+	{
+		sum->init=0;
+		sum->done=0xffffffff;
+		sum->xorsize=0x0;
+		sum->addsize=0x1;
+		sum->rotate=1;
+
+		/* Optimized codepath for POSIX cksum to save startup time */
+		sum->tab=posix_cksum_tab;
+	}
+	else
+	{
 	polynomial = 0xedb88320;
 	s = name;
 	while (*(t = s))
@@ -106,7 +179,7 @@ crc_open(const Method_t* method, const char* name)
 		p[0] = polynomial;
 		for (i = 1; i < 8; i++)
 			p[i] = (p[i-1] << 1) ^ ((p[i-1] & 0x80000000) ? polynomial : 0);
-		for (i = 0; i < elementsof(sum->tab); i++)
+		for (i = 0; i < elementsof(sum->tabdata); i++)
 		{
 			t = 0;
 			x = i;
@@ -116,19 +189,23 @@ crc_open(const Method_t* method, const char* name)
 					t ^= p[j];
 				x >>= 1;
 			}
-			sum->tab[i] = t;
+			sum->tabdata[i] = t;
 		}
 	}
 	else
 	{
-		for (i = 0; i < elementsof(sum->tab); i++)
+		for (i = 0; i < elementsof(sum->tabdata); i++)
 		{
 			x = i;
 			for (j = 0; j < 8; j++)
 				x = (x>>1) ^ ((x & 1) ? 
polynomial : 0);
-			sum->tab[i] = x;
+			sum->tabdata[i] = x;
 		}
 	}
+		/* both branches above build the table in tabdata; publish it for either one */
+		sum->tab=sum->tabdata;
+	}
+
 	return (Sum_t*)sum;
 }
 
@@ -141,6 +218,86 @@ crc_init(Sum_t* p)
 	return 0;
 }
 
+#if !(defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))) && \
+    (defined(__SUNPRO_C) || (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))))
+
+#if defined(__SUNPRO_C)
+# include <sun_prefetch.h>
+# define sum_prefetch(addr) sun_prefetch_read_many((void *)(addr))
+#elif defined(__GNUC__)
+# define sum_prefetch(addr) __builtin_prefetch((addr), 0, 3)
+#endif
+
+#define CBLOCK_SIZE (64)
+#if !defined(__clang__)
+#pragma unroll(16)
+#endif
+
+/*
+ * Feed the n bytes at s into the running CRC stored in p.
+ * sum->rotate selects the CRCROTATE (shift left) or CRC (shift right)
+ * table step. While more than CBLOCK_SIZE bytes remain, a whole block
+ * is processed and the following block is prefetched; whatever is left
+ * is then handled one byte at a time. Always returns 0.
+ */
+static int
+crc_block(Sum_t* p, const void* s, size_t n)
+{
+	Crc_t* sum = (Crc_t*)p;
+	register Crcnum_t c = sum->sum;
+	register const unsigned char* b = (const unsigned char*)s;
+	register const unsigned char* e = b + n;
+	unsigned short i;
+
+	sum_prefetch(b);
+
+	if (sum->rotate)
+	{
+		while (n > CBLOCK_SIZE)
+		{
+			sum_prefetch(b+CBLOCK_SIZE);
+#if defined(__clang__)
+			#pragma clang loop unroll_count(16)
+#endif
+			for(i=0 ; i < CBLOCK_SIZE ; i++)
+			{
+				CRCROTATE(sum, c, *b++);
+			}
+
+			n-=CBLOCK_SIZE;
+		}
+
+		while (b < e)
+		{
+			CRCROTATE(sum, c, *b++);
+		}
+	}
+	else
+	{
+		while (n > CBLOCK_SIZE)
+		{
+			sum_prefetch(b+CBLOCK_SIZE);
+#if defined(__clang__)
+			#pragma clang loop unroll_count(16)
+#endif
+			for(i=0 ; i < CBLOCK_SIZE ; i++)
+			{
+				CRC(sum, c, *b++);
+			}
+
+			n-=CBLOCK_SIZE;
+		}
+
+		while (b < e)
+		{
+			CRC(sum, c, *b++);
+		}
+	}
+
+	sum->sum = c;
+	return 0;
+}
+#else
 static int
 crc_block(Sum_t* p, const void* s, size_t n)
 {
@@ -158,6 +315,7 @@ crc_block(Sum_t* p, const void* s, size_t n)
 	sum->sum = c;
 	return 0;
 }
+#endif
 
 static int
 crc_done(Sum_t* p)