From 07114edc6f9faaa26364e6b6db1686856be16951 Mon Sep 17 00:00:00 2001 From: Pete Batard Date: Wed, 2 Mar 2016 13:40:37 +0000 Subject: [PATCH] [checksum] more cleanup and optimization * Why... can't I stop... trying to optimize... this thing?!? --- src/checksum.c | 63 ++++++++++++++++++-------------------------------- src/missing.h | 17 ++++++++++++++ src/rufus.rc | 10 ++++---- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/src/checksum.c b/src/checksum.c index 5e6694aa..ac0b8f28 100644 --- a/src/checksum.c +++ b/src/checksum.c @@ -68,36 +68,15 @@ char sum_str[NUM_CHECKSUMS][65]; int bufnum, sum_count[NUM_CHECKSUMS] = { 16, 20, 32 }; HANDLE data_ready[NUM_CHECKSUMS], thread_ready[NUM_CHECKSUMS]; DWORD rSize[2]; -char buffer[2][BUFFER_SIZE]; +char ALIGNED(64) buffer[2][BUFFER_SIZE]; -#if defined(__GNUC__) -#define ALIGNED(m) __attribute__ ((__aligned__(m))) -#elif defined(_MSC_VER) -#define ALIGNED(m) __declspec(align(m)) -#endif - -/* Rotate a 32 bit integer by n bytes */ -#if defined(__GNUC__) && defined(__i386__) -static inline uint32_t rol(uint32_t x, int n) -{ - __asm__("roll %%cl,%0" - :"=r" (x) - :"0" (x),"c" (n)); - return x; -} -#elif defined(_MSC_VER) && (_M_IX86 >= 300) -static __inline uint32_t rol(uint32_t x, int n) -{ - __asm { - mov eax, x - mov ecx, n - rol eax, cl - } - /* returns with result in EAX */ -} -#else -#define rol(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) ) -#endif +/* + * Rotate 32 bit integers by n bytes. + * Don't bother trying to hand-optimize those, as the + * compiler usually does a pretty good job at that. + */ +#define ROL(a,b) (((a) << (b)) | ((a) >> (32-(b)))) +#define ROR(a,b) (((a) >> (b)) | ((a) << (32-(b)))) // For SHA-256 static const uint32_t k[64] = { @@ -154,7 +133,6 @@ static void sha256_init(SUM_CONTEXT *ctx) ctx->state[7] = 0x5be0cd19; } - /* Transform the message X which consists of 16 32-bit-words (SHA-1) */ static void sha1_transform(SUM_CONTEXT *ctx, const unsigned char *data) { @@ -192,10 +170,10 @@ static void sha1_transform(SUM_CONTEXT *ctx, const unsigned char *data) #define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) #define F4(x,y,z) ( x ^ y ^ z ) -#define M(i) ( tm = x[i&0x0f] ^ x[(i-14)&0x0f] ^ x[(i-8)&0x0f] ^ x[(i-3)&0x0f], (x[i&0x0f] = rol(tm,1)) ) +#define M(i) ( tm = x[i&0x0f] ^ x[(i-14)&0x0f] ^ x[(i-8)&0x0f] ^ x[(i-3)&0x0f], (x[i&0x0f] = ROL(tm,1)) ) -#define SHA1STEP(a,b,c,d,e,f,k,m) do { e += rol(a, 5) + f(b, c, d) + k + m; \ - b = rol(b, 30); } while(0) +#define SHA1STEP(a,b,c,d,e,f,k,m) do { e += ROL(a, 5) + f(b, c, d) + k + m; \ + b = ROL(b, 30); } while(0) SHA1STEP(a, b, c, d, e, F1, K1, x[0]); SHA1STEP(e, a, b, c, d, F1, K1, x[1]); SHA1STEP(d, e, a, b, c, F1, K1, x[2]); @@ -303,15 +281,12 @@ static void sha256_transform(SUM_CONTEXT *ctx, const unsigned char *data) g = ctx->state[6]; h = ctx->state[7]; -#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b)))) -#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b)))) - #define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z))) #define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) -#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22)) -#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25)) -#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3)) -#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10)) +#define EP0(x) (ROR(x,2) ^ ROR(x,13) ^ ROR(x,22)) +#define EP1(x) (ROR(x,6) ^ ROR(x,11) ^ ROR(x,25)) +#define SIG0(x) (ROR(x,7) ^ ROR(x,18) ^ ((x) >> 3)) +#define SIG1(x) (ROR(x,17) ^ ROR(x,19) ^ ((x) >> 10)) #ifdef BIG_ENDIAN_HOST @@ -487,6 +462,7 @@ static void sha1_write(SUM_CONTEXT *ctx, const unsigned char *buf, size_t len) } while (len >= 64) { + PREFETCH64(&buf[64]); sha1_transform(ctx, buf); ctx->bytecount = 0; ctx->bitcount += 64 * 8; @@ -505,6 +481,7 @@ static void sha256_write(SUM_CONTEXT *ctx, const unsigned char *buf, size_t len) ctx->buf[ctx->bytecount] = buf[i]; ctx->bytecount++; if (ctx->bytecount == 64) { + PREFETCH64(&buf[i + 64]); sha256_transform(ctx, ctx->buf); ctx->bitcount += 64 * 8; ctx->bytecount = 0; @@ -539,6 +516,7 @@ static void md5_write(SUM_CONTEXT *ctx, const unsigned char *buf, size_t len) /* Process data in 64-byte chunks */ while (len >= 64) { + PREFETCH64(&buf[64]); memcpy(ctx->buf, buf, 64); md5_transform(ctx, ctx->buf); buf += 64; @@ -704,6 +682,11 @@ static void md5_final(SUM_CONTEXT *ctx) #undef X } +// These 'null' calls are useful for testing load balancing and individual algorithm speed +static void null_init(SUM_CONTEXT *ctx) { memset(ctx, 0, sizeof(*ctx)); } +static void null_write(SUM_CONTEXT *ctx, const unsigned char *buf, size_t len) { } +static void null_final(SUM_CONTEXT *ctx) { } + typedef void sum_init_t(SUM_CONTEXT *ctx); typedef void sum_write_t(SUM_CONTEXT *ctx, const unsigned char *buf, size_t len); typedef void sum_final_t(SUM_CONTEXT *ctx); diff --git a/src/missing.h b/src/missing.h index a97b8485..7c5f6333 100644 --- a/src/missing.h +++ b/src/missing.h @@ -35,6 +35,23 @@ #define MIN(a,b) (((a) < (b)) ? (a) : (b)) #endif +#if defined(__GNUC__) +#define ALIGNED(m) __attribute__ ((__aligned__(m))) +#elif defined(_MSC_VER) +#define ALIGNED(m) __declspec(align(m)) +#endif + +/* + * Prefetch 64 bytes at address m, for read-only operation + * We account for these built-in calls doing nothing if the + * line has already been fetched, or if the address is invalid. + */ +#if defined(__GNUC__) +#define PREFETCH64(m) do { __builtin_prefetch(m, 0, 0); __builtin_prefetch(m+32, 0, 0); } while(0) +#elif defined(_MSC_VER) +#define PREFETCH64(m) do { _m_prefetch(m); _m_prefetch(m+32); } while(0) +#endif + #if defined(_MSC_VER) #define bswap_uint64 _byteswap_uint64 #define bswap_uint32 _byteswap_ulong diff --git a/src/rufus.rc b/src/rufus.rc index 1339802f..d22250ff 100644 --- a/src/rufus.rc +++ b/src/rufus.rc @@ -33,7 +33,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL IDD_DIALOG DIALOGEX 12, 12, 242, 376 STYLE DS_SETFONT | DS_MODALFRAME | DS_CENTER | WS_MINIMIZEBOX | WS_POPUP | WS_CAPTION | WS_SYSMENU EXSTYLE WS_EX_ACCEPTFILES -CAPTION "Rufus 2.8.873" +CAPTION "Rufus 2.8.874" FONT 8, "Segoe UI Symbol", 400, 0, 0x0 BEGIN LTEXT "Device",IDS_DEVICE_TXT,9,6,200,8 @@ -320,8 +320,8 @@ END // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,8,873,0 - PRODUCTVERSION 2,8,873,0 + FILEVERSION 2,8,874,0 + PRODUCTVERSION 2,8,874,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L @@ -338,13 +338,13 @@ BEGIN BEGIN VALUE "CompanyName", "Akeo Consulting (http://akeo.ie)" VALUE "FileDescription", "Rufus" - VALUE "FileVersion", "2.8.873" + VALUE "FileVersion", "2.8.874" VALUE "InternalName", "Rufus" VALUE "LegalCopyright", "© 2011-2016 Pete Batard (GPL v3)" VALUE "LegalTrademarks", "http://www.gnu.org/copyleft/gpl.html" VALUE "OriginalFilename", "rufus.exe" VALUE "ProductName", "Rufus" - VALUE "ProductVersion", "2.8.873" + VALUE "ProductVersion", "2.8.874" END END BLOCK "VarFileInfo"