mirror of
https://github.com/pbatard/rufus.git
synced 2024-08-14 23:57:05 +00:00
[checksum] enable x86 acceleration if the CPU supports it
* Newer Intel and AMD CPUs have SSE extensions for SHA-1 and SHA-256 acceleration. * Add new cpu.c/cpu.h sources to detect the extensions, and use them in checksum.c if available. * Acceleration code is taken from https://github.com/noloader/SHA-Intrinsics.
This commit is contained in:
parent
ddcbe8ed81
commit
36f4716afd
9 changed files with 771 additions and 27 deletions
|
@ -349,6 +349,7 @@
|
|||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\src\badblocks.c" />
|
||||
<ClCompile Include="..\src\cpu.c" />
|
||||
<ClCompile Include="..\src\dos_locale.c" />
|
||||
<ClCompile Include="..\src\drive.c" />
|
||||
<ClCompile Include="..\src\format.c" />
|
||||
|
@ -380,6 +381,7 @@
|
|||
<ClInclude Include="..\res\grub\grub_version.h" />
|
||||
<ClInclude Include="..\src\badblocks.h" />
|
||||
<ClInclude Include="..\src\bled\bled.h" />
|
||||
<ClInclude Include="..\src\cpu.h" />
|
||||
<ClInclude Include="..\src\drive.h" />
|
||||
<ClInclude Include="..\src\format.h" />
|
||||
<ClInclude Include="..\src\gpt_types.h" />
|
||||
|
|
|
@ -93,6 +93,9 @@
|
|||
<ClCompile Include="..\src\wue.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\src\cpu.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="..\src\rufus.h">
|
||||
|
@ -191,6 +194,9 @@
|
|||
<ClInclude Include="..\src\vhd.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\src\cpu.h">
|
||||
<Filter>Header Files</Filter>
|
||||
</ClInclude>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="..\res\rufus.ico">
|
||||
|
|
|
@ -14,8 +14,8 @@ AM_V_WINDRES = $(AM_V_WINDRES_$(V))
|
|||
%_rc.o: %.rc ../res/loc/embedded.loc
|
||||
$(AM_V_WINDRES) $(AM_RCFLAGS) -i $< -o $@
|
||||
|
||||
rufus_SOURCES = badblocks.c checksum.c dev.c dos.c dos_locale.c drive.c format.c format_ext.c format_fat32.c icon.c iso.c localization.c \
|
||||
net.c parser.c pki.c process.c re.c rufus.c smart.c stdfn.c stdio.c stdlg.c syslinux.c ui.c vhd.c wue.c
|
||||
rufus_SOURCES = badblocks.c checksum.c cpu.c dev.c dos.c dos_locale.c drive.c format.c format_ext.c format_fat32.c icon.c iso.c \
|
||||
localization.c net.c parser.c pki.c process.c re.c rufus.c smart.c stdfn.c stdio.c stdlg.c syslinux.c ui.c vhd.c wue.c
|
||||
rufus_CFLAGS = -I$(srcdir)/ms-sys/inc -I$(srcdir)/syslinux/libfat -I$(srcdir)/syslinux/libinstaller -I$(srcdir)/syslinux/win -I$(srcdir)/libcdio $(AM_CFLAGS) \
|
||||
-DEXT2_FLAT_INCLUDES=0 -DSOLUTION=rufus
|
||||
rufus_LDFLAGS = $(AM_LDFLAGS) -mwindows -L ../.mingw
|
||||
|
|
|
@ -88,7 +88,7 @@ CONFIG_CLEAN_FILES =
|
|||
CONFIG_CLEAN_VPATH_FILES =
|
||||
PROGRAMS = $(noinst_PROGRAMS)
|
||||
am_rufus_OBJECTS = rufus-badblocks.$(OBJEXT) rufus-checksum.$(OBJEXT) \
|
||||
rufus-dev.$(OBJEXT) rufus-dos.$(OBJEXT) \
|
||||
rufus-cpu.$(OBJEXT) rufus-dev.$(OBJEXT) rufus-dos.$(OBJEXT) \
|
||||
rufus-dos_locale.$(OBJEXT) rufus-drive.$(OBJEXT) \
|
||||
rufus-format.$(OBJEXT) rufus-format_ext.$(OBJEXT) \
|
||||
rufus-format_fat32.$(OBJEXT) rufus-icon.$(OBJEXT) \
|
||||
|
@ -281,8 +281,8 @@ AM_V_WINDRES_0 = @echo " RC $@";$(WINDRES)
|
|||
AM_V_WINDRES_1 = $(WINDRES)
|
||||
AM_V_WINDRES_ = $(AM_V_WINDRES_$(AM_DEFAULT_VERBOSITY))
|
||||
AM_V_WINDRES = $(AM_V_WINDRES_$(V))
|
||||
rufus_SOURCES = badblocks.c checksum.c dev.c dos.c dos_locale.c drive.c format.c format_ext.c format_fat32.c icon.c iso.c localization.c \
|
||||
net.c parser.c pki.c process.c re.c rufus.c smart.c stdfn.c stdio.c stdlg.c syslinux.c ui.c vhd.c wue.c
|
||||
rufus_SOURCES = badblocks.c checksum.c cpu.c dev.c dos.c dos_locale.c drive.c format.c format_ext.c format_fat32.c icon.c iso.c \
|
||||
localization.c net.c parser.c pki.c process.c re.c rufus.c smart.c stdfn.c stdio.c stdlg.c syslinux.c ui.c vhd.c wue.c
|
||||
|
||||
rufus_CFLAGS = -I$(srcdir)/ms-sys/inc -I$(srcdir)/syslinux/libfat -I$(srcdir)/syslinux/libinstaller -I$(srcdir)/syslinux/win -I$(srcdir)/libcdio $(AM_CFLAGS) \
|
||||
-DEXT2_FLAT_INCLUDES=0 -DSOLUTION=rufus
|
||||
|
@ -357,6 +357,12 @@ rufus-checksum.o: checksum.c
|
|||
rufus-checksum.obj: checksum.c
|
||||
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rufus_CFLAGS) $(CFLAGS) -c -o rufus-checksum.obj `if test -f 'checksum.c'; then $(CYGPATH_W) 'checksum.c'; else $(CYGPATH_W) '$(srcdir)/checksum.c'; fi`
|
||||
|
||||
rufus-cpu.o: cpu.c
|
||||
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rufus_CFLAGS) $(CFLAGS) -c -o rufus-cpu.o `test -f 'cpu.c' || echo '$(srcdir)/'`cpu.c
|
||||
|
||||
rufus-cpu.obj: cpu.c
|
||||
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rufus_CFLAGS) $(CFLAGS) -c -o rufus-cpu.obj `if test -f 'cpu.c'; then $(CYGPATH_W) 'cpu.c'; else $(CYGPATH_W) '$(srcdir)/cpu.c'; fi`
|
||||
|
||||
rufus-dev.o: dev.c
|
||||
$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rufus_CFLAGS) $(CFLAGS) -c -o rufus-dev.o `test -f 'dev.c' || echo '$(srcdir)/'`dev.c
|
||||
|
||||
|
|
554
src/checksum.c
554
src/checksum.c
|
@ -6,6 +6,7 @@
|
|||
* Copyright © 2004 g10 Code GmbH
|
||||
* Copyright © 2002-2015 Wei Dai & Igor Pavlov
|
||||
* Copyright © 2015-2021 Pete Batard <pete@akeo.ie>
|
||||
* Copyright © 2022 Jeffrey Walton <noloader@gmail.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -28,6 +29,8 @@
|
|||
*
|
||||
* SHA-512 modified from LibTomCrypt - Public Domain
|
||||
*
|
||||
* CPU accelerated SHA code taken from SHA-Intrinsics - Public Domain
|
||||
*
|
||||
* MD5 code from various public domain sources sharing the following
|
||||
* copyright declaration:
|
||||
*
|
||||
|
@ -61,6 +64,7 @@
|
|||
#include <windowsx.h>
|
||||
|
||||
#include "db.h"
|
||||
#include "cpu.h"
|
||||
#include "rufus.h"
|
||||
#include "winio.h"
|
||||
#include "missing.h"
|
||||
|
@ -68,6 +72,22 @@
|
|||
#include "msapi_utf8.h"
|
||||
#include "localization.h"
|
||||
|
||||
/* Includes for SHA-1 and SHA-256 intrinsics */
|
||||
#if defined(CPU_X86_SHA1_ACCELERATION) || defined(CPU_X86_SHA256_ACCELERATION)
|
||||
#if defined(_MSC_VER)
|
||||
#include <immintrin.h>
|
||||
#elif defined(__GNUC__)
|
||||
#include <stdint.h>
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define RUFUS_ENABLE_GCC_ARCH(arch)
|
||||
#else
|
||||
#define RUFUS_ENABLE_GCC_ARCH(arch) __attribute__ ((target (arch)))
|
||||
#endif
|
||||
|
||||
#undef BIG_ENDIAN_HOST
|
||||
|
||||
#define BUFFER_SIZE (64*KB)
|
||||
|
@ -208,7 +228,7 @@ static void sha512_init(SUM_CONTEXT* ctx)
|
|||
}
|
||||
|
||||
/* Transform the message X which consists of 16 32-bit-words (SHA-1) */
|
||||
static void sha1_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
static void sha1_transform_cc(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
{
|
||||
uint32_t a, b, c, d, e, tm, x[16];
|
||||
|
||||
|
@ -341,8 +361,235 @@ static void sha1_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
|||
ctx->state[4] += e;
|
||||
}
|
||||
|
||||
#ifdef CPU_X86_SHA1_ACCELERATION
|
||||
/*
|
||||
* Transform the message X which consists of 16 32-bit-words (SHA-1)
|
||||
* The code is public domain taken from https://github.com/noloader/SHA-Intrinsics.
|
||||
*/
|
||||
RUFUS_ENABLE_GCC_ARCH("ssse3,sse4.1,sha")
|
||||
static void sha1_transform_x86(uint64_t state64[5], const uint8_t *data, size_t length)
|
||||
{
|
||||
__m128i ABCD, E0, E1;
|
||||
__m128i MSG0, MSG1, MSG2, MSG3;
|
||||
const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
|
||||
|
||||
/* Rufus uses uint64_t for the state array. Pack it into uint32_t. */
|
||||
uint32_t state[5] = {
|
||||
(uint32_t)state64[0],
|
||||
(uint32_t)state64[1],
|
||||
(uint32_t)state64[2],
|
||||
(uint32_t)state64[3],
|
||||
(uint32_t)state64[4]
|
||||
};
|
||||
|
||||
/* Load initial values */
|
||||
ABCD = _mm_loadu_si128((const __m128i*) state);
|
||||
E0 = _mm_set_epi32(state[4], 0, 0, 0);
|
||||
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
|
||||
|
||||
while (length >= SHA1_BLOCKSIZE)
|
||||
{
|
||||
/* Save current state */
|
||||
const __m128i ABCD_SAVE = ABCD;
|
||||
const __m128i E0_SAVE = E0;
|
||||
|
||||
/* Rounds 0-3 */
|
||||
MSG0 = _mm_loadu_si128((const __m128i*)(data + 0));
|
||||
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
|
||||
E0 = _mm_add_epi32(E0, MSG0);
|
||||
E1 = ABCD;
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
|
||||
|
||||
/* Rounds 4-7 */
|
||||
MSG1 = _mm_loadu_si128((const __m128i*)(data + 16));
|
||||
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG1);
|
||||
E0 = ABCD;
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
|
||||
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
|
||||
|
||||
/* Rounds 8-11 */
|
||||
MSG2 = _mm_loadu_si128((const __m128i*)(data + 32));
|
||||
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG2);
|
||||
E1 = ABCD;
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
|
||||
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
|
||||
MSG0 = _mm_xor_si128(MSG0, MSG2);
|
||||
|
||||
/* Rounds 12-15 */
|
||||
MSG3 = _mm_loadu_si128((const __m128i*)(data + 48));
|
||||
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG3);
|
||||
E0 = ABCD;
|
||||
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
|
||||
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
|
||||
MSG1 = _mm_xor_si128(MSG1, MSG3);
|
||||
|
||||
/* Rounds 16-19 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG0);
|
||||
E1 = ABCD;
|
||||
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
|
||||
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
|
||||
MSG2 = _mm_xor_si128(MSG2, MSG0);
|
||||
|
||||
/* Rounds 20-23 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG1);
|
||||
E0 = ABCD;
|
||||
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
|
||||
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
|
||||
MSG3 = _mm_xor_si128(MSG3, MSG1);
|
||||
|
||||
/* Rounds 24-27 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG2);
|
||||
E1 = ABCD;
|
||||
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
|
||||
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
|
||||
MSG0 = _mm_xor_si128(MSG0, MSG2);
|
||||
|
||||
/* Rounds 28-31 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG3);
|
||||
E0 = ABCD;
|
||||
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
|
||||
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
|
||||
MSG1 = _mm_xor_si128(MSG1, MSG3);
|
||||
|
||||
/* Rounds 32-35 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG0);
|
||||
E1 = ABCD;
|
||||
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
|
||||
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
|
||||
MSG2 = _mm_xor_si128(MSG2, MSG0);
|
||||
|
||||
/* Rounds 36-39 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG1);
|
||||
E0 = ABCD;
|
||||
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
|
||||
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
|
||||
MSG3 = _mm_xor_si128(MSG3, MSG1);
|
||||
|
||||
/* Rounds 40-43 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG2);
|
||||
E1 = ABCD;
|
||||
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
|
||||
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
|
||||
MSG0 = _mm_xor_si128(MSG0, MSG2);
|
||||
|
||||
/* Rounds 44-47 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG3);
|
||||
E0 = ABCD;
|
||||
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
|
||||
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
|
||||
MSG1 = _mm_xor_si128(MSG1, MSG3);
|
||||
|
||||
/* Rounds 48-51 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG0);
|
||||
E1 = ABCD;
|
||||
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
|
||||
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
|
||||
MSG2 = _mm_xor_si128(MSG2, MSG0);
|
||||
|
||||
/* Rounds 52-55 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG1);
|
||||
E0 = ABCD;
|
||||
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
|
||||
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
|
||||
MSG3 = _mm_xor_si128(MSG3, MSG1);
|
||||
|
||||
/* Rounds 56-59 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG2);
|
||||
E1 = ABCD;
|
||||
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
|
||||
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
|
||||
MSG0 = _mm_xor_si128(MSG0, MSG2);
|
||||
|
||||
/* Rounds 60-63 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG3);
|
||||
E0 = ABCD;
|
||||
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
|
||||
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
|
||||
MSG1 = _mm_xor_si128(MSG1, MSG3);
|
||||
|
||||
/* Rounds 64-67 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG0);
|
||||
E1 = ABCD;
|
||||
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
|
||||
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
|
||||
MSG2 = _mm_xor_si128(MSG2, MSG0);
|
||||
|
||||
/* Rounds 68-71 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG1);
|
||||
E0 = ABCD;
|
||||
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
|
||||
MSG3 = _mm_xor_si128(MSG3, MSG1);
|
||||
|
||||
/* Rounds 72-75 */
|
||||
E0 = _mm_sha1nexte_epu32(E0, MSG2);
|
||||
E1 = ABCD;
|
||||
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
|
||||
|
||||
/* Rounds 76-79 */
|
||||
E1 = _mm_sha1nexte_epu32(E1, MSG3);
|
||||
E0 = ABCD;
|
||||
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
|
||||
|
||||
/* Combine state */
|
||||
E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
|
||||
ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
|
||||
|
||||
data += 64;
|
||||
length -= 64;
|
||||
}
|
||||
|
||||
/* Save state */
|
||||
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
|
||||
_mm_storeu_si128((__m128i*) state, ABCD);
|
||||
state[4] = _mm_extract_epi32(E0, 3);
|
||||
|
||||
/* Repack into uint64_t. */
|
||||
state64[0] = state[0];
|
||||
state64[1] = state[1];
|
||||
state64[2] = state[2];
|
||||
state64[3] = state[3];
|
||||
state64[4] = state[4];
|
||||
}
|
||||
#endif /* CPU_X86_SHA1_ACCELERATION */
|
||||
|
||||
/* Transform the message X which consists of 16 32-bit-words (SHA-1) */
|
||||
static void sha1_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
{
|
||||
#ifdef CPU_X86_SHA1_ACCELERATION
|
||||
if (cpu_has_sha1_accel)
|
||||
{
|
||||
/* SHA-1 acceleration using intrinsics */
|
||||
sha1_transform_x86(ctx->state, data, SHA1_BLOCKSIZE);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* Portable C/C++ implementation */
|
||||
sha1_transform_cc(ctx, data);
|
||||
}
|
||||
}
|
||||
|
||||
/* Transform the message X which consists of 16 32-bit-words (SHA-256) */
|
||||
static __inline void sha256_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
static __inline void sha256_transform_cc(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
{
|
||||
uint32_t a, b, c, d, e, f, g, h, j, x[16];
|
||||
|
||||
|
@ -415,6 +662,243 @@ static __inline void sha256_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
|||
ctx->state[7] += h;
|
||||
}
|
||||
|
||||
#ifdef CPU_X86_SHA256_ACCELERATION
|
||||
/*
|
||||
* Transform the message X which consists of 16 32-bit-words (SHA-256)
|
||||
* The code is public domain taken from https://github.com/noloader/SHA-Intrinsics.
|
||||
*/
|
||||
RUFUS_ENABLE_GCC_ARCH("ssse3,sse4.1,sha")
|
||||
static __inline void sha256_transform_x86(uint64_t state64[8], const uint8_t *data, size_t length)
|
||||
{
|
||||
__m128i STATE0, STATE1;
|
||||
__m128i MSG, TMP;
|
||||
__m128i MSG0, MSG1, MSG2, MSG3;
|
||||
const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
/* Rufus uses uint64_t for the state array. Pack it into uint32_t. */
|
||||
uint32_t state[8] = {
|
||||
(uint32_t)state64[0],
|
||||
(uint32_t)state64[1],
|
||||
(uint32_t)state64[2],
|
||||
(uint32_t)state64[3],
|
||||
(uint32_t)state64[4],
|
||||
(uint32_t)state64[5],
|
||||
(uint32_t)state64[6],
|
||||
(uint32_t)state64[7]
|
||||
};
|
||||
|
||||
/* Load initial values */
|
||||
TMP = _mm_loadu_si128((const __m128i*) (state+0));
|
||||
STATE1 = _mm_loadu_si128((const __m128i*) (state+4));
|
||||
|
||||
TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */
|
||||
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */
|
||||
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */
|
||||
|
||||
while (length >= SHA256_BLOCKSIZE)
|
||||
{
|
||||
/* Save current state */
|
||||
const __m128i ABEF_SAVE = STATE0;
|
||||
const __m128i CDGH_SAVE = STATE1;
|
||||
|
||||
/* Rounds 0-3 */
|
||||
MSG = _mm_loadu_si128((const __m128i*) (data+0));
|
||||
MSG0 = _mm_shuffle_epi8(MSG, MASK);
|
||||
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
/* Rounds 4-7 */
|
||||
MSG1 = _mm_loadu_si128((const __m128i*) (data+16));
|
||||
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
|
||||
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
||||
|
||||
/* Rounds 8-11 */
|
||||
MSG2 = _mm_loadu_si128((const __m128i*) (data+32));
|
||||
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
|
||||
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
||||
|
||||
/* Rounds 12-15 */
|
||||
MSG3 = _mm_loadu_si128((const __m128i*) (data+48));
|
||||
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
|
||||
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
||||
MSG0 = _mm_add_epi32(MSG0, TMP);
|
||||
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
||||
|
||||
/* Rounds 16-19 */
|
||||
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
||||
MSG1 = _mm_add_epi32(MSG1, TMP);
|
||||
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
||||
|
||||
/* Rounds 20-23 */
|
||||
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
||||
MSG2 = _mm_add_epi32(MSG2, TMP);
|
||||
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
||||
|
||||
/* Rounds 24-27 */
|
||||
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
||||
MSG3 = _mm_add_epi32(MSG3, TMP);
|
||||
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
||||
|
||||
/* Rounds 28-31 */
|
||||
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
||||
MSG0 = _mm_add_epi32(MSG0, TMP);
|
||||
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
||||
|
||||
/* Rounds 32-35 */
|
||||
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
||||
MSG1 = _mm_add_epi32(MSG1, TMP);
|
||||
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
||||
|
||||
/* Rounds 36-39 */
|
||||
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
||||
MSG2 = _mm_add_epi32(MSG2, TMP);
|
||||
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
||||
|
||||
/* Rounds 40-43 */
|
||||
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
||||
MSG3 = _mm_add_epi32(MSG3, TMP);
|
||||
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
||||
|
||||
/* Rounds 44-47 */
|
||||
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
||||
MSG0 = _mm_add_epi32(MSG0, TMP);
|
||||
MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
||||
|
||||
/* Rounds 48-51 */
|
||||
MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
||||
MSG1 = _mm_add_epi32(MSG1, TMP);
|
||||
MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
||||
|
||||
/* Rounds 52-55 */
|
||||
MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
||||
MSG2 = _mm_add_epi32(MSG2, TMP);
|
||||
MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
/* Rounds 56-59 */
|
||||
MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
||||
MSG3 = _mm_add_epi32(MSG3, TMP);
|
||||
MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
/* Rounds 60-63 */
|
||||
MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
||||
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
||||
MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
||||
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
||||
|
||||
/* Combine state */
|
||||
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
||||
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
||||
|
||||
data += 64;
|
||||
length -= 64;
|
||||
}
|
||||
|
||||
TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */
|
||||
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */
|
||||
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
|
||||
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */
|
||||
|
||||
/* Save state */
|
||||
_mm_storeu_si128((__m128i*) (state+0), STATE0);
|
||||
_mm_storeu_si128((__m128i*) (state+4), STATE1);
|
||||
|
||||
/* Repack into uint64_t. */
|
||||
state64[0] = state[0];
|
||||
state64[1] = state[1];
|
||||
state64[2] = state[2];
|
||||
state64[3] = state[3];
|
||||
state64[4] = state[4];
|
||||
state64[5] = state[5];
|
||||
state64[6] = state[6];
|
||||
state64[7] = state[7];
|
||||
}
|
||||
#endif /* CPU_X86_SHA256_ACCELERATION */
|
||||
|
||||
static __inline void sha256_transform(SUM_CONTEXT *ctx, const uint8_t *data)
|
||||
{
|
||||
#ifdef CPU_X86_SHA256_ACCELERATION
|
||||
if (cpu_has_sha256_accel)
|
||||
{
|
||||
/* SHA-256 acceleration using intrinsics */
|
||||
sha256_transform_x86(ctx->state, data, SHA256_BLOCKSIZE);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* Portable C/C++ implementation */
|
||||
sha256_transform_cc(ctx, data);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Transform the message X which consists of 16 64-bit-words (SHA-512)
|
||||
* This is an algorithm that *REALLY* benefits from being executed as 64-bit
|
||||
|
@ -618,12 +1102,29 @@ static void sha1_write(SUM_CONTEXT *ctx, const uint8_t *buf, size_t len)
|
|||
len -= num;
|
||||
}
|
||||
|
||||
/* Process data in blocksize chunks */
|
||||
while (len >= SHA1_BLOCKSIZE) {
|
||||
PREFETCH64(buf + SHA1_BLOCKSIZE);
|
||||
sha1_transform(ctx, buf);
|
||||
buf += SHA1_BLOCKSIZE;
|
||||
len -= SHA1_BLOCKSIZE;
|
||||
#ifdef CPU_X86_SHA1_ACCELERATION
|
||||
if (cpu_has_sha1_accel)
|
||||
{
|
||||
/* Process all full blocks at once */
|
||||
if (len >= SHA1_BLOCKSIZE) {
|
||||
/* Calculate full blocks, in bytes */
|
||||
num = (len / SHA1_BLOCKSIZE) * SHA1_BLOCKSIZE;
|
||||
/* SHA-1 acceleration using intrinsics */
|
||||
sha1_transform_x86(ctx->state, buf, num);
|
||||
buf += num;
|
||||
len -= num;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* Process data in blocksize chunks */
|
||||
while (len >= SHA1_BLOCKSIZE) {
|
||||
PREFETCH64(buf + SHA1_BLOCKSIZE);
|
||||
sha1_transform(ctx, buf);
|
||||
buf += SHA1_BLOCKSIZE;
|
||||
len -= SHA1_BLOCKSIZE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle any remaining bytes of data. */
|
||||
|
@ -653,12 +1154,29 @@ static void sha256_write(SUM_CONTEXT *ctx, const uint8_t *buf, size_t len)
|
|||
len -= num;
|
||||
}
|
||||
|
||||
/* Process data in blocksize chunks */
|
||||
while (len >= SHA256_BLOCKSIZE) {
|
||||
PREFETCH64(buf + SHA256_BLOCKSIZE);
|
||||
sha256_transform(ctx, buf);
|
||||
buf += SHA256_BLOCKSIZE;
|
||||
len -= SHA256_BLOCKSIZE;
|
||||
#ifdef CPU_X86_SHA256_ACCELERATION
|
||||
if (cpu_has_sha256_accel)
|
||||
{
|
||||
/* Process all full blocks at once */
|
||||
if (len >= SHA256_BLOCKSIZE) {
|
||||
/* Calculate full blocks, in bytes */
|
||||
num = (len / SHA256_BLOCKSIZE) * SHA256_BLOCKSIZE;
|
||||
/* SHA-256 acceleration using intrinsics */
|
||||
sha256_transform_x86(ctx->state, buf, num);
|
||||
buf += num;
|
||||
len -= num;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* Process data in blocksize chunks */
|
||||
while (len >= SHA256_BLOCKSIZE) {
|
||||
PREFETCH64(buf + SHA256_BLOCKSIZE);
|
||||
sha256_transform(ctx, buf);
|
||||
buf += SHA256_BLOCKSIZE;
|
||||
len -= SHA256_BLOCKSIZE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle any remaining bytes of data. */
|
||||
|
@ -1280,7 +1798,7 @@ BOOL IsFileInDB(const char* path)
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
#if defined(_DEBUG)
|
||||
#if defined(_DEBUG) || defined(TEST) || defined(ALPHA)
|
||||
/* Convert a lowercase hex string to binary. Returned value must be freed */
|
||||
uint8_t* to_bin(const char* str)
|
||||
{
|
||||
|
@ -1303,7 +1821,7 @@ uint8_t* to_bin(const char* str)
|
|||
return ret;
|
||||
}
|
||||
|
||||
const char* test_msg = "Did you ever hear the tragedy of Darth Plagueis The Wise? "
|
||||
const char test_msg[] = "Did you ever hear the tragedy of Darth Plagueis The Wise? "
|
||||
"I thought not. It's not a story the Jedi would tell you. It's a Sith legend. "
|
||||
"Darth Plagueis was a Dark Lord of the Sith, so powerful and so wise he could "
|
||||
"use the Force to influence the midichlorians to create life... He had such a "
|
||||
|
@ -1356,6 +1874,10 @@ int TestChecksum(void)
|
|||
if (msg == NULL)
|
||||
return -1;
|
||||
|
||||
/* Display accelerations available */
|
||||
uprintf("SHA1 acceleration: %s", (cpu_has_sha1_accel ? "TRUE" : "FALSE"));
|
||||
uprintf("SHA256 acceleration: %s", (cpu_has_sha256_accel ? "TRUE" : "FALSE"));
|
||||
|
||||
for (j = 0; j < CHECKSUM_MAX; j++) {
|
||||
size_t copy_msg_len[4];
|
||||
copy_msg_len[0] = 0;
|
||||
|
|
123
src/cpu.c
Normal file
123
src/cpu.c
Normal file
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Rufus: The Reliable USB Formatting Utility
|
||||
* CPU features detection
|
||||
* Copyright © 2022 Pete Batard <pete@akeo.ie>
|
||||
* Copyright © 2022 Jeffrey Walton <noloader@gmail.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "cpu.h"
|
||||
|
||||
#if (defined(CPU_X86_SHA1_ACCELERATION) || defined(CPU_X86_SHA256_ACCELERATION))
|
||||
#if defined(RUFUS_MSC_VERSION)
|
||||
#include <intrin.h>
|
||||
#elif (defined(RUFUS_GCC_VERSION) || defined(RUFUS_CLANG_VERSION))
|
||||
#include <x86Intrin.h>
|
||||
#elif defined(RUFUS_INTEL_VERSION)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
BOOL cpu_has_sha1_accel = FALSE;
|
||||
BOOL cpu_has_sha256_accel = FALSE;
|
||||
|
||||
/*
|
||||
* Three elements must be in place to make a meaningful call to the
|
||||
* DetectSHA###Acceleration() calls. First, the compiler must support
|
||||
* the underlying intrinsics. Second, the platform must provide a
|
||||
* cpuid() function. And third, the cpu must actually support the SHA-1
|
||||
* and SHA-256 instructions.
|
||||
*
|
||||
* If any of the conditions are not met, then DetectSHA###Acceleration()
|
||||
* returns FALSE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Detect if the processor supports SHA-1 acceleration. We only check for
|
||||
* the three ISAs we need - SSSE3, SSE4.1 and SHA. We don't check for OS
|
||||
* support or XSAVE because that's been enabled since Windows 2000.
|
||||
*/
|
||||
BOOL DetectSHA1Acceleration(void)
|
||||
{
|
||||
#if defined(CPU_X86_SHA1_ACCELERATION)
|
||||
#if defined(_MSC_VER)
|
||||
uint32_t regs0[4] = {0,0,0,0}, regs1[4] = {0,0,0,0}, regs7[4] = {0,0,0,0};
|
||||
const uint32_t SSSE3_BIT = 1u << 9; /* Function 1, Bit 9 of ECX */
|
||||
const uint32_t SSE41_BIT = 1u << 19; /* Function 1, Bit 19 of ECX */
|
||||
const uint32_t SHA_BIT = 1u << 29; /* Function 7, Bit 29 of EBX */
|
||||
|
||||
__cpuid(regs0, 0);
|
||||
const uint32_t highest = regs0[0]; /*EAX*/
|
||||
|
||||
if (highest >= 0x01) {
|
||||
__cpuidex(regs1, 1, 0);
|
||||
}
|
||||
if (highest >= 0x07) {
|
||||
__cpuidex(regs7, 7, 0);
|
||||
}
|
||||
|
||||
return (regs1[2] /*ECX*/ & SSSE3_BIT) && (regs1[2] /*ECX*/ & SSE41_BIT) && (regs7[1] /*EBX*/ & SHA_BIT) ? TRUE : FALSE;
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
/* __builtin_cpu_supports available in GCC 4.8.1 and above */
|
||||
return __builtin_cpu_supports("ssse3") && __builtin_cpu_supports("sse4.1") && __builtin_cpu_supports("sha") ? TRUE : FALSE;
|
||||
#elif defined(__INTEL_COMPILER)
|
||||
/* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_may_i_use_cpu_feature */
|
||||
return _may_i_use_cpu_feature(_FEATURE_SSSE3|_FEATURE_SSE4_1|_FEATURE_SHA) ? TRUE : FALSE;
|
||||
#else
|
||||
return FALSE;
|
||||
#endif
|
||||
#else
|
||||
return FALSE;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Detect if the processor supports SHA-256 acceleration. We only check for
|
||||
* the three ISAs we need - SSSE3, SSE4.1 and SHA. We don't check for OS
|
||||
* support or XSAVE because that's been enabled since Windows 2000.
|
||||
*/
|
||||
BOOL DetectSHA256Acceleration(void)
|
||||
{
|
||||
#if defined(CPU_X86_SHA256_ACCELERATION)
|
||||
#if defined(_MSC_VER)
|
||||
uint32_t regs0[4] = {0,0,0,0}, regs1[4] = {0,0,0,0}, regs7[4] = {0,0,0,0};
|
||||
const uint32_t SSSE3_BIT = 1u << 9; /* Function 1, Bit 9 of ECX */
|
||||
const uint32_t SSE41_BIT = 1u << 19; /* Function 1, Bit 19 of ECX */
|
||||
const uint32_t SHA_BIT = 1u << 29; /* Function 7, Bit 29 of EBX */
|
||||
|
||||
__cpuid(regs0, 0);
|
||||
const uint32_t highest = regs0[0]; /*EAX*/
|
||||
|
||||
if (highest >= 0x01) {
|
||||
__cpuidex(regs1, 1, 0);
|
||||
}
|
||||
if (highest >= 0x07) {
|
||||
__cpuidex(regs7, 7, 0);
|
||||
}
|
||||
|
||||
return (regs1[2] /*ECX*/ & SSSE3_BIT) && (regs1[2] /*ECX*/ & SSE41_BIT) && (regs7[1] /*EBX*/ & SHA_BIT) ? TRUE : FALSE;
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
/* __builtin_cpu_supports available in GCC 4.8.1 and above */
|
||||
return __builtin_cpu_supports("ssse3") && __builtin_cpu_supports("sse4.1") && __builtin_cpu_supports("sha") ? TRUE : FALSE;
|
||||
#elif defined(__INTEL_COMPILER)
|
||||
/* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_may_i_use_cpu_feature */
|
||||
return _may_i_use_cpu_feature(_FEATURE_SSSE3|_FEATURE_SSE4_1|_FEATURE_SHA) ? TRUE : FALSE;
|
||||
#else
|
||||
return FALSE;
|
||||
#endif
|
||||
#else
|
||||
return FALSE;
|
||||
#endif
|
||||
}
|
79
src/cpu.h
Normal file
79
src/cpu.h
Normal file
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Rufus: The Reliable USB Formatting Utility
|
||||
* CPU features detection
|
||||
* Copyright © 2022 Pete Batard <pete@akeo.ie>
|
||||
* Copyright © 2022 Jeffrey Walton <noloader@gmail.com>
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Primarily added to support SHA instructions on x86 machines.
|
||||
* SHA acceleration is becoming as ubiquitous as AES acceleration.
|
||||
* SHA support was introduced in Intel Goldmont architecture, like
|
||||
* Celeron J3455 and Pentium J4205. The instructions are now present
|
||||
* in AMD Ryzen 3 (Zen architecture) and above, and Intel Core
|
||||
* 10th-gen processors (Ice Lake), 11th-gen processors (Rocket Lake)
|
||||
* and above.
|
||||
*
|
||||
* Typical benchmarks for x86 SHA acceleration is about a 6x to 10x
|
||||
* speedup over a C/C++ implementation. The rough measurements are
|
||||
* 1.0 to 1.8 cpb for SHA-1, and 1.5 to 2.5 cpb for SHA-256. On a
|
||||
* Celeron J3455, that's 1.1 GB/s for SHA-1 and 800 MB/s for SHA-256.
|
||||
* On a 10th-gen Core i5, that's about 1.65 GB/s for SHA-1 and about
|
||||
* 1.3 GB/s for SHA-256.
|
||||
*/
|
||||
|
||||
#include "rufus.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define RUFUS_MSC_VERSION (_MSC_VER)
|
||||
#if (RUFUS_MSC_VERSION < 1900)
|
||||
#error "Your compiler is too old to build this application"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#define RUFUS_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#if (RUFUS_GCC_VERSION < 40900)
|
||||
#error "Your compiler is too old to build this application"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
#define RUFUS_INTEL_VERSION (__INTEL_COMPILER)
|
||||
#if (RUFUS_INTEL_VERSION < 1600)
|
||||
#error "Your compiler is too old to build this application"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#define RUFUS_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
|
||||
#if (RUFUS_CLANG_VERSION < 30400)
|
||||
#error "Your compiler is too old to build this application"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__i386) || \
|
||||
defined(_X86_) || defined(__I86__) || defined(__x86_64__))
|
||||
#define CPU_X86_SHA1_ACCELERATION 1
|
||||
#define CPU_X86_SHA256_ACCELERATION 1
|
||||
#endif
|
||||
|
||||
extern BOOL cpu_has_sha1_accel, cpu_has_sha256_accel;
|
||||
|
||||
extern BOOL DetectSHA1Acceleration(void);
|
||||
extern BOOL DetectSHA256Acceleration(void);
|
|
@ -47,6 +47,7 @@
|
|||
|
||||
#include "ui.h"
|
||||
#include "re.h"
|
||||
#include "cpu.h"
|
||||
#include "vhd.h"
|
||||
#include "wue.h"
|
||||
#include "drive.h"
|
||||
|
@ -3688,6 +3689,10 @@ skip_args_processing:
|
|||
uprintf("Failed to enable AutoMount");
|
||||
}
|
||||
|
||||
// Detect CPU acceleration for SHA-1/SHA-256
|
||||
cpu_has_sha1_accel = DetectSHA1Acceleration();
|
||||
cpu_has_sha256_accel = DetectSHA256Acceleration();
|
||||
|
||||
relaunch:
|
||||
ubprintf("Localization set to '%s'", selected_locale->txt[0]);
|
||||
right_to_left_mode = ((selected_locale->ctrl_id) & LOC_RIGHT_TO_LEFT);
|
||||
|
@ -3774,7 +3779,8 @@ relaunch:
|
|||
SendMessage(hMainDialog, WM_COMMAND, IDC_LOG, 0);
|
||||
continue;
|
||||
}
|
||||
#if defined(_DEBUG) || defined(TEST)
|
||||
#if defined(_DEBUG) || defined(TEST) || defined(ALPHA)
|
||||
extern int TestChecksum(void);
|
||||
// Ctrl-T => Alternate Test mode that doesn't require a full rebuild
|
||||
if ((ctrl_without_focus || ((GetKeyState(VK_CONTROL) & 0x8000) && (msg.message == WM_KEYDOWN)))
|
||||
&& (msg.wParam == 'T')) {
|
||||
|
|
10
src/rufus.rc
10
src/rufus.rc
|
@ -33,7 +33,7 @@ LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL
|
|||
IDD_DIALOG DIALOGEX 12, 12, 232, 326
|
||||
STYLE DS_SETFONT | DS_MODALFRAME | DS_CENTER | WS_MINIMIZEBOX | WS_POPUP | WS_CAPTION | WS_SYSMENU
|
||||
EXSTYLE WS_EX_ACCEPTFILES
|
||||
CAPTION "Rufus 3.21.1949"
|
||||
CAPTION "Rufus 3.21.1950"
|
||||
FONT 9, "Segoe UI Symbol", 400, 0, 0x0
|
||||
BEGIN
|
||||
LTEXT "Drive Properties",IDS_DRIVE_PROPERTIES_TXT,8,6,53,12,NOT WS_GROUP
|
||||
|
@ -396,8 +396,8 @@ END
|
|||
//
|
||||
|
||||
VS_VERSION_INFO VERSIONINFO
|
||||
FILEVERSION 3,21,1949,0
|
||||
PRODUCTVERSION 3,21,1949,0
|
||||
FILEVERSION 3,21,1950,0
|
||||
PRODUCTVERSION 3,21,1950,0
|
||||
FILEFLAGSMASK 0x3fL
|
||||
#ifdef _DEBUG
|
||||
FILEFLAGS 0x1L
|
||||
|
@ -415,13 +415,13 @@ BEGIN
|
|||
VALUE "Comments", "https://rufus.ie"
|
||||
VALUE "CompanyName", "Akeo Consulting"
|
||||
VALUE "FileDescription", "Rufus"
|
||||
VALUE "FileVersion", "3.21.1949"
|
||||
VALUE "FileVersion", "3.21.1950"
|
||||
VALUE "InternalName", "Rufus"
|
||||
VALUE "LegalCopyright", "© 2011-2022 Pete Batard (GPL v3)"
|
||||
VALUE "LegalTrademarks", "https://www.gnu.org/licenses/gpl-3.0.html"
|
||||
VALUE "OriginalFilename", "rufus-3.21.exe"
|
||||
VALUE "ProductName", "Rufus"
|
||||
VALUE "ProductVersion", "3.21.1949"
|
||||
VALUE "ProductVersion", "3.21.1950"
|
||||
END
|
||||
END
|
||||
BLOCK "VarFileInfo"
|
||||
|
|
Loading…
Reference in a new issue