mirror of
				https://git.wownero.com/wownero/RandomWOW.git
				synced 2024-08-15 00:23:14 +00:00 
			
		
		
		
	Optimized Argon2 (SSSE3/AVX2)
This commit is contained in:
		
							parent
							
								
									298cc77095
								
							
						
					
					
						commit
						900a936816
					
				
					 17 changed files with 886 additions and 255 deletions
				
			
		|  | @ -31,6 +31,8 @@ cmake_minimum_required(VERSION 2.8.7) | |||
| set (randomx_sources | ||||
| src/aes_hash.cpp | ||||
| src/argon2_ref.c | ||||
| src/argon2_sse3.c | ||||
| src/argon2_avx2.c | ||||
| src/bytecode_machine.cpp | ||||
| src/dataset.cpp | ||||
| src/soft_aes.cpp | ||||
|  | @ -103,6 +105,14 @@ if (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL " | |||
|   else() | ||||
|     # default build has hardware AES enabled (software AES can be selected at runtime) | ||||
|     add_flag("-maes") | ||||
|     check_c_compiler_flag(-mssse3 HAVE_SSSE3) | ||||
|     if(HAVE_SSSE3) | ||||
|       set_source_files_properties(src/argon2_sse3.c COMPILE_FLAGS -mssse3) | ||||
|     endif() | ||||
|     check_c_compiler_flag(-mavx2 HAVE_AVX2) | ||||
|     if(HAVE_AVX2) | ||||
|       set_source_files_properties(src/argon2_avx2.c COMPILE_FLAGS -mavx2) | ||||
|     endif() | ||||
|   endif() | ||||
| endif() | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										32
									
								
								src/argon2.h
									
										
									
									
									
								
							
							
						
						
									
										32
									
								
								src/argon2.h
									
										
									
									
									
								
							|  | @ -227,3 +227,35 @@ typedef enum Argon2_version { | |||
| 	ARGON2_VERSION_13 = 0x13, | ||||
| 	ARGON2_VERSION_NUMBER = ARGON2_VERSION_13 | ||||
| } argon2_version; | ||||
| 
 | ||||
| //Argon2 instance - forward declaration
 | ||||
| typedef struct Argon2_instance_t argon2_instance_t; | ||||
| 
 | ||||
| //Argon2 position = forward declaration
 | ||||
| typedef struct Argon2_position_t argon2_position_t; | ||||
| 
 | ||||
| //Argon2 implementation function
 | ||||
| typedef void randomx_argon2_impl(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position); | ||||
| 
 | ||||
| #if defined(__cplusplus) | ||||
| extern "C" { | ||||
| #endif | ||||
| 
 | ||||
| /*
 | ||||
|  * Function that fills the segment using previous segments also from other | ||||
|  * threads | ||||
|  * @param context current context | ||||
|  * @param instance Pointer to the current instance | ||||
|  * @param position Current position | ||||
|  * @pre all block pointers must be valid | ||||
|  */ | ||||
| void randomx_argon2_fill_segment_ref(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position); | ||||
| 
 | ||||
| randomx_argon2_impl *randomx_argon2_impl_sse3(); | ||||
| randomx_argon2_impl *randomx_argon2_impl_avx2(); | ||||
| 
 | ||||
| #if defined(__cplusplus) | ||||
| } | ||||
| #endif | ||||
|  |  | |||
							
								
								
									
										174
									
								
								src/argon2_avx2.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								src/argon2_avx2.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,174 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are met: | ||||
| 	* Redistributions of source code must retain the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer. | ||||
| 	* Redistributions in binary form must reproduce the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer in the | ||||
| 	  documentation and/or other materials provided with the distribution. | ||||
| 	* Neither the name of the copyright holder nor the | ||||
| 	  names of its contributors may be used to endorse or promote products | ||||
| 	  derived from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| */ | ||||
| 
 | ||||
| /* Original code from Argon2 reference source code package used under CC0 Licence
 | ||||
|  * https://github.com/P-H-C/phc-winner-argon2
 | ||||
|  * Copyright 2015 | ||||
|  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves | ||||
| */ | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <string.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
| #include "argon2.h" | ||||
| 
 | ||||
| void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position); | ||||
| 
 | ||||
| randomx_argon2_impl* randomx_argon2_impl_avx2() { | ||||
| #if defined(__AVX2__) | ||||
| 	return &randomx_argon2_fill_segment_avx2; | ||||
| #endif | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| #if defined(__AVX2__) | ||||
| 
 | ||||
| #include "argon2_core.h" | ||||
| 
 | ||||
| #include "blake2/blamka-round-avx2.h" | ||||
| #include "blake2/blake2-impl.h" | ||||
| #include "blake2/blake2.h" | ||||
| 
 | ||||
| static void fill_block(__m256i* state, const block* ref_block, | ||||
| 	block* next_block, int with_xor) { | ||||
| 	__m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	if (with_xor) { | ||||
| 		for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { | ||||
| 			state[i] = _mm256_xor_si256( | ||||
| 				state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); | ||||
| 			block_XY[i] = _mm256_xor_si256( | ||||
| 				state[i], _mm256_loadu_si256((const __m256i*)next_block->v + i)); | ||||
| 		} | ||||
| 	} | ||||
| 	else { | ||||
| 		for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { | ||||
| 			block_XY[i] = state[i] = _mm256_xor_si256( | ||||
| 				state[i], _mm256_loadu_si256((const __m256i*)ref_block->v + i)); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < 4; ++i) { | ||||
| 		BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5], | ||||
| 			state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]); | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < 4; ++i) { | ||||
| 		BLAKE2_ROUND_2(state[0 + i], state[4 + i], state[8 + i], state[12 + i], | ||||
| 			state[16 + i], state[20 + i], state[24 + i], state[28 + i]); | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) { | ||||
| 		state[i] = _mm256_xor_si256(state[i], block_XY[i]); | ||||
| 		_mm256_storeu_si256((__m256i*)next_block->v + i, state[i]); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void randomx_argon2_fill_segment_avx2(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position) { | ||||
| 	block* ref_block = NULL, * curr_block = NULL; | ||||
| 	block address_block, input_block; | ||||
| 	uint64_t pseudo_rand, ref_index, ref_lane; | ||||
| 	uint32_t prev_offset, curr_offset; | ||||
| 	uint32_t starting_index, i; | ||||
| 	__m256i state[ARGON2_HWORDS_IN_BLOCK]; | ||||
| 
 | ||||
| 	if (instance == NULL) { | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	starting_index = 0; | ||||
| 
 | ||||
| 	if ((0 == position.pass) && (0 == position.slice)) { | ||||
| 		starting_index = 2; /* we have already generated the first two blocks */ | ||||
| 	} | ||||
| 
 | ||||
| 	/* Offset of the current block */ | ||||
| 	curr_offset = position.lane * instance->lane_length + | ||||
| 		position.slice * instance->segment_length + starting_index; | ||||
| 
 | ||||
| 	if (0 == curr_offset % instance->lane_length) { | ||||
| 		/* Last block in this lane */ | ||||
| 		prev_offset = curr_offset + instance->lane_length - 1; | ||||
| 	} | ||||
| 	else { | ||||
| 		/* Previous block */ | ||||
| 		prev_offset = curr_offset - 1; | ||||
| 	} | ||||
| 
 | ||||
| 	memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); | ||||
| 
 | ||||
| 	for (i = starting_index; i < instance->segment_length; | ||||
| 		++i, ++curr_offset, ++prev_offset) { | ||||
| 		/*1.1 Rotating prev_offset if needed */ | ||||
| 		if (curr_offset % instance->lane_length == 1) { | ||||
| 			prev_offset = curr_offset - 1; | ||||
| 		} | ||||
| 
 | ||||
| 		/* 1.2 Computing the index of the reference block */ | ||||
| 		/* 1.2.1 Taking pseudo-random value from the previous block */ | ||||
| 		pseudo_rand = instance->memory[prev_offset].v[0]; | ||||
| 
 | ||||
| 		/* 1.2.2 Computing the lane of the reference block */ | ||||
| 		ref_lane = ((pseudo_rand >> 32)) % instance->lanes; | ||||
| 
 | ||||
| 		if ((position.pass == 0) && (position.slice == 0)) { | ||||
| 			/* Can not reference other lanes yet */ | ||||
| 			ref_lane = position.lane; | ||||
| 		} | ||||
| 
 | ||||
| 		/* 1.2.3 Computing the number of possible reference block within the
 | ||||
| 		 * lane. | ||||
| 		 */ | ||||
| 		position.index = i; | ||||
| 		ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, | ||||
| 			ref_lane == position.lane); | ||||
| 
 | ||||
| 		/* 2 Creating a new block */ | ||||
| 		ref_block = | ||||
| 			instance->memory + instance->lane_length * ref_lane + ref_index; | ||||
| 		curr_block = instance->memory + curr_offset; | ||||
| 		if (ARGON2_VERSION_10 == instance->version) { | ||||
| 			/* version 1.2.1 and earlier: overwrite, not XOR */ | ||||
| 			fill_block(state, ref_block, curr_block, 0); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (0 == position.pass) { | ||||
| 				fill_block(state, ref_block, curr_block, 0); | ||||
| 			} | ||||
| 			else { | ||||
| 				fill_block(state, ref_block, curr_block, 1); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
|  | @ -70,18 +70,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | ||||
| 
 | ||||
| /***************Instance and Position constructors**********/ | ||||
| void rxa2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } | ||||
| 
 | ||||
| void rxa2_copy_block(block *dst, const block *src) { | ||||
| 	memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); | ||||
| } | ||||
| 
 | ||||
| void rxa2_xor_block(block *dst, const block *src) { | ||||
| 	int i; | ||||
| 	for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { | ||||
| 		dst->v[i] ^= src->v[i]; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void load_block(block *dst, const void *input) { | ||||
| 	unsigned i; | ||||
|  | @ -97,69 +85,7 @@ static void store_block(void *output, const block *src) { | |||
| 	} | ||||
| } | ||||
| 
 | ||||
| /***************Memory functions*****************/ | ||||
| 
 | ||||
| int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, | ||||
| 	size_t num, size_t size) { | ||||
| 	size_t memory_size = num * size; | ||||
| 	if (memory == NULL) { | ||||
| 		return ARGON2_MEMORY_ALLOCATION_ERROR; | ||||
| 	} | ||||
| 
 | ||||
| 	/* 1. Check for multiplication overflow */ | ||||
| 	if (size != 0 && memory_size / size != num) { | ||||
| 		return ARGON2_MEMORY_ALLOCATION_ERROR; | ||||
| 	} | ||||
| 
 | ||||
| 	/* 2. Try to allocate with appropriate allocator */ | ||||
| 	if (context->allocate_cbk) { | ||||
| 		(context->allocate_cbk)(memory, memory_size); | ||||
| 	} | ||||
| 	else { | ||||
| 		*memory = (uint8_t*)malloc(memory_size); | ||||
| 	} | ||||
| 
 | ||||
| 	if (*memory == NULL) { | ||||
| 		return ARGON2_MEMORY_ALLOCATION_ERROR; | ||||
| 	} | ||||
| 
 | ||||
| 	return ARGON2_OK; | ||||
| } | ||||
| 
 | ||||
| void rxa2_free_memory(const argon2_context *context, uint8_t *memory, | ||||
| 	size_t num, size_t size) { | ||||
| 	size_t memory_size = num * size; | ||||
| 	rxa2_clear_internal_memory(memory, memory_size); | ||||
| 	if (context->free_cbk) { | ||||
| 		(context->free_cbk)(memory, memory_size); | ||||
| 	} | ||||
| 	else { | ||||
| 		free(memory); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void NOT_OPTIMIZED rxa2_secure_wipe_memory(void *v, size_t n) { | ||||
| #if defined(_MSC_VER) && VC_GE_2005(_MSC_VER) | ||||
| 	SecureZeroMemory(v, n); | ||||
| #elif defined memset_s | ||||
| 	memset_s(v, n, 0, n); | ||||
| #elif defined(__OpenBSD__) | ||||
| 	explicit_bzero(v, n); | ||||
| #else | ||||
| 	static void *(*const volatile memset_sec)(void *, int, size_t) = &memset; | ||||
| 	memset_sec(v, 0, n); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /* Memory clear flag defaults to true. */ | ||||
| #define FLAG_clear_internal_memory 0 | ||||
| void rxa2_clear_internal_memory(void *v, size_t n) { | ||||
| 	if (FLAG_clear_internal_memory && v) { | ||||
| 		rxa2_secure_wipe_memory(v, n); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| uint32_t rxa2_index_alpha(const argon2_instance_t *instance, | ||||
| uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance, | ||||
| 	const argon2_position_t *position, uint32_t pseudo_rand, | ||||
| 	int same_lane) { | ||||
| 	/*
 | ||||
|  | @ -241,24 +167,22 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) { | |||
| 		for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { | ||||
| 			for (l = 0; l < instance->lanes; ++l) { | ||||
| 				argon2_position_t position = { r, l, (uint8_t)s, 0 }; | ||||
| 				rxa2_fill_segment(instance, position); | ||||
| 				//fill the segment using the selected implementation
 | ||||
| 				instance->impl(instance, position); | ||||
| 			} | ||||
| 		} | ||||
| #ifdef GENKAT | ||||
| 		internal_kat(instance, r); /* Print all memory blocks */ | ||||
| #endif | ||||
| 	} | ||||
| 	return ARGON2_OK; | ||||
| } | ||||
| 
 | ||||
| int rxa2_fill_memory_blocks(argon2_instance_t *instance) { | ||||
| int randomx_argon2_fill_memory_blocks(argon2_instance_t *instance) { | ||||
| 	if (instance == NULL || instance->lanes == 0) { | ||||
| 		return ARGON2_INCORRECT_PARAMETER; | ||||
| 	} | ||||
| 	return fill_memory_blocks_st(instance); | ||||
| } | ||||
| 
 | ||||
| int rxa2_validate_inputs(const argon2_context *context) { | ||||
| int randomx_argon2_validate_inputs(const argon2_context *context) { | ||||
| 	if (NULL == context) { | ||||
| 		return ARGON2_INCORRECT_PARAMETER; | ||||
| 	} | ||||
|  | @ -394,7 +318,6 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc | |||
| 		load_block(&instance->memory[l * instance->lane_length + 1], | ||||
| 			blockhash_bytes); | ||||
| 	} | ||||
| 	rxa2_clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); | ||||
| } | ||||
| 
 | ||||
| void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) { | ||||
|  | @ -431,11 +354,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type | |||
| 	if (context->pwd != NULL) { | ||||
| 		blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, | ||||
| 			context->pwdlen); | ||||
| 
 | ||||
| 		if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { | ||||
| 			rxa2_secure_wipe_memory(context->pwd, context->pwdlen); | ||||
| 			context->pwdlen = 0; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	store32(&value, context->saltlen); | ||||
|  | @ -451,11 +369,6 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type | |||
| 	if (context->secret != NULL) { | ||||
| 		blake2b_update(&BlakeHash, (const uint8_t *)context->secret, | ||||
| 			context->secretlen); | ||||
| 
 | ||||
| 		if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { | ||||
| 			rxa2_secure_wipe_memory(context->secret, context->secretlen); | ||||
| 			context->secretlen = 0; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	store32(&value, context->adlen); | ||||
|  | @ -469,7 +382,7 @@ void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type | |||
| 	blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); | ||||
| } | ||||
| 
 | ||||
| int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) { | ||||
| int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context) { | ||||
| 	uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; | ||||
| 	int result = ARGON2_OK; | ||||
| 
 | ||||
|  | @ -478,10 +391,7 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) | |||
| 	instance->context_ptr = context; | ||||
| 
 | ||||
| 	/* 1. Memory allocation */ | ||||
| 	/*result = allocate_memory(context, (uint8_t **)&(instance->memory), instance->memory_blocks, sizeof(block));
 | ||||
| 	if (result != ARGON2_OK) { | ||||
| 		return result; | ||||
| 	}*/ | ||||
| 	//RandomX takes care of memory allocation
 | ||||
| 
 | ||||
| 	/* 2. Initial hashing */ | ||||
| 	/* H_0 + 8 extra bytes to produce the first blocks */ | ||||
|  | @ -489,15 +399,13 @@ int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) | |||
| 	/* Hashing all inputs */ | ||||
| 	rxa2_initial_hash(blockhash, context, instance->type); | ||||
| 	/* Zeroing 8 extra bytes */ | ||||
| 	rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, | ||||
| 	/*rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
 | ||||
| 		ARGON2_PREHASH_SEED_LENGTH - | ||||
| 		ARGON2_PREHASH_DIGEST_LENGTH); | ||||
| 		ARGON2_PREHASH_DIGEST_LENGTH);*/ | ||||
| 
 | ||||
| 	/* 3. Creating first blocks, we always have at least two blocks in a slice
 | ||||
| 	 */ | ||||
| 	rxa2_fill_first_blocks(blockhash, instance); | ||||
| 	/* Clearing the hash */ | ||||
| 	rxa2_clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); | ||||
| 
 | ||||
| 	return ARGON2_OK; | ||||
| } | ||||
|  |  | |||
|  | @ -73,17 +73,6 @@ enum argon2_core_constants { | |||
|  */ | ||||
| typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; | ||||
| 
 | ||||
| /*****************Functions that work with the block******************/ | ||||
| 
 | ||||
| /* Initialize each byte of the block with @in */ | ||||
| void rxa2_init_block_value(block *b, uint8_t in); | ||||
| 
 | ||||
| /* Copy block @src to block @dst */ | ||||
| void rxa2_copy_block(block *dst, const block *src); | ||||
| 
 | ||||
| /* XOR @src onto @dst bytewise */ | ||||
| void rxa2_xor_block(block *dst, const block *src); | ||||
| 
 | ||||
| /*
 | ||||
|  * Argon2 instance: memory pointer, number of passes, amount of memory, type, | ||||
|  * and derived values. | ||||
|  | @ -102,6 +91,7 @@ typedef struct Argon2_instance_t { | |||
| 	argon2_type type; | ||||
| 	int print_internals; /* whether to print the memory blocks */ | ||||
| 	argon2_context *context_ptr; /* points back to original context */ | ||||
| 	randomx_argon2_impl *impl; | ||||
| } argon2_instance_t; | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -123,42 +113,6 @@ typedef struct Argon2_thread_data { | |||
| 
 | ||||
| /*************************Argon2 core functions********************************/ | ||||
| 
 | ||||
| /* Allocates memory to the given pointer, uses the appropriate allocator as
 | ||||
|  * specified in the context. Total allocated memory is num*size. | ||||
|  * @param context argon2_context which specifies the allocator | ||||
|  * @param memory pointer to the pointer to the memory | ||||
|  * @param size the size in bytes for each element to be allocated | ||||
|  * @param num the number of elements to be allocated | ||||
|  * @return ARGON2_OK if @memory is a valid pointer and memory is allocated | ||||
|  */ | ||||
| int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, | ||||
| 	size_t num, size_t size); | ||||
| 
 | ||||
| /*
 | ||||
|  * Frees memory at the given pointer, uses the appropriate deallocator as | ||||
|  * specified in the context. Also cleans the memory using clear_internal_memory. | ||||
|  * @param context argon2_context which specifies the deallocator | ||||
|  * @param memory pointer to buffer to be freed | ||||
|  * @param size the size in bytes for each element to be deallocated | ||||
|  * @param num the number of elements to be deallocated | ||||
|  */ | ||||
| void rxa2_free_memory(const argon2_context *context, uint8_t *memory, | ||||
| 	size_t num, size_t size); | ||||
| 
 | ||||
| /* Function that securely cleans the memory. This ignores any flags set
 | ||||
|  * regarding clearing memory. Usually one just calls clear_internal_memory. | ||||
|  * @param mem Pointer to the memory | ||||
|  * @param s Memory size in bytes | ||||
|  */ | ||||
| void rxa2_secure_wipe_memory(void *v, size_t n); | ||||
| 
 | ||||
| /* Function that securely clears the memory if FLAG_clear_internal_memory is
 | ||||
|  * set. If the flag isn't set, this function does nothing. | ||||
|  * @param mem Pointer to the memory | ||||
|  * @param s Memory size in bytes | ||||
|  */ | ||||
| void rxa2_clear_internal_memory(void *v, size_t n); | ||||
| 
 | ||||
| /*
 | ||||
|  * Computes absolute position of reference block in the lane following a skewed | ||||
|  * distribution and using a pseudo-random value as input | ||||
|  | @ -169,7 +123,7 @@ void rxa2_clear_internal_memory(void *v, size_t n); | |||
|  * If so we can reference the current segment | ||||
|  * @pre All pointers must be valid | ||||
|  */ | ||||
| uint32_t rxa2_index_alpha(const argon2_instance_t *instance, | ||||
| uint32_t randomx_argon2_index_alpha(const argon2_instance_t *instance, | ||||
| 	const argon2_position_t *position, uint32_t pseudo_rand, | ||||
| 	int same_lane); | ||||
| 
 | ||||
|  | @ -180,28 +134,7 @@ uint32_t rxa2_index_alpha(const argon2_instance_t *instance, | |||
|  * @return ARGON2_OK if everything is all right, otherwise one of error codes | ||||
|  * (all defined in <argon2.h> | ||||
|  */ | ||||
| int rxa2_validate_inputs(const argon2_context *context); | ||||
| 
 | ||||
| /*
 | ||||
|  * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears | ||||
|  * password and secret if needed | ||||
|  * @param  context  Pointer to the Argon2 internal structure containing memory | ||||
|  * pointer, and parameters for time and space requirements. | ||||
|  * @param  blockhash Buffer for pre-hashing digest | ||||
|  * @param  type Argon2 type | ||||
|  * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes | ||||
|  * allocated | ||||
|  */ | ||||
| void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, | ||||
| 	argon2_type type); | ||||
| 
 | ||||
| /*
 | ||||
|  * Function creates first 2 blocks per lane | ||||
|  * @param instance Pointer to the current instance | ||||
|  * @param blockhash Pointer to the pre-hashing digest | ||||
|  * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values | ||||
|  */ | ||||
| void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); | ||||
| int randomx_argon2_validate_inputs(const argon2_context *context); | ||||
| 
 | ||||
| /*
 | ||||
|  * Function allocates memory, hashes the inputs with Blake,  and creates first | ||||
|  | @ -213,31 +146,7 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc | |||
|  * @return Zero if successful, -1 if memory failed to allocate. @context->state | ||||
|  * will be modified if successful. | ||||
|  */ | ||||
| int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context); | ||||
| 
 | ||||
| /*
 | ||||
|  * XORing the last block of each lane, hashing it, making the tag. Deallocates | ||||
|  * the memory. | ||||
|  * @param context Pointer to current Argon2 context (use only the out parameters | ||||
|  * from it) | ||||
|  * @param instance Pointer to current instance of Argon2 | ||||
|  * @pre instance->state must point to necessary amount of memory | ||||
|  * @pre context->out must point to outlen bytes of memory | ||||
|  * @pre if context->free_cbk is not NULL, it should point to a function that | ||||
|  * deallocates memory | ||||
|  */ | ||||
| void rxa2_finalize(const argon2_context *context, argon2_instance_t *instance); | ||||
| 
 | ||||
| /*
 | ||||
|  * Function that fills the segment using previous segments also from other | ||||
|  * threads | ||||
|  * @param context current context | ||||
|  * @param instance Pointer to the current instance | ||||
|  * @param position Current position | ||||
|  * @pre all block pointers must be valid | ||||
|  */ | ||||
| void rxa2_fill_segment(const argon2_instance_t *instance, | ||||
| 	argon2_position_t position); | ||||
| int randomx_argon2_initialize(argon2_instance_t *instance, argon2_context *context); | ||||
| 
 | ||||
| /*
 | ||||
|  * Function that fills the entire memory t_cost times based on the first two | ||||
|  | @ -245,7 +154,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, | |||
|  * @param instance Pointer to the current instance | ||||
|  * @return ARGON2_OK if successful, @context->state | ||||
|  */ | ||||
| int rxa2_fill_memory_blocks(argon2_instance_t *instance); | ||||
| int randomx_argon2_fill_memory_blocks(argon2_instance_t* instance); | ||||
| 
 | ||||
| #if defined(__cplusplus) | ||||
| } | ||||
|  |  | |||
|  | @ -43,6 +43,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "blake2/blake2-impl.h" | ||||
| #include "blake2/blake2.h" | ||||
| 
 | ||||
| static void copy_block(block* dst, const block* src) { | ||||
| 	memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); | ||||
| } | ||||
| 
 | ||||
| static void xor_block(block* dst, const block* src) { | ||||
| 	int i; | ||||
| 	for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { | ||||
| 		dst->v[i] ^= src->v[i]; | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
|  /*
 | ||||
|   * Function fills a new memory block and optionally XORs the old block over the new one. | ||||
|   * @next_block must be initialized. | ||||
|  | @ -57,13 +68,13 @@ static void fill_block(const block *prev_block, const block *ref_block, | |||
| 	block blockR, block_tmp; | ||||
| 	unsigned i; | ||||
| 
 | ||||
| 	rxa2_copy_block(&blockR, ref_block); | ||||
| 	rxa2_xor_block(&blockR, prev_block); | ||||
| 	rxa2_copy_block(&block_tmp, &blockR); | ||||
| 	copy_block(&blockR, ref_block); | ||||
| 	xor_block(&blockR, prev_block); | ||||
| 	copy_block(&block_tmp, &blockR); | ||||
| 	/* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */ | ||||
| 	if (with_xor) { | ||||
| 		/* Saving the next block contents for XOR over: */ | ||||
| 		rxa2_xor_block(&block_tmp, next_block); | ||||
| 		xor_block(&block_tmp, next_block); | ||||
| 		/* Now blockR = ref_block + prev_block and
 | ||||
| 		   block_tmp = ref_block + prev_block + next_block */ | ||||
| 	} | ||||
|  | @ -92,18 +103,11 @@ static void fill_block(const block *prev_block, const block *ref_block, | |||
| 			blockR.v[2 * i + 113]); | ||||
| 	} | ||||
| 
 | ||||
| 	rxa2_copy_block(next_block, &block_tmp); | ||||
| 	rxa2_xor_block(next_block, &blockR); | ||||
| 	copy_block(next_block, &block_tmp); | ||||
| 	xor_block(next_block, &blockR); | ||||
| } | ||||
| 
 | ||||
| static void next_addresses(block *address_block, block *input_block, | ||||
| 	const block *zero_block) { | ||||
| 	input_block->v[6]++; | ||||
| 	fill_block(zero_block, input_block, address_block, 0); | ||||
| 	fill_block(zero_block, address_block, address_block, 0); | ||||
| } | ||||
| 
 | ||||
| void rxa2_fill_segment(const argon2_instance_t *instance, | ||||
| void randomx_argon2_fill_segment_ref(const argon2_instance_t *instance, | ||||
| 	argon2_position_t position) { | ||||
| 	block *ref_block = NULL, *curr_block = NULL; | ||||
| 	block address_block, input_block, zero_block; | ||||
|  | @ -111,38 +115,15 @@ void rxa2_fill_segment(const argon2_instance_t *instance, | |||
| 	uint32_t prev_offset, curr_offset; | ||||
| 	uint32_t starting_index; | ||||
| 	uint32_t i; | ||||
| 	int data_independent_addressing; | ||||
| 
 | ||||
| 	if (instance == NULL) { | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	data_independent_addressing = | ||||
| 		(instance->type == Argon2_i) || | ||||
| 		(instance->type == Argon2_id && (position.pass == 0) && | ||||
| 		(position.slice < ARGON2_SYNC_POINTS / 2)); | ||||
| 
 | ||||
| 	if (data_independent_addressing) { | ||||
| 		rxa2_init_block_value(&zero_block, 0); | ||||
| 		rxa2_init_block_value(&input_block, 0); | ||||
| 
 | ||||
| 		input_block.v[0] = position.pass; | ||||
| 		input_block.v[1] = position.lane; | ||||
| 		input_block.v[2] = position.slice; | ||||
| 		input_block.v[3] = instance->memory_blocks; | ||||
| 		input_block.v[4] = instance->passes; | ||||
| 		input_block.v[5] = instance->type; | ||||
| 	} | ||||
| 
 | ||||
| 	starting_index = 0; | ||||
| 
 | ||||
| 	if ((0 == position.pass) && (0 == position.slice)) { | ||||
| 		starting_index = 2; /* we have already generated the first two blocks */ | ||||
| 
 | ||||
| 		/* Don't forget to generate the first block of addresses: */ | ||||
| 		if (data_independent_addressing) { | ||||
| 			next_addresses(&address_block, &input_block, &zero_block); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* Offset of the current block */ | ||||
|  | @ -167,15 +148,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, | |||
| 
 | ||||
| 		/* 1.2 Computing the index of the reference block */ | ||||
| 		/* 1.2.1 Taking pseudo-random value from the previous block */ | ||||
| 		if (data_independent_addressing) { | ||||
| 			if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { | ||||
| 				next_addresses(&address_block, &input_block, &zero_block); | ||||
| 			} | ||||
| 			pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; | ||||
| 		} | ||||
| 		else { | ||||
| 			pseudo_rand = instance->memory[prev_offset].v[0]; | ||||
| 		} | ||||
| 		pseudo_rand = instance->memory[prev_offset].v[0]; | ||||
| 
 | ||||
| 		/* 1.2.2 Computing the lane of the reference block */ | ||||
| 		ref_lane = ((pseudo_rand >> 32)) % instance->lanes; | ||||
|  | @ -189,7 +162,7 @@ void rxa2_fill_segment(const argon2_instance_t *instance, | |||
| 		 * lane. | ||||
| 		 */ | ||||
| 		position.index = i; | ||||
| 		ref_index = rxa2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, | ||||
| 		ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, | ||||
| 			ref_lane == position.lane); | ||||
| 
 | ||||
| 		/* 2 Creating a new block */ | ||||
|  |  | |||
							
								
								
									
										182
									
								
								src/argon2_sse3.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										182
									
								
								src/argon2_sse3.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,182 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are met: | ||||
| 	* Redistributions of source code must retain the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer. | ||||
| 	* Redistributions in binary form must reproduce the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer in the | ||||
| 	  documentation and/or other materials provided with the distribution. | ||||
| 	* Neither the name of the copyright holder nor the | ||||
| 	  names of its contributors may be used to endorse or promote products | ||||
| 	  derived from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| */ | ||||
| 
 | ||||
| /* Original code from Argon2 reference source code package used under CC0 Licence
 | ||||
|  * https://github.com/P-H-C/phc-winner-argon2
 | ||||
|  * Copyright 2015 | ||||
|  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves | ||||
| */ | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <string.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
| #include "argon2.h" | ||||
| 
 | ||||
| #if defined(_MSC_VER) //MSVC doesn't define SSSE3
 | ||||
| #define __SSSE3__ | ||||
| #endif | ||||
| 
 | ||||
| void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position); | ||||
| 
 | ||||
| randomx_argon2_impl* randomx_argon2_impl_sse3() { | ||||
| #if defined(__SSSE3__) | ||||
| 	return &randomx_argon2_fill_segment_sse3; | ||||
| #endif | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| #if defined(__SSSE3__) | ||||
| 
 | ||||
| #include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ | ||||
| 
 | ||||
| #include "argon2_core.h" | ||||
| 
 | ||||
| #include "blake2/blamka-round-sse3.h" | ||||
| #include "blake2/blake2-impl.h" | ||||
| #include "blake2/blake2.h" | ||||
| 
 | ||||
| static void fill_block(__m128i* state, const block* ref_block, | ||||
| 	block* next_block, int with_xor) { | ||||
| 	__m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; | ||||
| 	unsigned int i; | ||||
| 
 | ||||
| 	if (with_xor) { | ||||
| 		for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { | ||||
| 			state[i] = _mm_xor_si128( | ||||
| 				state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); | ||||
| 			block_XY[i] = _mm_xor_si128( | ||||
| 				state[i], _mm_loadu_si128((const __m128i*)next_block->v + i)); | ||||
| 		} | ||||
| 	} | ||||
| 	else { | ||||
| 		for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { | ||||
| 			block_XY[i] = state[i] = _mm_xor_si128( | ||||
| 				state[i], _mm_loadu_si128((const __m128i*)ref_block->v + i)); | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < 8; ++i) { | ||||
| 		BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], | ||||
| 			state[8 * i + 3], state[8 * i + 4], state[8 * i + 5], | ||||
| 			state[8 * i + 6], state[8 * i + 7]); | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < 8; ++i) { | ||||
| 		BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i], | ||||
| 			state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i], | ||||
| 			state[8 * 6 + i], state[8 * 7 + i]); | ||||
| 	} | ||||
| 
 | ||||
| 	for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { | ||||
| 		state[i] = _mm_xor_si128(state[i], block_XY[i]); | ||||
| 		_mm_storeu_si128((__m128i*)next_block->v + i, state[i]); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void randomx_argon2_fill_segment_sse3(const argon2_instance_t* instance, | ||||
| 	argon2_position_t position) { | ||||
| 	block* ref_block = NULL, * curr_block = NULL; | ||||
| 	block address_block, input_block; | ||||
| 	uint64_t pseudo_rand, ref_index, ref_lane; | ||||
| 	uint32_t prev_offset, curr_offset; | ||||
| 	uint32_t starting_index, i; | ||||
| 	__m128i state[ARGON2_OWORDS_IN_BLOCK]; | ||||
| 
 | ||||
| 	if (instance == NULL) { | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	starting_index = 0; | ||||
| 
 | ||||
| 	if ((0 == position.pass) && (0 == position.slice)) { | ||||
| 		starting_index = 2; /* we have already generated the first two blocks */ | ||||
| 	} | ||||
| 
 | ||||
| 	/* Offset of the current block */ | ||||
| 	curr_offset = position.lane * instance->lane_length + | ||||
| 		position.slice * instance->segment_length + starting_index; | ||||
| 
 | ||||
| 	if (0 == curr_offset % instance->lane_length) { | ||||
| 		/* Last block in this lane */ | ||||
| 		prev_offset = curr_offset + instance->lane_length - 1; | ||||
| 	} | ||||
| 	else { | ||||
| 		/* Previous block */ | ||||
| 		prev_offset = curr_offset - 1; | ||||
| 	} | ||||
| 
 | ||||
| 	memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE); | ||||
| 
 | ||||
| 	for (i = starting_index; i < instance->segment_length; | ||||
| 		++i, ++curr_offset, ++prev_offset) { | ||||
| 		/*1.1 Rotating prev_offset if needed */ | ||||
| 		if (curr_offset % instance->lane_length == 1) { | ||||
| 			prev_offset = curr_offset - 1; | ||||
| 		} | ||||
| 
 | ||||
| 		/* 1.2 Computing the index of the reference block */ | ||||
| 		/* 1.2.1 Taking pseudo-random value from the previous block */ | ||||
| 		pseudo_rand = instance->memory[prev_offset].v[0]; | ||||
| 
 | ||||
| 		/* 1.2.2 Computing the lane of the reference block */ | ||||
| 		ref_lane = ((pseudo_rand >> 32)) % instance->lanes; | ||||
| 
 | ||||
| 		if ((position.pass == 0) && (position.slice == 0)) { | ||||
| 			/* Can not reference other lanes yet */ | ||||
| 			ref_lane = position.lane; | ||||
| 		} | ||||
| 
 | ||||
| 		/* 1.2.3 Computing the number of possible reference block within the
 | ||||
| 		 * lane. | ||||
| 		 */ | ||||
| 		position.index = i; | ||||
| 		ref_index = randomx_argon2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, | ||||
| 			ref_lane == position.lane); | ||||
| 
 | ||||
| 		/* 2 Creating a new block */ | ||||
| 		ref_block = | ||||
| 			instance->memory + instance->lane_length * ref_lane + ref_index; | ||||
| 		curr_block = instance->memory + curr_offset; | ||||
| 		if (ARGON2_VERSION_10 == instance->version) { | ||||
| 			/* version 1.2.1 and earlier: overwrite, not XOR */ | ||||
| 			fill_block(state, ref_block, curr_block, 0); | ||||
| 		} | ||||
| 		else { | ||||
| 			if (0 == position.pass) { | ||||
| 				fill_block(state, ref_block, curr_block, 0); | ||||
| 			} | ||||
| 			else { | ||||
| 				fill_block(state, ref_block, curr_block, 1); | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
										189
									
								
								src/blake2/blamka-round-avx2.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										189
									
								
								src/blake2/blamka-round-avx2.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,189 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are met: | ||||
| 	* Redistributions of source code must retain the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer. | ||||
| 	* Redistributions in binary form must reproduce the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer in the | ||||
| 	  documentation and/or other materials provided with the distribution. | ||||
| 	* Neither the name of the copyright holder nor the | ||||
| 	  names of its contributors may be used to endorse or promote products | ||||
| 	  derived from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| */ | ||||
| 
 | ||||
| /* Original code from Argon2 reference source code package used under CC0 Licence
 | ||||
|  * https://github.com/P-H-C/phc-winner-argon2
 | ||||
|  * Copyright 2015 | ||||
|  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves | ||||
| */ | ||||
| 
 | ||||
| #ifndef BLAKE_ROUND_MKA_OPT_H | ||||
| #define BLAKE_ROUND_MKA_OPT_H | ||||
| 
 | ||||
| #include "blake2-impl.h" | ||||
| 
 | ||||
| #ifdef __GNUC__ | ||||
| #include <x86intrin.h> | ||||
| #else | ||||
| #include <intrin.h> | ||||
| #endif | ||||
| 
 | ||||
| #define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)) | ||||
| #define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) | ||||
| #define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) | ||||
| #define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x))) | ||||
| 
 | ||||
| #define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do { \ | ||||
|         __m256i ml = _mm256_mul_epu32(A0, B0); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ | ||||
|         D0 = _mm256_xor_si256(D0, A0); \ | ||||
|         D0 = rotr32(D0); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(C0, D0); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ | ||||
|         \ | ||||
|         B0 = _mm256_xor_si256(B0, C0); \ | ||||
|         B0 = rotr24(B0); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(A1, B1); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ | ||||
|         D1 = _mm256_xor_si256(D1, A1); \ | ||||
|         D1 = rotr32(D1); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(C1, D1); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ | ||||
|         \ | ||||
|         B1 = _mm256_xor_si256(B1, C1); \ | ||||
|         B1 = rotr24(B1); \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do { \ | ||||
|         __m256i ml = _mm256_mul_epu32(A0, B0); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \ | ||||
|         D0 = _mm256_xor_si256(D0, A0); \ | ||||
|         D0 = rotr16(D0); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(C0, D0); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \ | ||||
|         B0 = _mm256_xor_si256(B0, C0); \ | ||||
|         B0 = rotr63(B0); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(A1, B1); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \ | ||||
|         D1 = _mm256_xor_si256(D1, A1); \ | ||||
|         D1 = rotr16(D1); \ | ||||
|         \ | ||||
|         ml = _mm256_mul_epu32(C1, D1); \ | ||||
|         ml = _mm256_add_epi64(ml, ml); \ | ||||
|         C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \ | ||||
|         B1 = _mm256_xor_si256(B1, C1); \ | ||||
|         B1 = rotr63(B1); \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ | ||||
|     do { \ | ||||
|         B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ | ||||
|         C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ | ||||
|         D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ | ||||
|         \ | ||||
|         B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ | ||||
|         C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ | ||||
|         D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do { \ | ||||
|         __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ | ||||
|         __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ | ||||
|         B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         \ | ||||
|         tmp1 = C0; \ | ||||
|         C0 = C1; \ | ||||
|         C1 = tmp1; \ | ||||
|         \ | ||||
|         tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \ | ||||
|         tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \ | ||||
|         D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|     } while(0); | ||||
| 
 | ||||
| #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ | ||||
|     do { \ | ||||
|         B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ | ||||
|         C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ | ||||
|         D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ | ||||
|         \ | ||||
|         B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ | ||||
|         C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ | ||||
|         D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do { \ | ||||
|         __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \ | ||||
|         __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \ | ||||
|         B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         \ | ||||
|         tmp1 = C0; \ | ||||
|         C0 = C1; \ | ||||
|         C1 = tmp1; \ | ||||
|         \ | ||||
|         tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \ | ||||
|         tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \ | ||||
|         D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|         D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do{ \ | ||||
|         G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         \ | ||||
|         DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ | ||||
|         \ | ||||
|         G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         \ | ||||
|         UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     do{ \ | ||||
|         G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         \ | ||||
|         DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         \ | ||||
|         G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|         \ | ||||
|         UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \ | ||||
|     } while((void)0, 0); | ||||
| 
 | ||||
| #endif /* BLAKE_ROUND_MKA_OPT_H */ | ||||
							
								
								
									
										158
									
								
								src/blake2/blamka-round-sse3.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										158
									
								
								src/blake2/blamka-round-sse3.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,158 @@ | |||
| /*
 | ||||
| Copyright (c) 2018-2019, tevador <tevador@gmail.com> | ||||
| 
 | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions are met: | ||||
| 	* Redistributions of source code must retain the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer. | ||||
| 	* Redistributions in binary form must reproduce the above copyright | ||||
| 	  notice, this list of conditions and the following disclaimer in the | ||||
| 	  documentation and/or other materials provided with the distribution. | ||||
| 	* Neither the name of the copyright holder nor the | ||||
| 	  names of its contributors may be used to endorse or promote products | ||||
| 	  derived from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | ||||
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | ||||
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||||
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||||
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
| */ | ||||
| 
 | ||||
| /* Original code from Argon2 reference source code package used under CC0 Licence
 | ||||
|  * https://github.com/P-H-C/phc-winner-argon2
 | ||||
|  * Copyright 2015 | ||||
|  * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves | ||||
| */ | ||||
| 
 | ||||
| #ifndef BLAKE_ROUND_MKA_OPT_H | ||||
| #define BLAKE_ROUND_MKA_OPT_H | ||||
| 
 | ||||
| #include "blake2-impl.h" | ||||
| 
 | ||||
| #ifdef __GNUC__ | ||||
| #include <x86intrin.h> | ||||
| #else | ||||
| #include <intrin.h> | ||||
| #endif | ||||
| 
 | ||||
| #define r16                                                                    \ | ||||
|     (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) | ||||
| #define r24                                                                    \ | ||||
|     (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) | ||||
| #define _mm_roti_epi64(x, c)                                                   \ | ||||
|     (-(c) == 32)                                                               \ | ||||
|         ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \ | ||||
|         : (-(c) == 24)                                                         \ | ||||
|               ? _mm_shuffle_epi8((x), r24)                                     \ | ||||
|               : (-(c) == 16)                                                   \ | ||||
|                     ? _mm_shuffle_epi8((x), r16)                               \ | ||||
|                     : (-(c) == 63)                                             \ | ||||
|                           ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \ | ||||
|                                           _mm_add_epi64((x), (x)))             \ | ||||
|                           : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \ | ||||
|                                           _mm_slli_epi64((x), 64 - (-(c)))) | ||||
| 
 | ||||
| static FORCE_INLINE __m128i fBlaMka(__m128i x, __m128i y) { | ||||
|     const __m128i z = _mm_mul_epu32(x, y); | ||||
|     return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); | ||||
| } | ||||
| 
 | ||||
| #define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \ | ||||
|     do {                                                                       \ | ||||
|         A0 = fBlaMka(A0, B0);                                                  \ | ||||
|         A1 = fBlaMka(A1, B1);                                                  \ | ||||
|                                                                                \ | ||||
|         D0 = _mm_xor_si128(D0, A0);                                            \ | ||||
|         D1 = _mm_xor_si128(D1, A1);                                            \ | ||||
|                                                                                \ | ||||
|         D0 = _mm_roti_epi64(D0, -32);                                          \ | ||||
|         D1 = _mm_roti_epi64(D1, -32);                                          \ | ||||
|                                                                                \ | ||||
|         C0 = fBlaMka(C0, D0);                                                  \ | ||||
|         C1 = fBlaMka(C1, D1);                                                  \ | ||||
|                                                                                \ | ||||
|         B0 = _mm_xor_si128(B0, C0);                                            \ | ||||
|         B1 = _mm_xor_si128(B1, C1);                                            \ | ||||
|                                                                                \ | ||||
|         B0 = _mm_roti_epi64(B0, -24);                                          \ | ||||
|         B1 = _mm_roti_epi64(B1, -24);                                          \ | ||||
|     } while ((void)0, 0) | ||||
| 
 | ||||
| #define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \ | ||||
|     do {                                                                       \ | ||||
|         A0 = fBlaMka(A0, B0);                                                  \ | ||||
|         A1 = fBlaMka(A1, B1);                                                  \ | ||||
|                                                                                \ | ||||
|         D0 = _mm_xor_si128(D0, A0);                                            \ | ||||
|         D1 = _mm_xor_si128(D1, A1);                                            \ | ||||
|                                                                                \ | ||||
|         D0 = _mm_roti_epi64(D0, -16);                                          \ | ||||
|         D1 = _mm_roti_epi64(D1, -16);                                          \ | ||||
|                                                                                \ | ||||
|         C0 = fBlaMka(C0, D0);                                                  \ | ||||
|         C1 = fBlaMka(C1, D1);                                                  \ | ||||
|                                                                                \ | ||||
|         B0 = _mm_xor_si128(B0, C0);                                            \ | ||||
|         B1 = _mm_xor_si128(B1, C1);                                            \ | ||||
|                                                                                \ | ||||
|         B0 = _mm_roti_epi64(B0, -63);                                          \ | ||||
|         B1 = _mm_roti_epi64(B1, -63);                                          \ | ||||
|     } while ((void)0, 0) | ||||
| 
 | ||||
| #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \ | ||||
|     do {                                                                       \ | ||||
|         __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \ | ||||
|         __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \ | ||||
|         B0 = t0;                                                               \ | ||||
|         B1 = t1;                                                               \ | ||||
|                                                                                \ | ||||
|         t0 = C0;                                                               \ | ||||
|         C0 = C1;                                                               \ | ||||
|         C1 = t0;                                                               \ | ||||
|                                                                                \ | ||||
|         t0 = _mm_alignr_epi8(D1, D0, 8);                                       \ | ||||
|         t1 = _mm_alignr_epi8(D0, D1, 8);                                       \ | ||||
|         D0 = t1;                                                               \ | ||||
|         D1 = t0;                                                               \ | ||||
|     } while ((void)0, 0) | ||||
| 
 | ||||
| #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \ | ||||
|     do {                                                                       \ | ||||
|         __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \ | ||||
|         __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \ | ||||
|         B0 = t0;                                                               \ | ||||
|         B1 = t1;                                                               \ | ||||
|                                                                                \ | ||||
|         t0 = C0;                                                               \ | ||||
|         C0 = C1;                                                               \ | ||||
|         C1 = t0;                                                               \ | ||||
|                                                                                \ | ||||
|         t0 = _mm_alignr_epi8(D0, D1, 8);                                       \ | ||||
|         t1 = _mm_alignr_epi8(D1, D0, 8);                                       \ | ||||
|         D0 = t1;                                                               \ | ||||
|         D1 = t0;                                                               \ | ||||
|     } while ((void)0, 0) | ||||
| 
 | ||||
| #define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \ | ||||
|     do {                                                                       \ | ||||
|         G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \ | ||||
|         G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \ | ||||
|                                                                                \ | ||||
|         DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \ | ||||
|                                                                                \ | ||||
|         G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \ | ||||
|         G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \ | ||||
|                                                                                \ | ||||
|         UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \ | ||||
|     } while ((void)0, 0) | ||||
| 
 | ||||
| 
 | ||||
| #endif /* BLAKE_ROUND_MKA_OPT_H */ | ||||
|  | @ -92,7 +92,7 @@ namespace randomx { | |||
| 		context.flags = ARGON2_DEFAULT_FLAGS; | ||||
| 		context.version = ARGON2_VERSION_NUMBER; | ||||
| 
 | ||||
| 		int inputsValid = rxa2_validate_inputs(&context); | ||||
| 		int inputsValid = randomx_argon2_validate_inputs(&context); | ||||
| 		assert(inputsValid == ARGON2_OK); | ||||
| 
 | ||||
| 		/* 2. Align memory size */ | ||||
|  | @ -111,6 +111,7 @@ namespace randomx { | |||
| 		instance.threads = context.threads; | ||||
| 		instance.type = Argon2_d; | ||||
| 		instance.memory = (block*)cache->memory; | ||||
| 		instance.impl = cache->argonImpl; | ||||
| 
 | ||||
| 		if (instance.threads > instance.lanes) { | ||||
| 			instance.threads = instance.lanes; | ||||
|  | @ -119,9 +120,9 @@ namespace randomx { | |||
| 		/* 3. Initialization: Hashing inputs, allocating memory, filling first
 | ||||
| 		 * blocks | ||||
| 		 */ | ||||
| 		rxa2_argon_initialize(&instance, &context); | ||||
| 		randomx_argon2_initialize(&instance, &context); | ||||
| 
 | ||||
| 		rxa2_fill_memory_blocks(&instance); | ||||
| 		randomx_argon2_fill_memory_blocks(&instance); | ||||
| 
 | ||||
| 		cache->reciprocalCache.clear(); | ||||
| 		randomx::Blake2Generator gen(key, keySize); | ||||
|  |  | |||
|  | @ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.hpp" | ||||
| #include "superscalar_program.hpp" | ||||
| #include "allocator.hpp" | ||||
| #include "argon2.h" | ||||
| 
 | ||||
| /* Global scope for C binding */ | ||||
| struct randomx_dataset { | ||||
|  | @ -51,6 +52,7 @@ struct randomx_cache { | |||
| 	randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; | ||||
| 	std::vector<uint64_t> reciprocalCache; | ||||
| 	std::string cacheKey; | ||||
| 	randomx_argon2_impl* argonImpl; | ||||
| 
 | ||||
| 	bool isInitialized() { | ||||
| 		return programs[0].getSize() != 0; | ||||
|  | @ -79,4 +81,21 @@ namespace randomx { | |||
| 	void initCacheCompile(randomx_cache*, const void*, size_t); | ||||
| 	void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber); | ||||
| 	void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); | ||||
| 
 | ||||
| 	inline randomx_argon2_impl* selectArgonImpl(randomx_flags flags) { | ||||
| 		if ((flags & RANDOMX_FLAG_ARGON2) == 0) { | ||||
| 			return &randomx_argon2_fill_segment_ref; | ||||
| 		} | ||||
| 		randomx_argon2_impl* impl = nullptr; | ||||
| 		if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_SSE3) { | ||||
| 			impl = randomx_argon2_impl_sse3(); | ||||
| 		} | ||||
| 		if ((flags & RANDOMX_FLAG_ARGON2) == RANDOMX_FLAG_ARGON2_AVX2) { | ||||
| 			impl = randomx_argon2_impl_avx2(); | ||||
| 		} | ||||
| 		if (impl != nullptr) { | ||||
| 			return impl; | ||||
| 		} | ||||
| 		throw std::runtime_error("Unsupported Argon2 implementation"); | ||||
| 	} | ||||
| } | ||||
|  |  | |||
|  | @ -39,10 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| extern "C" { | ||||
| 
 | ||||
| 	randomx_cache *randomx_alloc_cache(randomx_flags flags) { | ||||
| 		randomx_cache *cache; | ||||
| 		randomx_cache *cache = nullptr; | ||||
| 
 | ||||
| 		try { | ||||
| 			cache = new randomx_cache(); | ||||
| 			cache->argonImpl = randomx::selectArgonImpl(flags); | ||||
| 			switch (flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)) { | ||||
| 				case RANDOMX_FLAG_DEFAULT: | ||||
| 					cache->dealloc = &randomx::deallocCache<randomx::DefaultAllocator>; | ||||
|  | @ -103,7 +104,9 @@ extern "C" { | |||
| 
 | ||||
| 	void randomx_release_cache(randomx_cache* cache) { | ||||
| 		assert(cache != nullptr); | ||||
| 		cache->dealloc(cache); | ||||
| 		if (cache->memory != nullptr) { | ||||
| 			cache->dealloc(cache); | ||||
| 		} | ||||
| 		delete cache; | ||||
| 	} | ||||
| 
 | ||||
|  | @ -114,7 +117,7 @@ extern "C" { | |||
| 			return nullptr; | ||||
| 		} | ||||
| 
 | ||||
| 		randomx_dataset *dataset; | ||||
| 		randomx_dataset *dataset = nullptr; | ||||
| 
 | ||||
| 		try { | ||||
| 			dataset = new randomx_dataset(); | ||||
|  |  | |||
|  | @ -44,7 +44,10 @@ typedef enum { | |||
|   RANDOMX_FLAG_HARD_AES = 2, | ||||
|   RANDOMX_FLAG_FULL_MEM = 4, | ||||
|   RANDOMX_FLAG_JIT = 8, | ||||
|   RANDOMX_FLAG_SECURE = 16 | ||||
|   RANDOMX_FLAG_SECURE = 16, | ||||
|   RANDOMX_FLAG_ARGON2_SSE3 = 32, | ||||
|   RANDOMX_FLAG_ARGON2_AVX2 = 64, | ||||
|   RANDOMX_FLAG_ARGON2 = 96 | ||||
| } randomx_flags; | ||||
| 
 | ||||
| typedef struct randomx_dataset randomx_dataset; | ||||
|  | @ -62,10 +65,17 @@ extern "C" { | |||
|  *        RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages | ||||
|  *        RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes | ||||
|  *                           subsequent Dataset initialization faster | ||||
|  *        Optionally, one of these two flags may be selected: | ||||
|  *        RANDOMX_FLAG_ARGON2_SSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set | ||||
|  *                                   makes subsequent cache initialization faster | ||||
|  *        RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set | ||||
|  *                                   makes subsequent cache initialization faster | ||||
|  * | ||||
|  * @return Pointer to an allocated randomx_cache structure. | ||||
|  *         NULL is returned if memory allocation fails or if the RANDOMX_FLAG_JIT | ||||
|  *         is set and JIT compilation is not supported on the current platform. | ||||
|  *         Returns NULL if: | ||||
|  *         (1) memory allocation fails | ||||
|  *         (2) the RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform | ||||
|  *         (3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set | ||||
|  */ | ||||
| RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags); | ||||
| 
 | ||||
|  |  | |||
|  | @ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "stopwatch.hpp" | ||||
| #include "utility.hpp" | ||||
| #include "../randomx.h" | ||||
| #include "../dataset.hpp" | ||||
| #include "../blake2/endian.h" | ||||
| #include "../common.hpp" | ||||
| #ifdef _WIN32 | ||||
|  | @ -90,6 +91,8 @@ void printUsage(const char* executable) { | |||
| 	std::cout << "  --init Q      initialize dataset with Q threads (default: 1)" << std::endl; | ||||
| 	std::cout << "  --nonces N    run N nonces (default: 1000)" << std::endl; | ||||
| 	std::cout << "  --seed S      seed for cache initialization (default: 0)" << std::endl; | ||||
| 	std::cout << "  --sse3        use optimized Argon2 for SSSE3 CPUs" << std::endl; | ||||
| 	std::cout << "  --avx2        use optimized Argon2 for AVX2 CPUs" << std::endl; | ||||
| } | ||||
| 
 | ||||
| struct MemoryException : public std::exception { | ||||
|  | @ -127,7 +130,7 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result | |||
| } | ||||
| 
 | ||||
| int main(int argc, char** argv) { | ||||
| 	bool softAes, miningMode, verificationMode, help, largePages, jit, secure; | ||||
| 	bool softAes, miningMode, verificationMode, help, largePages, jit, secure, sse3, avx2; | ||||
| 	int noncesCount, threadCount, initThreadCount; | ||||
| 	uint64_t threadAffinity; | ||||
| 	int32_t seedValue; | ||||
|  | @ -148,6 +151,8 @@ int main(int argc, char** argv) { | |||
| 	readOption("--jit", argc, argv, jit); | ||||
| 	readOption("--help", argc, argv, help); | ||||
| 	readOption("--secure", argc, argv, secure); | ||||
| 	readOption("--sse3", argc, argv, sse3); | ||||
| 	readOption("--avx2", argc, argv, avx2); | ||||
| 
 | ||||
| 	store32(&seed, seedValue); | ||||
| 
 | ||||
|  | @ -166,6 +171,16 @@ int main(int argc, char** argv) { | |||
| 	randomx_cache* cache; | ||||
| 	randomx_flags flags = RANDOMX_FLAG_DEFAULT; | ||||
| 
 | ||||
| 	if (sse3) { | ||||
| 		flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_SSE3); | ||||
| 		std::cout << " - Argon2 implementation: SSE3" << std::endl; | ||||
| 	} | ||||
| 
 | ||||
| 	if (avx2) { | ||||
| 		flags = (randomx_flags)(flags | RANDOMX_FLAG_ARGON2_AVX2); | ||||
| 		std::cout << " - Argon2 implementation: AVX2" << std::endl; | ||||
| 	} | ||||
| 
 | ||||
| 	if (miningMode) { | ||||
| 		flags = (randomx_flags)(flags | RANDOMX_FLAG_FULL_MEM); | ||||
| 		std::cout << " - full memory mode (2080 MiB)" << std::endl; | ||||
|  | @ -213,6 +228,7 @@ int main(int argc, char** argv) { | |||
| 	std::cout << " ..." << std::endl; | ||||
| 
 | ||||
| 	try { | ||||
| 		randomx::selectArgonImpl(flags); //just to check if flags are valid
 | ||||
| 		if (jit && !RANDOMX_HAVE_COMPILER) { | ||||
| 			throw std::runtime_error("JIT compilation is not supported on this platform. Try without --jit"); | ||||
| 		} | ||||
|  |  | |||
|  | @ -997,8 +997,9 @@ int main() { | |||
| 
 | ||||
| 	if (RANDOMX_HAVE_COMPILER) { | ||||
| 		randomx_release_cache(cache); | ||||
| 		cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); | ||||
| 		randomx_destroy_vm(vm); | ||||
| 		vm = nullptr; | ||||
| 		cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); | ||||
| 		initCache("test key 000"); | ||||
| 		vm = randomx_create_vm(RANDOMX_FLAG_JIT, cache, nullptr); | ||||
| 	} | ||||
|  | @ -1013,6 +1014,35 @@ int main() { | |||
| 
 | ||||
| 	runTest("Hash test 2e (compiler)", RANDOMX_HAVE_COMPILER && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), test_e); | ||||
| 
 | ||||
| 	randomx_destroy_vm(vm); | ||||
| 	vm = nullptr; | ||||
| 
 | ||||
| 	randomx_release_cache(cache); | ||||
| 	cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_SSE3); | ||||
| 
 | ||||
| 	runTest("Cache initialization: SSSE3", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { | ||||
| 		initCache("test key 000"); | ||||
| 		uint64_t* cacheMemory = (uint64_t*)cache->memory; | ||||
| 		assert(cacheMemory[0] == 0x191e0e1d23c02186); | ||||
| 		assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); | ||||
| 		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); | ||||
| 	}); | ||||
| 
 | ||||
| 	if (cache != nullptr) | ||||
| 		randomx_release_cache(cache); | ||||
| 	cache = randomx_alloc_cache(RANDOMX_FLAG_ARGON2_AVX2); | ||||
| 
 | ||||
| 	runTest("Cache initialization: AVX2", cache != nullptr && RANDOMX_ARGON_ITERATIONS == 3 && RANDOMX_ARGON_LANES == 1 && RANDOMX_ARGON_MEMORY == 262144 && stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { | ||||
| 		initCache("test key 000"); | ||||
| 		uint64_t* cacheMemory = (uint64_t*)cache->memory; | ||||
| 		assert(cacheMemory[0] == 0x191e0e1d23c02186); | ||||
| 		assert(cacheMemory[1568413] == 0xf1b62fe6210bf8b1); | ||||
| 		assert(cacheMemory[33554431] == 0x1f47f056d05cd99b); | ||||
| 	}); | ||||
| 
 | ||||
| 	if (cache != nullptr) | ||||
| 		randomx_release_cache(cache); | ||||
| 
 | ||||
| 	std::cout << std::endl << "All tests PASSED" << std::endl; | ||||
| 
 | ||||
| 	if (skipped) { | ||||
|  |  | |||
|  | @ -114,6 +114,7 @@ | |||
|       <ConformanceMode>true</ConformanceMode> | ||||
|       <AssemblerOutput>AssemblyCode</AssemblerOutput> | ||||
|       <PreprocessorDefinitions>_MBCS;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> | ||||
|       <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet> | ||||
|     </ClCompile> | ||||
|     <Link> | ||||
|       <EnableCOMDATFolding>true</EnableCOMDATFolding> | ||||
|  | @ -131,8 +132,10 @@ SET ERRORLEVEL = 0</Command> | |||
|   </ItemDefinitionGroup> | ||||
|   <ItemGroup> | ||||
|     <ClCompile Include="..\src\allocator.cpp" /> | ||||
|     <ClCompile Include="..\src\argon2_avx2.c" /> | ||||
|     <ClCompile Include="..\src\argon2_core.c" /> | ||||
|     <ClCompile Include="..\src\argon2_ref.c" /> | ||||
|     <ClCompile Include="..\src\argon2_sse3.c" /> | ||||
|     <ClCompile Include="..\src\assembly_generator_x86.cpp" /> | ||||
|     <ClCompile Include="..\src\blake2_generator.cpp" /> | ||||
|     <ClCompile Include="..\src\blake2\blake2b.c" /> | ||||
|  | @ -163,7 +166,9 @@ SET ERRORLEVEL = 0</Command> | |||
|     <ClInclude Include="..\src\assembly_generator_x86.hpp" /> | ||||
|     <ClInclude Include="..\src\blake2\blake2-impl.h" /> | ||||
|     <ClInclude Include="..\src\blake2\blake2.h" /> | ||||
|     <ClInclude Include="..\src\blake2\blamka-round-avx2.h" /> | ||||
|     <ClInclude Include="..\src\blake2\blamka-round-ref.h" /> | ||||
|     <ClInclude Include="..\src\blake2\blamka-round-sse3.h" /> | ||||
|     <ClInclude Include="..\src\blake2\endian.h" /> | ||||
|     <ClInclude Include="..\src\blake2_generator.hpp" /> | ||||
|     <ClInclude Include="..\src\bytecode_machine.hpp" /> | ||||
|  |  | |||
|  | @ -81,6 +81,12 @@ | |||
|     <ClCompile Include="..\src\bytecode_machine.cpp"> | ||||
|       <Filter>Source Files</Filter> | ||||
|     </ClCompile> | ||||
|     <ClCompile Include="..\src\argon2_sse3.c"> | ||||
|       <Filter>Source Files</Filter> | ||||
|     </ClCompile> | ||||
|     <ClCompile Include="..\src\argon2_avx2.c"> | ||||
|       <Filter>Source Files</Filter> | ||||
|     </ClCompile> | ||||
|   </ItemGroup> | ||||
|   <ItemGroup> | ||||
|     <ClInclude Include="..\src\argon2.h"> | ||||
|  | @ -185,6 +191,12 @@ | |||
|     <ClInclude Include="..\src\bytecode_machine.hpp"> | ||||
|       <Filter>Header Files</Filter> | ||||
|     </ClInclude> | ||||
|     <ClInclude Include="..\src\blake2\blamka-round-sse3.h"> | ||||
|       <Filter>Header Files</Filter> | ||||
|     </ClInclude> | ||||
|     <ClInclude Include="..\src\blake2\blamka-round-avx2.h"> | ||||
|       <Filter>Header Files</Filter> | ||||
|     </ClInclude> | ||||
|   </ItemGroup> | ||||
|   <ItemGroup> | ||||
|     <MASM Include="..\src\jit_compiler_x86_static.asm"> | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue