From 32d827d0a63c66f13cade760aeb0894e4868ee26 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 9 Feb 2019 15:45:26 +0100 Subject: [PATCH] Interpreter with bytecode Fixed some undefined behavior with signed types Fixed different results on big endian systems Removed unused code files Restored FNEG_R instructions Updated documentation --- README.md | 134 ++-- doc/dataset.md | 85 +- doc/isa-ops.md | 187 ++--- doc/isa.md | 237 ++---- src/AssemblyGeneratorX86.cpp | 65 +- src/AssemblyGeneratorX86.hpp | 5 +- src/Cache.cpp | 1 - src/CompiledVirtualMachine.cpp | 44 +- src/CompiledVirtualMachine.hpp | 3 +- src/Instruction.cpp | 22 +- src/Instruction.hpp | 6 +- src/InterpretedVirtualMachine.cpp | 725 +++++++++++++++-- src/InterpretedVirtualMachine.hpp | 100 +-- src/JitCompilerX86.cpp | 79 +- src/JitCompilerX86.hpp | 7 +- src/LightClientAsyncWorker.cpp | 27 +- src/Pcg32.hpp | 72 -- src/Program.cpp | 8 +- src/Program.hpp | 11 +- src/VirtualMachine.cpp | 68 +- src/VirtualMachine.hpp | 20 +- src/asm/program_loop_store.inc | 4 - src/asm/program_prologue_load.inc | 3 +- src/blake2/blake2-impl.h | 105 +-- src/blake2/blake2b.c | 10 +- src/blake2/blamka-round-ref.h | 2 +- src/blake2/endian.h | 99 +++ src/common.hpp | 53 +- src/dataset.cpp | 89 +-- src/dataset.hpp | 6 +- src/executeProgram-win64.asm | 4 +- src/instructionWeights.hpp | 8 +- src/instructions.hpp | 57 -- src/instructionsPortable.cpp | 179 ++--- src/intrinPortable.h | 30 +- src/main.cpp | 19 +- src/program.inc | 281 +++---- src/squareHash.h | 5 + src/t1ha/t1ha.h | 723 ----------------- src/t1ha/t1ha2.c | 329 -------- src/t1ha/t1ha_bits.h | 1226 ----------------------------- 41 files changed, 1517 insertions(+), 3621 deletions(-) delete mode 100644 src/Pcg32.hpp create mode 100644 src/blake2/endian.h delete mode 100644 src/instructions.hpp delete mode 100644 src/t1ha/t1ha.h delete mode 100644 src/t1ha/t1ha2.c delete mode 100644 src/t1ha/t1ha_bits.h diff --git a/README.md b/README.md index fc18ed8..ed5d594 100644 --- a/README.md +++ b/README.md @@ -1,111 +1,78 @@ + + + # RandomX -RandomX is an experimental proof of work (PoW) algorithm that uses random code execution. +RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs. RandomX uses random code execution (hence the name) together with several memory-hard techniques to achieve the following goals: -### Key features +* Prevent the development of a single-chip [ASIC](https://en.wikipedia.org/wiki/Application-specific_integrated_circuit) +* Minimize the efficiency advantage of specialized hardware compared to a general-purpose CPU -* Memory-hard (requires >4 GiB of memory) -* CPU-friendly (especially for x86 and ARM architectures) -* arguably ASIC-resistant -* inefficient on GPUs -* unusable for web-mining +## Design -## Virtual machine +The core of RandomX is a virtual machine (VM), which can be summarized by the following schematic: -RandomX is intended to be run efficiently on a general-purpose CPU. The virtual machine (VM) which runs RandomX code attempts to simulate a generic CPU using the following set of components: +![Imgur](https://i.imgur.com/8RYNWLk.png) -![Imgur](https://i.imgur.com/ZAfbX9m.png) +Notable parts of the RandomX VM are: -Full description: [vm.md](doc/vm.md). +* a large read-only 4 GiB dataset +* a 2 MiB scratchpad (read/write), which is structured into three levels L1, L2 and L3 +* 8 integer and 12 floating point registers +* an arithmetic logic unit (ALU) +* a floating point unit (FPU) +* a 2 KiB program buffer -## Dataset +The structure of the VM mimics the components that are found in a typical general purpose computer equipped with a CPU and a large amount of DRAM. The scratchpad is designed to fit into the CPU cache. The first 16 KiB and 256 KiB of the scratchpad are used more often take advantage of the faster L1 and L2 caches. The ratio of random reads from L1/L2/L3 is approximately 9:3:1, which matches the inverse latencies of typical CPU caches. -RandomX uses a 4 GiB read-only dataset. The dataset is constructed using a combination of the [Argon2d](https://en.wikipedia.org/wiki/Argon2) hashing function, [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption/decryption and a random permutation. The dataset is regenerated every ~34 hours. +The VM executes programs in a special instruction set, which was designed in such way that any random 8-byte word is a valid instruction and any sequence of valid instructions is a valid program. For more details see [RandomX ISA documentation](doc/isa.md). Because there are no "syntax" rules, generating a random program is as easy as filling the program buffer with random data. A RandomX program consists of 256 instructions. See [program.inc](../src/program.inc) as an example of a RandomX program translated into x86-64 assembly. -Full description: [dataset.md](doc/dataset.md). +#### Hash calculation -## Instruction set +Calculating a RandomX hash consists of initializing the 2 MiB scratchpad with random data, executing 8 RandomX loops and calculating a hash of the scratchpad. -RandomX uses a simple low-level language (instruction set), which was designed so that any random bitstring forms a valid program. Each RandomX instruction has a length of 128 bits. +Each RandomX loop is repeated 2048 times. The loop body has 4 parts: +1. The values of all registers are loaded randomly from the scratchpad (L3) +2. The RandomX program is executed +3. A random block is loaded from the dataset and mixed with integer registers +4. All register values are stored into the scratchpad (L3) -Full description: [isa.md](doc/isa.md). +Hash of the register state after 2048 interations is used to initialize the random program for the next loop. The use of 8 different programs in the course of a single hash calculation prevents mining strategies that search for "easy" programs. -## Implementation -Proof-of-concept implementation is written in C++. -``` -> bin/randomx --help -Usage: bin/randomx [OPTIONS] -Supported options: - --help shows this message - --compiled use x86-64 JIT-compiled VM (default: interpreted VM) - --lightClient use 'light-client' mode (default: full dataset mode) - --softAes use software AES (default: x86 AES-NI) - --threads T use T threads (default: 1) - --nonces N run N nonces (default: 1000) - --genAsm generate x86 asm code for nonce N -``` +The loads from the dataset are fully prefetched, so they don't slow down the loop. -Two RandomX virtual machines are implemented: +RandomX uses the [Blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) cryptographic hash function. Special hashing functions based on [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) encryption are used to initialize and hash the scratchpad. -### Interpreted VM -The interpreted VM is the reference implementation, which aims for maximum portability. +#### Hash verification -The VM has been tested for correctness on the following platforms: -* Linux: x86-64, ARMv7 (32-bit), ARMv8 (64-bit) -* Windows: x86, x86-64 -* MacOS: x86-64 +RandomX is a symmetric PoW algorithm, so the verifying party has to repeat the same steps as when a hash is calculated. -The interpreted VM supports two modes: "full dataset" mode, which requires more than 4 GiB of virtual memory, and a "light-client" mode, which requires about 64 MiB of memory, but runs significantly slower because dataset blocks are created on the fly rather than simply fetched from memory. +However, to allow hash verification on devices that cannot store the whole 4 GiB dataset, RandomX allows a time-memory tradeoff by using just 256 MiB of memory at the cost of 16 times more random memory accesses. See [Dataset initialization](doc/dataset.md) for more details. -Software AES implementation is available for CPUs which don't support [AES-NI](https://en.wikipedia.org/wiki/AES_instruction_set). +#### Documentation +* [RandomX ISA](doc/isa.md) +* [RandomX instruction listing](doc/isa-ops.md) +* [Dataset initialization](doc/dataset.md) -The following table lists the performance for Intel Core i5-3230M (Ivy Bridge) CPU using a single core on Windows 64-bit, compiled with Visual Studio 2017: +# FAQ -|mode|required memory|AES|initialization time [s]|performance [programs/s]| -|------|----|-----|-------------------------|------------------| -|light client|64 MiB|software|1.0|9.2| -|light client|64 MiB|AES-NI|1.0|16| -|full dataset|4 GiB|software|54|40| -|full dataset|4 GiB|AES-NI|26|40| +### Can RandomX run on a GPU? -### JIT-compiled VM -A JIT compiler is available for x86-64 CPUs. This implementation shows the approximate performance that can be achieved using optimized mining software. The JIT compiler generates generic x86-64 code without any architecture-specific optimizations. Only "full dataset" mode is supported. +We don't expect GPUs will ever be competitive in mining RandomX. The reference miner is CPU-only. -For optimal performance, an x86-64 CPU needs: -* 32 KiB of L1 instruction cache per thread -* 16 KiB of L1 data cache per thread -* 240 KiB of L2 cache (exclusive) per thread +RandomX was designed to be efficient on CPUs. Designing an algorithm compatible with both CPUs and GPUs brings too many limitations and ultimately decreases ASIC resistance. CPUs have the advantage of not needing proprietary drivers and most CPU architectures support a large common subset of primitive operations. -The following table lists the performance of AMD Ryzen 7 1700 (clock fixed at 3350 MHz, 1.05 Vcore, dual channel DDR4 2400 MHz) on Linux 64-bit (compiled with GCC 5.4.0). +Additionally, targeting CPUs allows for more decentralized mining for several reasons: -Power consumption was measured for the whole system using a wall socket wattmeter (±1W). Table lists difference over idle power consumption. [Prime95](https://en.wikipedia.org/wiki/Prime95#Use_for_stress_testing) (small/in-place FFT) and [Cryptonight V2](https://github.com/monero-project/monero/pull/4218) power consumption are listed for comparison. +* Every computer has a CPU and even laptops will be able to mine efficiently. +* CPU mining is easier to set up - no driver compatibility issues, BIOS flashing etc. +* CPU mining is more difficult to centralize because computers can usually have only one CPU except for expensive server parts. -||threads|initialization time [s]|performance [programs/s]|power [W] -|-|------|----|-----|-------------------------| -|RandomX (interpreted)|1|27|52|16| -|RandomX (interpreted)|8|4.0|390|63| -|RandomX (interpreted)|16|3.5|620|74| -|RandomX (compiled)|1|27|407|17| -|RandomX (compiled)|2|14|810|26| -|RandomX (compiled)|4|7.3|1620|42| -|RandomX (compiled)|6|5.1|2410|56| -|RandomX (compiled)|8|4.0|3200|71| -|RandomX (compiled)|12|4.0|3670|82| -|RandomX (compiled)|16|3.5|4110|92| -|Cryptonight v2|8|-|-|47| -|Prime95|8|-|-|77| -|Prime95|16|-|-|81| +### Does RandomX facilitate botnets/malware mining or web mining? +Quite the opposite. Efficient mining requires 4 GiB of memory, which is very difficult to hide in an infected computer and disqualifies many low-end machines. Web mining is nearly impossible due to the large memory requirement and the need for a rather lengthy initialization of the dataset. -## Proof of work +### Since RandomX uses floating point calculations, how can it give reproducible results on different platforms? -RandomX VM can be used for PoW using the following steps: - -1. Initialize the VM using a 256-bit hash of any data. -2. Execute the RandomX program. -3. Calculate `blake2b(RegisterFile || t1ha2(Scratchpad))`* - -\* [blake2b](https://en.wikipedia.org/wiki/BLAKE_%28hash_function%29#BLAKE2) is a cryptographic hash function, [t1ha2](https://github.com/leo-yuriev/t1ha) is a fast hashing function. - -The above steps can be chained multiple times to prevent mining strategies that search for programs with particular properties (for example, without division). +RandomX uses only operations that are guaranteed to give correctly rounded results by the [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) standard: addition, subtraction, multiplication, division and square root. Special care is taken to avoid corner cases such as NaN values or denormals. ## Acknowledgements The following people have contributed to the design of RandomX: @@ -114,13 +81,10 @@ The following people have contributed to the design of RandomX: RandomX uses some source code from the following 3rd party repositories: * Argon2d, Blake2b hashing functions: https://github.com/P-H-C/phc-winner-argon2 -* PCG32 random number generator: https://github.com/imneme/pcg-c-basic * Software AES implementation https://github.com/fireice-uk/xmr-stak -* t1ha2 hashing function: https://github.com/leo-yuriev/t1ha ## Donations - XMR: ``` -4B9nWtGhZfAWsTxWujPDGoWfVpJvADxkxJJTmMQp3zk98n8PdLkEKXA5g7FEUjB8JPPHdP959WDWMem3FPDTK2JUU1UbVHo -``` +845xHUh5GvfHwc2R8DVJCE7BT2sd4YEcmjG8GNSdmeNsP5DTEjXd1CNgxTcjHjiFuthRHAoVEJjM7GyKzQKLJtbd56xbh7V +``` \ No newline at end of file diff --git a/doc/dataset.md b/doc/dataset.md index b3c0ee3..bb562bd 100644 --- a/doc/dataset.md +++ b/doc/dataset.md @@ -1,13 +1,13 @@ ## Dataset -The dataset serves as the source of the first operand of all instructions and provides the memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 65536 blocks, each 64 KiB in size. +The dataset is randomly accessed 16384 times during each hash calculation, which significantly increases memory-hardness of RandomX. The size of the dataset is fixed at 4 GiB and it's divided into 67108864 block of 64 bytes. -In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 64 MiB cache, which can be used to calculate dataset blocks on the fly. To facilitate this, all random reads from the dataset are aligned to the beginning of a block. +In order to allow PoW verification with less than 4 GiB of memory, the dataset is constructed from a 256 MiB cache, which can be used to calculate dataset rows on the fly. -Because the initialization of the dataset is computationally intensive, it's recalculated on average every 1024 blocks (~34 hours). The following figure visualizes the construction of the dataset: +Because the initialization of the dataset is computationally intensive, it is recalculated only every 1024 blocks (~34 hours). The following figure visualizes the construction of the dataset: -![Imgur](https://i.imgur.com/JgLCjeq.png) +![Imgur](https://i.imgur.com/b9WHOwo.png) ### Seed block The whole dataset is constructed from a 256-bit hash of the last block whose height is divisible by 1024 **and** has at least 64 confirmations. @@ -21,7 +21,7 @@ The whole dataset is constructed from a 256-bit hash of the last block whose hei ### Cache construction -The 32-byte seed block hash is expanded into the 64 MiB cache using the "memory fill" function of Argon2d. [Argon2](https://github.com/P-H-C/phc-winner-argon2) is a memory-hard password hashing function, which is highly customizable. The variant with "d" suffix uses a data-dependent memory access pattern and provides the highest resistance against time-memory tradeoffs. +The 32-byte seed block hash is expanded into the 256 MiB cache using the "memory fill" function of Argon2d. [Argon2](https://github.com/P-H-C/phc-winner-argon2) is a memory-hard password hashing function, which is highly customizable. The variant with "d" suffix uses a data-dependent memory access pattern and provides the highest resistance against time-memory tradeoffs. Argon2 is used with the following parameters: @@ -29,8 +29,8 @@ Argon2 is used with the following parameters: |------------|--| |parallelism|1| |output size|0| -|memory|65536 (64 MiB)| -|iterations|12| +|memory|262144 (256 MiB)| +|iterations|3| |version|`0x13`| |hash type|0 (Argon2d) |password|seed block hash (32 bytes) @@ -40,43 +40,66 @@ Argon2 is used with the following parameters: The finalizer and output calculation steps of Argon2 are omitted. The output is the filled memory array. -The use of 12 iterations makes time-memory tradeoffs infeasible and thus 64 MiB is the minimum amount of memory required by RandomX. - -When the memory fill is complete, the whole memory array is cyclically shifted backwards by 512 bytes (i.e. bytes 0-511 are moved to the end of the array). This is done to misalign the array so that each 1024-byte cache block spans two subsequent Argon2 blocks. +The use of 3 iterations makes time-memory tradeoffs infeasible and thus 256 MiB is the minimum amount of memory required by RandomX. ### Dataset block generation -The full 4 GiB dataset can be generated from the 64 MiB cache. Each block is generated separately: a 1024 byte block of the cache is expanded into 64 KiB of the dataset. The algorithm has 3 steps: expansion, AES and shuffle. +The full 4 GiB dataset can be generated from the 256 MiB cache. Each 64-byte block is generated independently by XORing 16 pseudorandom Cache blocks selected by the `SquareHash` function. -#### Expansion -The 1024 cache bytes are split into 128 quadwords and interleaved with 504-byte chunks of null bytes. The resulting sequence is: 8 cache bytes + 504 null bytes + 8 cache bytes + 504 null bytes etc. Total length of the expanded block is 65536 bytes. +#### SquareHash +`SquareHash` is a custom hash function with 64-bit input and 64-bit output. It is calculated by repeatedly squaring the input, splitting the 128-bit result in to two 64-bit halves and subtracting the high half from the low half. This is repeated 42 times. It's available as a [portable C implementation](../src/squareHash.h) and [x86-64 assembly version](../src/asm/squareHash.inc). -#### AES -The 256-bit seed block hash is expanded into 10 AES round keys `k0`-`k9`. Let `i = 0...65535` be the index of the block that is being expanded. If `i` is an even number, this step uses AES *decryption* and if `i` is an odd number, it uses AES *encryption*. Since both encryption and decryption scramble random data, no distinction is made between them in the text below. +Properties of `SquareHash`: -The AES encryption is performed with 10 identical rounds using round keys `k0`-`k9`. Note that this is different from the typical AES procedure, which uses a different key schedule for decryption and a modified last round. +* It achieves full [Avalanche effect](https://en.wikipedia.org/wiki/Avalanche_effect). +* Since the whole calculation is a long dependency chain, which uses only multiplication and subtraction, the performance gains by using custom hardware are very limited. +* A single `SquareHash` calculation takes 40-80 ns, which is about the same time as DRAM access latency. Devices using low-latency memory will be bottlenecked by `SquareHash`, while CPUs will finish the hash calculation in about the same time it takes to fetch data from RAM. -Before the AES encryption is applied, each 16-byte chunk is XORed with the ciphertext of the previous chunk. This is similar to the [AES-CBC](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_Block_Chaining_%28CBC%29) mode of operation and forces the encryption to be sequential. For XORing the initial block, an initialization vector is formed by zero-extending `i` to 128 bits. +The output of 16 chained SquareHash calculations is used to determine Cache blocks that are XORed together to produce a Dataset block: -#### Shuffle -When the AES step is complete, the last 16-byte chunk of the block is used to initialize a PCG32 random number generator. Bits 0-63 are used as the initial state and bits 64-127 are used as the increment. The least-significant bit of the increment is always set to 1 to form an odd number. +```c++ +void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber) { + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; -The whole block is then divided into 16384 doublewords (4 bytes) and the [Fisher–Yates shuffle](https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle) algorithm is applied to it. The algorithm generates a random in-place permutation of the 16384 doublewords. The result of the shuffle is the `i`-th block of the dataset. + r0 = 4ULL * blockNumber; + r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; -The shuffle algorithm requires a uniform distribution of random numbers. The output of the PCG32 generator is always properly filtered to avoid the [modulo bias](https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#Modulo_bias). + constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; + + for (auto i = 0; i < DatasetIterations; ++i) { + const uint8_t* mixBlock = cache + (r0 & mask); + PREFETCHNTA(mixBlock); + r0 = squareHash(r0); + r0 ^= load64(mixBlock + 0); + r1 ^= load64(mixBlock + 8); + r2 ^= load64(mixBlock + 16); + r3 ^= load64(mixBlock + 24); + r4 ^= load64(mixBlock + 32); + r5 ^= load64(mixBlock + 40); + r6 ^= load64(mixBlock + 48); + r7 ^= load64(mixBlock + 56); + } + + store64(out + 0, r0); + store64(out + 8, r1); + store64(out + 16, r2); + store64(out + 24, r3); + store64(out + 32, r4); + store64(out + 40, r5); + store64(out + 48, r6); + store64(out + 56, r7); +} +``` + +*Note: `SquareHash` doesn't calculate squaring modulo 264+1 because the subtraction is performed modulo 264. Squaring modulo 264+1 can be calculated by adding the carry bit in every iteration (i.e. the sequence in x86-64 assembly would have to be: `mul rax; sub rax, rdx; adc rax, 0`), but this would decrease ASIC-resistance of `SquareHash`.* ### Performance -The initial 64-MiB cache construction using Argon2d takes around 1 second using an older laptop with an Intel i5-3230M CPU (Ivy Bridge). Cache generation is strictly serial and cannot be easily parallelized. +The initial 256-MiB cache construction using Argon2d takes around 1 second using an older laptop with an Intel i5-3230M CPU (Ivy Bridge). Cache generation is strictly serial and cannot be parallelized. -Dataset generation performance depends on the support of the AES-NI instruction set. The following table lists the generation runtimes using the same Ivy Bridge laptop with a single thread: +On the same laptop, full Dataset initialization takes around 100 seconds using a single thread (1.5 µs per block). -|AES|4 GiB dataset generation|single block generation| -|-----|-----------------------------|----------------| -|hardware (AES-NI)|25 s|380 µs| -|software|53 s|810 µs| +While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the Dataset generation time decreases linearly with the number of threads. Using an 8-core AMD Ryzen CPU, the whole dataset can be generated in under 10 seconds. -While the generation of a single block is strictly serial, multiple blocks can be easily generated in parallel, so the dataset generation time decreases linearly with the number of threads. Using a recent 6-core CPU with AES-NI support, the whole dataset can be generated in about 4 seconds. - -Moreover, the seed block hash is known up to 64 blocks in advance, so miners can slowly precalculate the whole dataset by generating ~512 dataset blocks per minute (corresponds to less than 1% utilization of a single CPU core). +Moreover, the seed block hash is known up to 64 blocks in advance, so miners can slowly precalculate the whole dataset by generating 524288 dataset blocks per minute (corresponds to about 1% utilization of a single CPU core). ### Light clients -Light clients, who cannot or do not want to generate and keep the whole dataset in memory, can generate just the cache and then generate blocks on the fly as the program is being executed. In this case, the program execution time will be increased by roughly 100 times the single block generation time. For the Intel Ivy Bridge laptop, this amounts to around 40 milliseconds per program. \ No newline at end of file +Light clients, who cannot or do not want to generate and keep the whole dataset in memory, can generate just the cache and then generate blocks on the fly during hash calculation. In this case, the hash calculation time will be increased by 16384 times the single block generation time. For the Intel Ivy Bridge laptop, this amounts to around 24.5 milliseconds per hash. \ No newline at end of file diff --git a/doc/isa-ops.md b/doc/isa-ops.md index 5e389e3..4a1cca5 100644 --- a/doc/isa-ops.md +++ b/doc/isa-ops.md @@ -1,130 +1,103 @@ - # RandomX instruction listing -There are 31 unique instructions divided into 3 groups: - -|group|# operations|# opcodes|| -|---------|-----------------|----|-| -|integer (IA)|22|144|56.3%| -|floating point (FP)|5|76|29.7%| -|control (CL)|4|36|14.0% -||**31**|**256**|**100%** - ## Integer instructions -There are 22 integer instructions. They are divided into 3 classes (MATH, DIV, SHIFT) with different B operand selection rules. +For integer instructions, the destination is always an integer register (register group R). Source operand (if applicable) can be either an integer register or memory value. If `dst` and `src` refer to the same register, most instructions use `imm32` as the source operand instead of the register. This is indicated in the 'src == dst' column. -|# opcodes|instruction|class|signed|A width|B width|C|C width| +Memory operands are loaded as 8-byte values from the address indicated by `src`. This indirect addressing is marked with square brackets: `[src]`. + +|frequency|instruction|dst|src|`src == dst ?`|operation| |-|-|-|-|-|-|-|-| -|12|ADD_64|MATH|no|64|64|`A + B`|64| -|2|ADD_32|MATH|no|32|32|`A + B`|32| -|12|SUB_64|MATH|no|64|64|`A - B`|64| -|2|SUB_32|MATH|no|32|32|`A - B`|32| -|21|MUL_64|MATH|no|64|64|`A * B`|64| -|10|MULH_64|MATH|no|64|64|`A * B`|64| -|15|MUL_32|MATH|no|32|32|`A * B`|64| -|15|IMUL_32|MATH|yes|32|32|`A * B`|64| -|10|IMULH_64|MATH|yes|64|64|`A * B`|64| -|4|DIV_64|DIV|no|64|32|`A / B`|64| -|4|IDIV_64|DIV|yes|64|32|`A / B`|64| -|4|AND_64|MATH|no|64|64|`A & B`|64| -|2|AND_32|MATH|no|32|32|`A & B`|32| -|4|OR_64|MATH|no|64|64|`A | B`|64| -|2|OR_32|MATH|no|32|32|`A | B`|32| -|4|XOR_64|MATH|no|64|64|`A ^ B`|64| -|2|XOR_32|MATH|no|32|32|`A ^ B`|32| -|3|SHL_64|SHIFT|no|64|6|`A << B`|64| -|3|SHR_64|SHIFT|no|64|6|`A >> B`|64| -|3|SAR_64|SHIFT|yes|64|6|`A >> B`|64| -|6|ROL_64|SHIFT|no|64|6|`A <<< B`|64| -|6|ROR_64|SHIFT|no|64|6|`A >>> B`|64| +|12/256|IADD_R|R|R|`src = imm32`|`dst = dst + src`| +|7/256|IADD_M|R|mem|`src = imm32`|`dst = dst + [src]`| +|16/256|IADD_RC|R|R|`src = dst`|`dst = dst + src + imm32`| +|12/256|ISUB_R|R|R|`src = imm32`|`dst = dst - src`| +|7/256|ISUB_M|R|mem|`src = imm32`|`dst = dst - [src]`| +|9/256|IMUL_9C|R|-|-|`dst = 9 * dst + imm32`| +|16/256|IMUL_R|R|R|`src = imm32`|`dst = dst * src`| +|4/256|IMUL_M|R|mem|`src = imm32`|`dst = dst * [src]`| +|4/256|IMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64`| +|1/256|IMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64`| +|4/256|ISMULH_R|R|R|`src = dst`|`dst = (dst * src) >> 64` (signed)| +|1/256|ISMULH_M|R|mem|`src = imm32`|`dst = (dst * [src]) >> 64` (signed)| +|4/256|IDIV_C|R|-|-|`dst = dst + dst / imm32`| +|4/256|ISDIV_C|R|-|-|`dst = dst + dst / imm32` (signed)| +|2/256|INEG_R|R|-|-|`dst = -dst`| +|16/256|IXOR_R|R|R|`src = imm32`|`dst = dst ^ src`| +|4/256|IXOR_M|R|mem|`src = imm32`|`dst = dst ^ [src]`| +|10/256|IROR_R|R|R|`src = imm32`|`dst = dst >>> src`| +|4/256|ISWAP_R|R|R|`src = dst`|`temp = src; src = dst; dst = temp`| -#### 32-bit operations -Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are set to zero. +#### IMULH and ISMULH +These instructions output the high 64 bits of the whole 128-bit multiplication result. The result differs for signed and unsigned multiplication (`IMULH` is unsigned, `ISMULH` is signed). The variants with a register source operand do not use `imm32` (they perform a squaring operation if `dst` equals `src`). -#### Multiplication -There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result. +#### IDIV_C and ISDIV_C +The division instructions use a constant divisor, so they can be optimized into a [multiplication by fixed-point reciprocal](https://en.wikipedia.org/wiki/Division_algorithm#Division_by_a_constant). `IDIV_C` performs unsigned division (`imm32` is zero-extended to 64 bits), while `ISDIV_C` performs signed division. In the case of division by zero, the instructions become a no-op. In the very rare case of signed overflow, the destination register is set to zero. -#### Division -For the division instructions, the dividend is 64 bits long and the divisor 32 bits long. The IDIV_64 instruction interprets both operands as signed integers. In case of division by zero or signed overflow, the result is equal to the dividend `A`. - -75% of division instructions use a runtime-constant divisor and can be optimized using a multiplication and shifts. - -#### Shift and rotate -The shift/rotate instructions use just the bottom 6 bits of the `B` operand (`imm8` is used as the immediate value). All treat `A` as unsigned except SAR_64, which performs an arithmetic right shift by copying the sign bit. +#### ISWAP_R +This instruction swaps the values of two registers. If source and destination refer to the same register, the result is a no-op. ## Floating point instructions -There are 5 floating point instructions. All floating point instructions are vector instructions that operate on two packed double precision floating point values. +For floating point instructions, the destination can be a group F or group E register. Source operand is either a group A register or a memory value. -|# opcodes|instruction|C| -|-|-|-| -|20|FPADD|`A + B`| -|20|FPSUB|`A - B`| -|22|FPMUL|`A * B`| -|8|FPDIV|`A / B`| -|6|FPSQRT|`sqrt(abs(A))`| +Memory operands are loaded as 8-byte values from the address indicated by `src`. The 8 byte value is interpreted as two 32-bit signed integers and implicitly converted to floating point format. The lower and upper memory operands are marked as `[src][0]` and `[src][1]`. -#### Conversion of operand A -Operand A is loaded from memory as a 64-bit value. All floating point instructions interpret A as two packed 32-bit signed integers and convert them into two packed double precision floating point values. +|frequency|instruction|dst|src|operation| +|-|-|-|-|-|-|-| +|8/256|FSWAP_R|F+E|-|`(dst0, dst1) = (dst1, dst0)`| +|20/256|FADD_R|F|A|`(dst0, dst1) = (dst0 + src0, dst1 + src1)`| +|5/256|FADD_M|F|mem|`(dst0, dst1) = (dst0 + [src][0], dst1 + [src][1])`| +|20/256|FSUB_R|F|A|`(dst0, dst1) = (dst0 - src0, dst1 - src1)`| +|5/256|FSUB_M|F|mem|`(dst0, dst1) = (dst0 - [src][0], dst1 - [src][1])`| +|6/256|FNEG_R|F|-|`(dst0, dst1) = (-dst0, -dst1)`| +|20/256|FMUL_R|E|A|`(dst0, dst1) = (dst0 * src0, dst1 * src1)`| +|4/256|FDIV_M|E|mem|`(dst0, dst1) = (dst0 / [src][0], dst1 / [src][1])`| +|6/256|FSQRT_R|E|-|`(dst0, dst1) = (√dst0, √dst1)`| + +#### Denormal and NaN values +Due to restrictions on the values of the floating point registers, no operation results in `NaN`. +`FDIV_M` can produce a denormal result. In that case, the result is set to `DBL_MIN = 2.22507385850720138309e-308`, which is the smallest positive normal number. #### Rounding -FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is *roundTiesToEven*. Rounding mode can be changed by the `FPROUND` control instruction. Denormal values must be always flushed to zero. +All floating point instructions give correctly rounded results. The rounding mode depends on the value of the `fprc` register: -#### NaN -If an operation produces NaN, the result is converted into positive zero. NaN results may never be written into registers or memory. Only division and multiplication must be checked for NaN results (`0.0 / 0.0` and `0.0 * Infinity` result in NaN). - -## Control instructions -There are 4 control instructions. - -|# opcodes|instruction|description|condition| -|-|-|-|-| -|2|FPROUND|change floating point rounding mode|- -|11|JUMP|conditional jump|(see condition table below) -|11|CALL|conditional procedure call|(see condition table below) -|12|RET|return from procedure|stack is not empty - -All control instructions behave as 'arithmetic no-op' and simply copy the input operand A into the destination C. - -The JUMP and CALL instructions use a condition function, which takes the lower 32 bits of operand B (register) and the value `imm32` and evaluates a condition based on the `B.LOC.C` flag: - -|`B.LOC.C`|signed|jump condition|probability|*x86*|*ARM* -|---|---|----------|-----|--|----| -|0|no|`B <= imm32`|0% - 100%|`JBE`|`BLS` -|1|no|`B > imm32`|0% - 100%|`JA`|`BHI` -|2|yes|`B - imm32 < 0`|50%|`JS`|`BMI` -|3|yes|`B - imm32 >= 0`|50%|`JNS`|`BPL` -|4|yes|`B - imm32` overflows|0% - 50%|`JO`|`BVS` -|5|yes|`B - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` -|6|yes|`B < imm32`|0% - 100%|`JL`|`BLT` -|7|yes|`B >= imm32`|0% - 100%|`JGE`|`BGE` - -The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. Column 'probability' lists the expected jump probability (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* - -### FPROUND -The FPROUND instruction changes the rounding mode for all subsequent FPU operations depending on a two-bit flag. The flag is calculated by rotating A `imm8` bits to the right and taking the two least-significant bits: - -``` -rounding flag = (A >>> imm8)[1:0] -``` - -|rounding flag|rounding mode| +|`fprc`|rounding mode| |-------|------------| -|00|roundTiesToEven| -|01|roundTowardNegative| -|10|roundTowardPositive| -|11|roundTowardZero| +|0|roundTiesToEven| +|1|roundTowardNegative| +|2|roundTowardPositive| +|3|roundTowardZero| The rounding modes are defined by the IEEE-754 standard. -*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 23 and 22 (reversed) of the ARM `FPSCR` register.* +## Other instructions +There are 4 special instructions that have more than one source operand or the destination operand is a memory value. -### JUMP -If the jump condition is `true`, the JUMP instruction performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). +|frequency|instruction|dst|src|operation| +|-|-|-|-|-| +|7/256|COND_R|R|R, `imm32`|`if(condition(src, imm32)) dst = dst + 1` +|1/256|COND_M|R|mem, `imm32`|`if(condition([src], imm32)) dst = dst + 1` +|1/256|CFROUND|`fprc`|R, `imm32`|`fprc = src >>> imm32` +|16/256|ISTORE|mem|R|`[dst] = src` -### CALL -If the jump condition is `true`, the CALL instruction pushes the value of `pc` (program counter) onto the stack and then performs a forward jump relative to the value of `pc`. The forward offset is equal to `16 * (imm8[6:0] + 1)` bytes (1-128 instructions forward). +#### COND -### RET -If the stack is not empty, the RET instruction pops the return address from the stack (it's the instruction following the previous CALL) and jumps to it. +These instructions conditionally increment the destination register. The condition function depends on the `mod.cond` flag and takes the lower 32 bits of the source operand and the value `imm32`. -## Reference implementation -A portable C++ implementation of all integer and floating point instructions is available in [instructionsPortable.cpp](../src/instructionsPortable.cpp). \ No newline at end of file +|`mod.cond`|signed|`condition`|probability|*x86*|*ARM* +|---|---|----------|-----|--|----| +|0|no|`src <= imm32`|0% - 100%|`JBE`|`BLS` +|1|no|`src > imm32`|0% - 100%|`JA`|`BHI` +|2|yes|`src - imm32 < 0`|50%|`JS`|`BMI` +|3|yes|`src - imm32 >= 0`|50%|`JNS`|`BPL` +|4|yes|`src - imm32` overflows|0% - 50%|`JO`|`BVS` +|5|yes|`src - imm32` doesn't overflow|50% - 100%|`JNO`|`BVC` +|6|yes|`src < imm32`|0% - 100%|`JL`|`BLT` +|7|yes|`src >= imm32`|0% - 100%|`JGE`|`BGE` + +The 'signed' column specifies if the operands are interpreted as signed or unsigned 32-bit numbers. Column 'probability' lists the expected probability the condition is true (range means that the actual value for a specific instruction depends on `imm32`). *Columns 'x86' and 'ARM' list the corresponding hardware instructions (following a `CMP` instruction).* + +#### CFROUND +This instruction sets the value of the `fprc` register to the 2 least significant bits of the source register rotated right by `imm32`. This changes the rounding mode of all subsequent floating point instructions. + +#### ISTORE +The `ISTORE` instruction stores the value of the source integer register to the memory at the address specified by the destination register. The `src` and `dst` register can be the same. diff --git a/doc/isa.md b/doc/isa.md index d46b16e..36a2634 100644 --- a/doc/isa.md +++ b/doc/isa.md @@ -1,182 +1,91 @@ -# RandomX instruction encoding -The instruction set was designed in such way that any random 16-byte word is a valid instruction and any sequence of valid instructions is a valid program. There are no syntax rules. -The encoding of each 128-bit instruction word is following: +# RandomX instruction set architecture +RandomX VM is a complex instruction set computer ([CISC](https://en.wikipedia.org/wiki/Complex_instruction_set_computer)). All data are loaded and stored in little-endian byte order. Signed integer numbers are represented using [two's complement](https://en.wikipedia.org/wiki/Two%27s_complement). Floating point numbers are represented using the [IEEE-754 double precision format](https://en.wikipedia.org/wiki/Double-precision_floating-point_format). -![Imgur](https://i.imgur.com/xi8zuAZ.png) +## Registers -## opcode -There are 256 opcodes, which are distributed between 3 groups of instructions. There are 31 distinct operations (each operation can be encoded using multiple opcodes - for example opcodes `0x00` to `0x0d` correspond to integer addition). +RandomX has 8 integer registers `r0`-`r7` (group R) and a total of 12 floating point registers split into 3 groups: `a0`-`a3` (group A), `f0`-`f3` (group F) and `e0`-`e3` (group E). Integer registers are 64 bits wide, while floating point registers are 128 bits wide and contain a pair of floating point numbers. The lower and upper half of floating point registers are not separately addressable. -**Table 1: Instruction groups** +*Table 1: Addressable register groups* -|group|# operations|# opcodes|| +|index|R|A|F|E|F+E| +|--|--|--|--|--|--| +|0|`r0`|`a0`|`f0`|`e0`|`f0`| +|1|`r1`|`a1`|`f1`|`e1`|`f1`| +|2|`r2`|`a2`|`f2`|`e2`|`f2`| +|3|`r3`|`a3`|`f3`|`e3`|`f3`| +|4|`r4`||||`e0`| +|5|`r5`||||`e1`| +|6|`r6`||||`e2`| +|7|`r7`||||`e3`| + +Besides the directly addressable registers above, there is a 2-bit `fprc` register for rounding control, which is an implicit destination register of the `CFROUND` instruction, and two architectural 32-bit registers `ma` and `mx`, which are not accessible to any instruction. + +Integer registers `r0`-`r7` can be the source or the destination operands of integer instructions or may be used as address registers for loading the source operand from the memory (scratchpad). + +Floating point registers `a0`-`a3` are read-only and may not be written to except at the moment a program is loaded into the VM. They can be the source operand of any floating point instruction. The value of these registers is restricted to the interval `[1, 4294967296)`. + +Floating point registers `f0`-`f3` are the *additive* registers, which can be the destination of floating point addition and subtraction instructions. The absolute value of these registers will not exceed `1.0e+12`. + +Floating point registers `e0`-`e3` are the *multiplicative* registers, which can be the destination of floating point multiplication, division and square root instructions. Their value is always positive. + +## Instruction encoding + +Each instruction word is 64 bits long and has the following format: + +![Imgur](https://i.imgur.com/FtkWRwe.png) + +### opcode +There are 256 opcodes, which are distributed between 35 distinct instructions. Each instruction can be encoded using multiple opcodes (the number of opcodes specifies the frequency of the instruction in a random program). + +*Table 2: Instruction groups* + +|group|# instructions|# opcodes|| |---------|-----------------|----|-| -|integer (IA)|22|144|56.3%| -|floating point (FP)|5|76|29.7%| -|control (CL)|4|36|14.0% -||**31**|**256**|**100%** +|integer |20|143|55.9%| +|floating point |11|88|34.4%| +|other |4|25|9.7%| +||**35**|**256**|**100%** Full description of all instructions: [isa-ops.md](isa-ops.md). -## A.LOC -**Table 2: `A.LOC` encoding** +### dst +Destination register. Only bits 0-1 (register groups A, F, E) or 0-2 (groups R, F+E) are used to encode a register according to Table 1. -|bits|description| +### src + +The `src` flag encodes a source operand register according to Table 1 (only bits 0-1 or 0-2 are used). + +Immediate value `imm32` is used as the source operand in cases when `dst` and `src` encode the same register. + +For register-memory instructions, the source operand determines the `address_base` value for calculating the memory address (see below). + +### mod + +The `mod` flag is encoded as: + +*Table 3: mod flag encoding* + +|`mod`|description| |----|--------| -|0-1|`A.LOC.W` flag| -|2-5|Reserved| -|6-7|`A.LOC.X` flag| +|0-1|`mod.mem` flag| +|2-4|`mod.cond` flag| +|5-7|Reserved| -The `A.LOC.W` flag determines the address width when reading operand A from the scratchpad: +The `mod.mem` flag determines the address mask when reading from or writing to memory: -**Table 3: Operand A read address width** +*Table 3: memory address mask* -|`A.LOC.W`|address width (W)| -|---------|-| -|0|15 bits (256 KiB)| -|1-3|11 bits (16 KiB)| +|`mod.mem`|`address_mask`|(scratchpad level)| +|---------|-|---| +|0|262136|(L2)| +|1-3|16376|(L1)| -If the `A.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. +Table 3 applies to all memory accesses except for cases when the source operand is an immediate value. In that case, `address_mask` is equal to 2097144 (L3). -If the `A.LOC.X` flag is zero, the instruction mixes the scratchpad read address into the `mx` register using XOR. This mixing happens before the address is truncated to W bits (see pseudocode below). +The address for reading/writing is calculated by applying bitwise AND operation to `address_base` and `address_mask`. -## A.REG -**Table 4: `A.REG` encoding** +The `mod.cond` flag is used only by the `COND` instruction to select a condition to be tested. -|bits|description| -|----|--------| -|0-2|`A.REG.R` flag| -|3-7|Reserved| - -The `A.REG.R` flag encodes "readAddressRegister", which is an integer register `r0`-`r7` to be used for scratchpad read address generation. Read address is generated as follows (pseudocode): - -```python -readAddressRegister = IntegerRegister(A.REG.R) -readAddressRegister = readAddressRegister XOR SignExtend(A.mask32) -readAddress = readAddressRegister[31:0] -# dataset is read if the ic register is divisible by 64 -IF ic mod 64 == 0: - DatasetRead(readAddress) -# optional mixing into the mx register -IF A.LOC.X == 0: - mx = mx XOR readAddress -# truncate to W bits -W = GetAddressWidth(A.LOC.W) -readAddress = readAddress[W-1:0] -``` - -Note that the value of the read address register is modified during address generation. - -## B.LOC -**Table 5: `B.LOC` encoding** - -|bits|description| -|----|--------| -|0-1|`B.LOC.L` flag| -|0-2|`B.LOC.C` flag| -|3-7|Reserved| - -The `B.LOC.L` flag determines the B operand. It can be either a register or immediate value. - -**Table 6: Operand B** - -|`B.LOC.L`|IA/DIV|IA/SHIFT|IA/MATH|FP|CL| -|----|--------|----|------|----|---| -|0|register|`imm8`|`imm32`|register|register| -|1|`imm32`|register|register|register|register| -|2|`imm32`|`imm8`|register|register|register| -|3|`imm32`|register|register|register|register| - -Integer instructions are split into 3 classes: integer division (IA/DIV), shift and rotate (IA/SHIFT) and other (IA/MATH). Floating point (FP) and control (CL) instructions always use a register operand. - -Register to be used as operand B is encoded in the `B.REG.R` flag (see below). - -The `B.LOC.C` flag determines the condition for the JUMP and CALL instructions. The flag partially overlaps with the `B.LOC.L` flag. - -## B.REG -**Table 7: `B.REG` encoding** - -|bits|description| -|----|--------| -|0-2|`B.REG.R` flag| -|3-7|Reserved| - -Register encoded by the `B.REG.R` depends on the instruction group: - -**Table 8: Register operands by group** - -|group|registers| -|----|--------| -|IA|`r0`-`r7`| -|FP|`f0`-`f7`| -|CL|`r0`-`r7`| - -## C.LOC -**Table 9: `C.LOC` encoding** - -|bits|description| -|----|--------| -|0-1|`C.LOC.W` flag| -|2|`C.LOC.R` flag| -|3-6|Reserved| -|7|`C.LOC.H` flag| - -The `C.LOC.W` flag determines the address width when writing operand C to the scratchpad: - -**Table 10: Operand C write address width** - -|`C.LOC.W`|address width (W)| -|---------|-| -|0|15 bits (256 KiB)| -|1-3|11 bits (16 KiB)| - -If the `C.LOC.W` flag is zero, the address space covers the whole 256 KiB scratchpad. Otherwise, just the first 16 KiB of the scratchpad are addressed. - -The `C.LOC.R` determines the destination where operand C is written: - -**Table 11: Operand C destination** - -|`C.LOC.R`|groups IA, CL|group FP -|---------|-|-| -|0|scratchpad|register -|1|register|register + scratchpad - -Integer and control instructions (groups IA and CL) write either to the scratchpad or to a register. Floating point instructions always write to a register and can also write to the scratchpad. In that case, flag `C.LOC.H` determines if the low or high half of the register is written: - -**Table 12: Floating point register write** - -|`C.LOC.H`|write bits| -|---------|----------| -|0|0-63| -|1|64-127| - -## C.REG -**Table 13: `C.REG` encoding** - -|bits|description| -|----|--------| -|0-2|`C.REG.R` flag| -|3-7|Reserved| - -The destination register encoded in the `C.REG.R` flag encodes both the write address register (if writing to the scratchpad) and the destination register (if writing to a register). The destination register depends on the instruction group (see Table 8). Write address is always generated from an integer register: - -```python -writeAddressRegister = IntegerRegister(C.REG.R) -writeAddress = writeAddressRegister[31:0] XOR C.mask32 -# truncate to W bits -W = GetAddressWidth(C.LOC.W) -writeAddress = writeAddress [W-1:0] -``` - -## imm8 -`imm8` is an 8-bit immediate value that is used as the B operand by IA/SHIFT instructions (see Table 6). Additionally, it's used by some control instructions. - -## A.mask32 -`A.mask32` is a 32-bit address mask that is used to calculate the read address for the A operand. It's sign-extended to 64 bits before use. - -## imm32 -`imm32` is a 32-bit immediate value which is used for integer instructions from groups IA/DIV and IA/OTHER (see Table 6). The immediate value is sign-extended for instructions that expect 64-bit operands. - -## C.mask32 -`C.mask32` is a 32-bit address mask that is used to calculate the write address for the C operand. `C.mask32` is equal to `imm32`. +### imm32 +A 32-bit immediate value that can be used as the source operand. The immediate value is sign-extended to 64 bits in most cases. diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index e2eaf44..a2d1b32 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -19,12 +19,12 @@ along with RandomX. If not, see. //#define TRACE #define MAGIC_DIVISION #include "AssemblyGeneratorX86.hpp" -#include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" #ifdef MAGIC_DIVISION #include "divideByConstantCodegen.h" #endif +#include "Program.hpp" namespace RandomX { @@ -48,17 +48,10 @@ namespace RandomX { static const char* regDatasetAddr = "rdi"; static const char* regScratchpadAddr = "rsi"; - void AssemblyGeneratorX86::generateProgram(const void* seed) { + void AssemblyGeneratorX86::generateProgram(Program& prog) { asmCode.str(std::string()); //clear - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(RegisterFile) / sizeof(Pcg32::result_type); ++i) { - gen(); - } - Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { - for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { - *(((uint32_t*)&instr) + j) = gen(); - } + Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; generateCode(instr, i); @@ -83,7 +76,7 @@ namespace RandomX { } int32_t AssemblyGeneratorX86::genAddressImm(Instruction& instr) { - return instr.imm32 & ScratchpadL3Mask; + return (int32_t)instr.imm32 & ScratchpadL3Mask; } //1 uOP @@ -92,7 +85,7 @@ namespace RandomX { asmCode << "\tadd " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\tadd " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\tadd " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -109,7 +102,7 @@ namespace RandomX { //1 uOP void AssemblyGeneratorX86::h_IADD_RC(Instruction& instr, int i) { - asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << std::showpos << (int32_t)instr.imm32 << std::noshowpos << "]" << std::endl; } //1 uOP @@ -118,7 +111,7 @@ namespace RandomX { asmCode << "\tsub " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\tsub " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\tsub " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -135,7 +128,7 @@ namespace RandomX { //1 uOP void AssemblyGeneratorX86::h_IMUL_9C(Instruction& instr, int i) { - asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << instr.imm32 << std::noshowpos << "]" << std::endl; + asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.dst] << "*8" << std::showpos << (int32_t)instr.imm32 << std::noshowpos << "]" << std::endl; } //1 uOP @@ -144,7 +137,7 @@ namespace RandomX { asmCode << "\timul " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\timul " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\timul " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -161,16 +154,9 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_IMULH_R(Instruction& instr, int i) { - if (instr.src != instr.dst) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - asmCode << "\tmul " << regR[instr.src] << std::endl; - asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; - } - else { - asmCode << "\tmov eax, " << instr.imm32 << std::endl; - asmCode << "\tmul " << regR[instr.dst] << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - } + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\tmul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; } //5.75 uOPs @@ -189,16 +175,9 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_ISMULH_R(Instruction& instr, int i) { - if (instr.src != instr.dst) { - asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; - asmCode << "\timul " << regR[instr.src] << std::endl; - asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; - } - else { - asmCode << "\tmov rax, " << instr.imm32 << std::endl; - asmCode << "\timul " << regR[instr.dst] << std::endl; - asmCode << "\tadd " << regR[instr.dst] << ", rdx" << std::endl; - } + asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; + asmCode << "\timul " << regR[instr.src] << std::endl; + asmCode << "\tmov " << regR[instr.dst] << ", rdx" << std::endl; } //5.75 uOPs @@ -226,7 +205,7 @@ namespace RandomX { asmCode << "\txor " << regR[instr.dst] << ", " << regR[instr.src] << std::endl; } else { - asmCode << "\txor " << regR[instr.dst] << ", " << instr.imm32 << std::endl; + asmCode << "\txor " << regR[instr.dst] << ", " << (int32_t)instr.imm32 << std::endl; } } @@ -300,7 +279,7 @@ namespace RandomX { //~8.5 uOPs void AssemblyGeneratorX86::h_ISDIV_C(Instruction& instr, int i) { - int64_t divisor = instr.imm32; + int64_t divisor = (int32_t)instr.imm32; if ((divisor & -divisor) == divisor || (divisor & -divisor) == -divisor) { asmCode << "\tmov rax, " << regR[instr.dst] << std::endl; // +/- power of two @@ -395,9 +374,9 @@ namespace RandomX { } //1 uOP - void AssemblyGeneratorX86::h_CFSUM_R(Instruction& instr, int i) { + void AssemblyGeneratorX86::h_FNEG_R(Instruction& instr, int i) { instr.dst %= 4; - asmCode << "\t" << fsumInstr[instr.mod % 4] << " " << signMask << ", " << regF[instr.dst] << std::endl; + asmCode << "\txorps " << regF[instr.dst] << ", " << signMask << std::endl; } //1 uOPs @@ -478,7 +457,7 @@ namespace RandomX { //4 uOPs void AssemblyGeneratorX86::h_COND_R(Instruction& instr, int i) { asmCode << "\txor ecx, ecx" << std::endl; - asmCode << "\tcmp " << regR32[instr.src] << ", " << instr.imm32 << std::endl; + asmCode << "\tcmp " << regR32[instr.src] << ", " << (int32_t)instr.imm32 << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } @@ -487,7 +466,7 @@ namespace RandomX { void AssemblyGeneratorX86::h_COND_M(Instruction& instr, int i) { asmCode << "\txor ecx, ecx" << std::endl; genAddressReg(instr); - asmCode << "\tcmp dword ptr [rsi+rax], " << instr.imm32 << std::endl; + asmCode << "\tcmp dword ptr [rsi+rax], " << (int32_t)instr.imm32 << std::endl; asmCode << "\tset" << condition(instr) << " cl" << std::endl; asmCode << "\tadd " << regR[instr.dst] << ", rcx" << std::endl; } @@ -542,7 +521,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/AssemblyGeneratorX86.hpp b/src/AssemblyGeneratorX86.hpp index 5abf707..0c1844e 100644 --- a/src/AssemblyGeneratorX86.hpp +++ b/src/AssemblyGeneratorX86.hpp @@ -24,13 +24,14 @@ along with RandomX. If not, see. namespace RandomX { + class Program; class AssemblyGeneratorX86; typedef void(AssemblyGeneratorX86::*InstructionGenerator)(Instruction&, int); class AssemblyGeneratorX86 { public: - void generateProgram(const void* seed); + void generateProgram(Program&); void printCode(std::ostream& os) { os << asmCode.rdbuf(); } @@ -69,7 +70,7 @@ namespace RandomX { void h_FADD_M(Instruction&, int); void h_FSUB_R(Instruction&, int); void h_FSUB_M(Instruction&, int); - void h_CFSUM_R(Instruction&, int); + void h_FNEG_R(Instruction&, int); void h_FMUL_R(Instruction&, int); void h_FMUL_M(Instruction&, int); void h_FDIV_R(Instruction&, int); diff --git a/src/Cache.cpp b/src/Cache.cpp index bb1758f..85d481e 100644 --- a/src/Cache.cpp +++ b/src/Cache.cpp @@ -23,7 +23,6 @@ along with RandomX. If not, see. #include "Cache.hpp" #include "softAes.h" #include "argon2.h" -#include "Pcg32.hpp" #include "argon2_core.h" namespace RandomX { diff --git a/src/CompiledVirtualMachine.cpp b/src/CompiledVirtualMachine.cpp index 3bf3371..b3b5db8 100644 --- a/src/CompiledVirtualMachine.cpp +++ b/src/CompiledVirtualMachine.cpp @@ -18,19 +18,12 @@ along with RandomX. If not, see. */ #include "CompiledVirtualMachine.hpp" -#include "Pcg32.hpp" #include "common.hpp" #include "instructions.hpp" #include namespace RandomX { - constexpr int mantissaSize = 52; - constexpr int exponentSize = 11; - constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; - constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; - constexpr int exponentBias = 1023; - CompiledVirtualMachine::CompiledVirtualMachine() { totalSize = 0; } @@ -39,40 +32,9 @@ namespace RandomX { mem.ds = ds; } - void CompiledVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { - memcpy(scratchpad, mem.ds.dataset + ScratchpadSize * index, ScratchpadSize); - } - - static uint64_t getSmallPositiveFloatBits(uint64_t entropy) { - auto exponent = entropy >> 59; //0..31 - auto mantissa = entropy & mantissaMask; - exponent += exponentBias; - exponent &= exponentMask; - exponent <<= mantissaSize; - return exponent | mantissa; - } - - void CompiledVirtualMachine::initializeProgram(const void* seed) { - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)®) + i) = gen(); - } - initFpu(); - /*for (int i = 0; i < RegistersCount / 2; ++i) { - reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; - reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; - } - for (int i = 0; i < RegistersCount / 2; ++i) { - reg.g[i].lo.f64 = std::abs((double)reg.g[i].lo.i64); - reg.g[i].hi.f64 = std::abs((double)reg.g[i].hi.i64); - }*/ - for (int i = 0; i < RegistersCount / 2; ++i) { - reg.a[i].lo.u64 = getSmallPositiveFloatBits(reg.f[i].lo.u64); - reg.a[i].hi.u64 = getSmallPositiveFloatBits(reg.f[i].hi.u64); - } - compiler.generateProgram(gen); - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & -64; - mem.mx = *(((uint32_t*)seed) + 5); + void CompiledVirtualMachine::initialize() { + VirtualMachine::initialize(); + compiler.generateProgram(program); } void CompiledVirtualMachine::execute() { diff --git a/src/CompiledVirtualMachine.hpp b/src/CompiledVirtualMachine.hpp index f969732..e3b6bf0 100644 --- a/src/CompiledVirtualMachine.hpp +++ b/src/CompiledVirtualMachine.hpp @@ -39,8 +39,7 @@ namespace RandomX { } CompiledVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; - void initializeProgram(const void* seed) override; + void initialize() override; virtual void execute() override; void* getProgram() { return compiler.getCode(); diff --git a/src/Instruction.cpp b/src/Instruction.cpp index 5784c99..35cc737 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -116,12 +116,7 @@ namespace RandomX { } void Instruction::h_IMULH_R(std::ostream& os) const { - if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; - } - else { - os << "r" << (int)dst << ", " << imm32 << std::endl; - } + os << "r" << (int)dst << ", r" << (int)src << std::endl; } void Instruction::h_IMULH_M(std::ostream& os) const { @@ -138,12 +133,7 @@ namespace RandomX { } void Instruction::h_ISMULH_R(std::ostream& os) const { - if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << std::endl; - } - else { - os << "r" << (int)dst << ", " << imm32 << std::endl; - } + os << "r" << (int)dst << ", r" << (int)src << std::endl; } void Instruction::h_ISMULH_M(std::ostream& os) const { @@ -247,9 +237,9 @@ namespace RandomX { os << std::endl; } - void Instruction::h_CFSUM_R(std::ostream& os) const { + void Instruction::h_FNEG_R(std::ostream& os) const { auto dstIndex = dst % 4; - os << "f" << dstIndex << ", " << (1 << ((mod % 4) + 3)) << std::endl; + os << "f" << dstIndex << std::endl; } void Instruction::h_FMUL_R(std::ostream& os) const { @@ -370,7 +360,7 @@ namespace RandomX { INST_NAME(FADD_M) INST_NAME(FSUB_R) INST_NAME(FSUB_M) - INST_NAME(CFSUM_R) + INST_NAME(FNEG_R) //Floating point group E INST_NAME(FMUL_R) @@ -421,7 +411,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) //Floating point group E INST_HANDLE(FMUL_R) diff --git a/src/Instruction.hpp b/src/Instruction.hpp index 4f9e178..5cfd833 100644 --- a/src/Instruction.hpp +++ b/src/Instruction.hpp @@ -54,7 +54,7 @@ namespace RandomX { constexpr int FADD_M = 22; constexpr int FSUB_R = 23; constexpr int FSUB_M = 24; - constexpr int CFSUM_R = 25; + constexpr int FNEG_R = 25; constexpr int FMUL_R = 26; constexpr int FMUL_M = 27; constexpr int FDIV_R = 28; @@ -74,7 +74,7 @@ namespace RandomX { uint8_t dst; uint8_t src; uint8_t mod; - int32_t imm32; + uint32_t imm32; const char* getName() const { return names[opcode]; } @@ -116,7 +116,7 @@ namespace RandomX { void h_FADD_M(std::ostream&) const; void h_FSUB_R(std::ostream&) const; void h_FSUB_M(std::ostream&) const; - void h_CFSUM_R(std::ostream&) const; + void h_FNEG_R(std::ostream&) const; void h_FMUL_R(std::ostream&) const; void h_FMUL_M(std::ostream&) const; void h_FDIV_R(std::ostream&) const; diff --git a/src/InterpretedVirtualMachine.cpp b/src/InterpretedVirtualMachine.cpp index af01183..9e0d5e2 100644 --- a/src/InterpretedVirtualMachine.cpp +++ b/src/InterpretedVirtualMachine.cpp @@ -19,7 +19,6 @@ along with RandomX. If not, see. //#define TRACE //#define FPUCHECK #include "InterpretedVirtualMachine.hpp" -#include "Pcg32.hpp" #include "instructions.hpp" #include "dataset.hpp" #include "Cache.hpp" @@ -34,6 +33,7 @@ along with RandomX. If not, see. #ifdef STATS #include #endif +#include "divideByConstantCodegen.h" #ifdef FPUCHECK constexpr bool fpuCheck = true; @@ -61,88 +61,683 @@ namespace RandomX { } else { mem.ds = ds; - if (softAes) { - readDataset = &datasetReadLight; - } - else { - readDataset = &datasetReadLight; - } + readDataset = &datasetReadLight; } } - void InterpretedVirtualMachine::initializeScratchpad(uint8_t* scratchpad, int32_t index) { - uint32_t startingBlock = (ScratchpadSize / CacheLineSize) * index; - if (asyncWorker) { - ILightClientAsyncWorker* worker = mem.ds.asyncWorker; - const uint32_t blocksPerThread = (ScratchpadSize / CacheLineSize) / 2; - worker->prepareBlocks(scratchpad, startingBlock, blocksPerThread); //async first half - worker->getBlocks(scratchpad + ScratchpadLength / 2, startingBlock + blocksPerThread, blocksPerThread); //sync second half - worker->sync(); - } - else { - auto cache = mem.ds.cache; - if (softAes) { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } - else { - for (int i = 0; i < ScratchpadSize / CacheLineSize; ++i) { - initBlock(cache->getCache(), ((uint8_t*)scratchpad) + CacheLineSize * i, (ScratchpadSize / CacheLineSize) * index + i, cache->getKeys()); - } - } + void InterpretedVirtualMachine::initialize() { + VirtualMachine::initialize(); + for (unsigned i = 0; i < ProgramLength; ++i) { + program(i).src %= RegistersCount; + program(i).dst %= RegistersCount; } } - void InterpretedVirtualMachine::initializeProgram(const void* seed) { - Pcg32 gen(seed); - for (unsigned i = 0; i < sizeof(reg) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)®) + i) = gen(); + template + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + executeBytecode(N, r, f, e, a); + executeBytecode(r, f, e, a); + } + + template<> + void InterpretedVirtualMachine::executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + } + + FORCE_INLINE void InterpretedVirtualMachine::executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + auto& ibc = byteCode[i]; + switch (ibc.type) + { + case InstructionType::IADD_R: { + *ibc.idst += *ibc.isrc; + } break; + + case InstructionType::IADD_M: { + *ibc.idst += load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IADD_RC: { + *ibc.idst += *ibc.isrc + ibc.imm; + } break; + + case InstructionType::ISUB_R: { + *ibc.idst -= *ibc.isrc; + } break; + + case InstructionType::ISUB_M: { + *ibc.idst -= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IMUL_9C: { + *ibc.idst += 9 * *ibc.idst + ibc.imm; + } break; + + case InstructionType::IMUL_R: { + *ibc.idst *= *ibc.isrc; + } break; + + case InstructionType::IMUL_M: { + *ibc.idst *= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IMULH_R: { + *ibc.idst = mulh(*ibc.idst, *ibc.isrc); + } break; + + case InstructionType::IMULH_M: { + *ibc.idst = mulh(*ibc.idst, load64(scratchpad + (*ibc.isrc & ibc.memMask))); + } break; + + case InstructionType::ISMULH_R: { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(*ibc.isrc)); + } break; + + case InstructionType::ISMULH_M: { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(scratchpad + (*ibc.isrc & ibc.memMask)))); + } break; + + case InstructionType::IDIV_C: { + if (ibc.signedMultiplier != 0) { + int_reg_t dividend = *ibc.idst; + int_reg_t quotient = dividend >> ibc.preShift; + if (ibc.increment) { + quotient = quotient == UINT64_MAX ? UINT64_MAX : quotient + 1; + } + quotient = mulh(quotient, ibc.signedMultiplier); + quotient >>= ibc.postShift; + *ibc.idst += quotient; + } + else { + *ibc.idst += *ibc.idst >> ibc.shift; + } + } break; + + case InstructionType::ISDIV_C: { + + } break; + + case InstructionType::INEG_R: { + *ibc.idst = ~(*ibc.idst) + 1; //two's complement negative + } break; + + case InstructionType::IXOR_R: { + *ibc.idst ^= *ibc.isrc; + } break; + + case InstructionType::IXOR_M: { + *ibc.idst ^= load64(scratchpad + (*ibc.isrc & ibc.memMask)); + } break; + + case InstructionType::IROR_R: { + *ibc.idst = rotr(*ibc.idst, *ibc.isrc & 63); + } break; + + case InstructionType::IROL_R: { + *ibc.idst = rotl(*ibc.idst, *ibc.isrc & 63); + } break; + + case InstructionType::ISWAP_R: { + int_reg_t temp = *ibc.isrc; + *ibc.isrc = *ibc.idst; + *ibc.idst = temp; + } break; + + case InstructionType::FSWAP_R: { + *ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1); + } break; + + case InstructionType::FADD_R: { + *ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FADD_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + *ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc); + } break; + + case InstructionType::FSUB_R: { + *ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FSUB_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + *ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc); + } break; + + case InstructionType::FNEG_R: { + const __m128d signMask = _mm_castsi128_pd(_mm_set1_epi64x(1ULL << 63)); + *ibc.fdst = _mm_xor_pd(*ibc.fdst, signMask); + } break; + + case InstructionType::FMUL_R: { + *ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc); + } break; + + case InstructionType::FDIV_M: { + __m128d fsrc = load_cvt_i32x2(scratchpad + (*ibc.isrc & ibc.memMask)); + __m128d fdst = _mm_div_pd(*ibc.fdst, fsrc); + *ibc.fdst = _mm_max_pd(fdst, _mm_set_pd(DBL_MIN, DBL_MIN)); + } break; + + case InstructionType::FSQRT_R: { + *ibc.fdst = _mm_sqrt_pd(*ibc.fdst); + } break; + + case InstructionType::COND_R: { + *ibc.idst += condition(*ibc.isrc, ibc.imm, ibc.condition) ? 1 : 0; + } break; + + case InstructionType::COND_M: { + *ibc.idst += condition(load64(scratchpad + (*ibc.isrc & ibc.memMask)), ibc.imm, ibc.condition) ? 1 : 0; + } break; + + case InstructionType::CFROUND: { + setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4); + } break; + + case InstructionType::ISTORE: { + store64(scratchpad + (*ibc.idst & ibc.memMask), *ibc.isrc); + } break; + + case InstructionType::NOP: { + //nothing + } break; + + default: + UNREACHABLE; } - initFpu(); - for (int i = 0; i < RegistersCount; ++i) { - reg.f[i].lo.f64 = (double)reg.f[i].lo.i64; - reg.f[i].hi.f64 = (double)reg.f[i].hi.i64; - } - //std::cout << reg; - p.initialize(gen); - currentTransform = addressTransformations[gen.getUniform(0, TransformationCount - 1)]; - mem.ma = (gen() ^ *(((uint32_t*)seed) + 4)) & ~7; - mem.mx = *(((uint32_t*)seed) + 5); - pc = 0; - ic = InstructionCount; - stack.clear(); } void InterpretedVirtualMachine::execute() { - for(int i = 0; i < InstructionCount; ++i) { - for (int j = 0; j < ProgramLength; ++j) { - auto& ibc = byteCode[j]; - switch (ibc.type) - { - case InstructionType::CFROUND: { - uint64_t rcFlag = rotr(ibc.isrc->u64, ibc.imm.i32); - setRoundMode(rcFlag); - } - break; - } + int_reg_t r[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + __m128d f[4]; + __m128d e[4]; + __m128d a[4]; + + a[0] = _mm_load_pd(®.a[0].lo); + a[1] = _mm_load_pd(®.a[1].lo); + a[2] = _mm_load_pd(®.a[2].lo); + a[3] = _mm_load_pd(®.a[3].lo); + + precompileProgram(r, f, e, a); + + uint32_t spAddr0 = mem.mx; + uint32_t spAddr1 = mem.ma; + + for(int iter = 0; iter < InstructionCount; ++iter) { + //std::cout << "Iteration " << iter << std::endl; + spAddr0 ^= r[readReg0]; + spAddr0 &= ScratchpadL3Mask64; + + r[0] ^= load64(scratchpad + spAddr0 + 0); + r[1] ^= load64(scratchpad + spAddr0 + 8); + r[2] ^= load64(scratchpad + spAddr0 + 16); + r[3] ^= load64(scratchpad + spAddr0 + 24); + r[4] ^= load64(scratchpad + spAddr0 + 32); + r[5] ^= load64(scratchpad + spAddr0 + 40); + r[6] ^= load64(scratchpad + spAddr0 + 48); + r[7] ^= load64(scratchpad + spAddr0 + 56); + + spAddr1 ^= r[readReg1]; + spAddr1 &= ScratchpadL3Mask64; + + f[0] = load_cvt_i32x2(scratchpad + spAddr1 + 0); + f[1] = load_cvt_i32x2(scratchpad + spAddr1 + 8); + f[2] = load_cvt_i32x2(scratchpad + spAddr1 + 16); + f[3] = load_cvt_i32x2(scratchpad + spAddr1 + 24); + e[0] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 32)); + e[1] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 40)); + e[2] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 48)); + e[3] = _mm_abs(load_cvt_i32x2(scratchpad + spAddr1 + 56)); + + executeBytecode<0>(r, f, e, a); + + if (asyncWorker) { + ILightClientAsyncWorker* aw = mem.ds.asyncWorker; + const uint64_t* datasetLine = aw->getBlock(mem.ma); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + mem.mx ^= r[readReg2] ^ r[readReg3]; + mem.mx &= CacheLineAlignMask; //align to cache line + std::swap(mem.mx, mem.ma); + aw->prepareBlock(mem.ma); } + else { + mem.mx ^= r[readReg2] ^ r[readReg3]; + mem.mx &= CacheLineAlignMask; + Cache* cache = mem.ds.cache; + uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; + initBlock(cache->getCache(), (uint8_t*)datasetLine, mem.ma / CacheLineSize, cache->getKeys()); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + std::swap(mem.mx, mem.ma); + } + + store64(scratchpad + spAddr1 + 0, r[0]); + store64(scratchpad + spAddr1 + 8, r[1]); + store64(scratchpad + spAddr1 + 16, r[2]); + store64(scratchpad + spAddr1 + 24, r[3]); + store64(scratchpad + spAddr1 + 32, r[4]); + store64(scratchpad + spAddr1 + 40, r[5]); + store64(scratchpad + spAddr1 + 48, r[6]); + store64(scratchpad + spAddr1 + 56, r[7]); + + _mm_store_pd((double*)(scratchpad + spAddr0 + 0), _mm_mul_pd(f[0], e[0])); + _mm_store_pd((double*)(scratchpad + spAddr0 + 16), _mm_mul_pd(f[1], e[1])); + _mm_store_pd((double*)(scratchpad + spAddr0 + 32), _mm_mul_pd(f[2], e[2])); + _mm_store_pd((double*)(scratchpad + spAddr0 + 48), _mm_mul_pd(f[3], e[3])); + + spAddr0 = 0; + spAddr1 = 0; } + store64(®.r[0], r[0]); + store64(®.r[1], r[1]); + store64(®.r[2], r[2]); + store64(®.r[3], r[3]); + store64(®.r[4], r[4]); + store64(®.r[5], r[5]); + store64(®.r[6], r[6]); + store64(®.r[7], r[7]); + + _mm_store_pd(®.f[0].lo, f[0]); + _mm_store_pd(®.f[1].lo, f[1]); + _mm_store_pd(®.f[2].lo, f[2]); + _mm_store_pd(®.f[3].lo, f[3]); + _mm_store_pd(®.e[0].lo, e[0]); + _mm_store_pd(®.e[1].lo, e[1]); + _mm_store_pd(®.e[2].lo, e[2]); + _mm_store_pd(®.e[3].lo, e[3]); } #include "instructionWeights.hpp" - void InterpretedVirtualMachine::executeInstruction(Instruction& instr) { - switch (instr.opcode) - { - CASE_REP(IADD_R) + void InterpretedVirtualMachine::precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]) { + for (unsigned i = 0; i < ProgramLength; ++i) { + auto& instr = program(i); + auto& ibc = byteCode[i]; + switch (instr.opcode) { + CASE_REP(IADD_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; - break; + CASE_REP(IADD_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IADD_RC) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_RC; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.imm = signExtend2sCompl(instr.imm32); + } break; + + CASE_REP(ISUB_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(ISUB_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IMUL_9C) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IMUL_9C; + ibc.idst = &r[dst]; + ibc.imm = signExtend2sCompl(instr.imm32); + } break; + + CASE_REP(IMUL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IMUL_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(IMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(ISMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(ISMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IDIV_C) { + uint32_t divisor = instr.imm32; + if (divisor != 0) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IDIV_C; + ibc.idst = &r[dst]; + if (divisor & (divisor - 1)) { + magicu_info mi = compute_unsigned_magic_info(divisor, sizeof(uint64_t) * 8); + ibc.signedMultiplier = mi.multiplier; + ibc.preShift = mi.pre_shift; + ibc.postShift = mi.post_shift; + ibc.increment = mi.increment; + } + else { + ibc.signedMultiplier = 0; + int shift = 0; + while (divisor >>= 1) + ++shift; + ibc.shift = shift; + } + } + else { + ibc.type = InstructionType::NOP; + } + } break; + + CASE_REP(ISDIV_C) { + ibc.type = InstructionType::NOP; + } break; + + CASE_REP(INEG_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::INEG_R; + ibc.idst = &r[dst]; + } break; + + CASE_REP(IXOR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.imm32); + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IXOR_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_M; + ibc.idst = &r[dst]; + if (instr.src != instr.dst) { + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + ibc.memMask = ScratchpadL3Mask; + } + } break; + + CASE_REP(IROR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROR_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(IROL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROL_R; + ibc.idst = &r[dst]; + if (src != dst) { + ibc.isrc = &r[src]; + } + else { + ibc.imm = instr.imm32; + ibc.isrc = &ibc.imm; + } + } break; + + CASE_REP(ISWAP_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + if (src != dst) { + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.type = InstructionType::ISWAP_R; + } + else { + ibc.type = InstructionType::NOP; + } + } break; + + CASE_REP(FSWAP_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::FSWAP_R; + ibc.fdst = &f[dst]; + } break; + + CASE_REP(FADD_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FADD_R; + ibc.fdst = &f[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FADD_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FADD_M; + ibc.fdst = &f[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FSUB_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FSUB_R; + ibc.fdst = &f[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FSUB_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FSUB_M; + ibc.fdst = &f[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FNEG_R) { + auto dst = instr.dst % 4; + ibc.fdst = &f[dst]; + ibc.type = InstructionType::FNEG_R; + } break; + + CASE_REP(FMUL_R) { + auto dst = instr.dst % 4; + auto src = instr.src % 4; + ibc.type = InstructionType::FMUL_R; + ibc.fdst = &e[dst]; + ibc.fsrc = &a[src]; + } break; + + CASE_REP(FMUL_M) { + } break; + + CASE_REP(FDIV_R) { + } break; + + CASE_REP(FDIV_M) { + auto dst = instr.dst % 4; + auto src = instr.src % 8; + ibc.type = InstructionType::FDIV_M; + ibc.fdst = &e[dst]; + ibc.isrc = &r[src]; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(FSQRT_R) { + auto dst = instr.dst % 4; + ibc.type = InstructionType::FSQRT_R; + ibc.fdst = &e[dst]; + } break; + + CASE_REP(COND_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::COND_R; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.condition = (instr.mod >> 2) & 7; + ibc.imm = instr.imm32; + } break; + + CASE_REP(COND_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::COND_M; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + ibc.condition = (instr.mod >> 2) & 7; + ibc.imm = instr.imm32; + ibc.memMask = ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask); + } break; + + CASE_REP(CFROUND) { + auto src = instr.src % 8; + ibc.isrc = &r[src]; + ibc.type = InstructionType::CFROUND; + ibc.imm = instr.imm32 & 63; + } break; + + CASE_REP(ISTORE) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISTORE; + ibc.idst = &r[dst]; + ibc.isrc = &r[src]; + } break; + + CASE_REP(FSTORE) { + } break; + + CASE_REP(NOP) { + ibc.type = InstructionType::NOP; + } break; + + default: + UNREACHABLE; + } } } - - InstructionHandler InterpretedVirtualMachine::engine[256] = { - - }; } \ No newline at end of file diff --git a/src/InterpretedVirtualMachine.hpp b/src/InterpretedVirtualMachine.hpp index f698d97..4db4ae4 100644 --- a/src/InterpretedVirtualMachine.hpp +++ b/src/InterpretedVirtualMachine.hpp @@ -21,7 +21,7 @@ along with RandomX. If not, see. //#define STATS #include "VirtualMachine.hpp" #include "Program.hpp" -#include +#include "intrinPortable.h" namespace RandomX { @@ -38,15 +38,23 @@ namespace RandomX { typedef void(InterpretedVirtualMachine::*InstructionHandler)(Instruction&); - struct alignas(64) InstructionByteCode { - convertible_t* idst; - convertible_t* isrc; - convertible_t imm; - fpu_reg_t* fdst; - fpu_reg_t* fsrc; + struct alignas(16) InstructionByteCode { + int_reg_t* idst; + int_reg_t* isrc; + int_reg_t imm; + __m128d* fdst; + __m128d* fsrc; uint32_t condition; uint32_t memMask; uint32_t type; + union { + uint64_t unsignedMultiplier; + int64_t signedMultiplier; + }; + unsigned shift; + unsigned preShift; + unsigned postShift; + bool increment; }; constexpr int asedwfagdewsa = sizeof(InstructionByteCode); @@ -56,21 +64,14 @@ namespace RandomX { InterpretedVirtualMachine(bool soft, bool async) : softAes(soft), asyncWorker(async) {} ~InterpretedVirtualMachine(); void setDataset(dataset_t ds) override; - void initializeScratchpad(uint8_t* scratchpad, int32_t index) override; - void initializeProgram(const void* seed) override; + void initialize() override; void execute() override; - const Program& getProgam() { - return p; - } private: static InstructionHandler engine[256]; - static const ITransform* addressTransformations[TransformationCount]; + DatasetReadFunc readDataset; bool softAes, asyncWorker; - Program p; InstructionByteCode byteCode[ProgramLength]; - std::vector stack; - uint64_t pc, ic; - const ITransform* currentTransform; + #ifdef STATS int count_ADD_64 = 0; int count_ADD_32 = 0; @@ -121,66 +122,9 @@ namespace RandomX { int count_FMUL_nop2 = 0; int datasetAccess[256] = { 0 }; #endif - void executeInstruction(Instruction&); - convertible_t loada(Instruction&); - convertible_t loadbiashift(Instruction&); - convertible_t loadbiadiv(Instruction&); - convertible_t loadbia(Instruction&); - convertible_t& getcr(Instruction&); - void writecf(Instruction&, fpu_reg_t&); - - void stackPush(convertible_t& c) { - stack.push_back(c); - } - - void stackPush(uint64_t x) { - convertible_t c; - c.u64 = x; - stack.push_back(c); - } - - convertible_t stackPopValue() { - convertible_t top = stack.back(); - stack.pop_back(); - return top; - } - - uint64_t stackPopAddress() { - convertible_t top = stack.back(); - stack.pop_back(); - return top.u64; - } - - void h_ADD_64(Instruction&); - void h_ADD_32(Instruction&); - void h_SUB_64(Instruction&); - void h_SUB_32(Instruction&); - void h_MUL_64(Instruction&); - void h_MULH_64(Instruction&); - void h_MUL_32(Instruction&); - void h_IMUL_32(Instruction&); - void h_IMULH_64(Instruction&); - void h_DIV_64(Instruction&); - void h_IDIV_64(Instruction&); - void h_AND_64(Instruction&); - void h_AND_32(Instruction&); - void h_OR_64(Instruction&); - void h_OR_32(Instruction&); - void h_XOR_64(Instruction&); - void h_XOR_32(Instruction&); - void h_SHL_64(Instruction&); - void h_SHR_64(Instruction&); - void h_SAR_64(Instruction&); - void h_ROL_64(Instruction&); - void h_ROR_64(Instruction&); - void h_FADD(Instruction&); - void h_FSUB(Instruction&); - void h_FMUL(Instruction&); - void h_FDIV(Instruction&); - void h_FSQRT(Instruction&); - void h_FPROUND(Instruction&); - void h_JUMP(Instruction&); - void h_CALL(Instruction&); - void h_RET(Instruction&); + void precompileProgram(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + template + void executeBytecode(int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); + void executeBytecode(int i, int_reg_t(&r)[8], __m128d (&f)[4], __m128d (&e)[4], __m128d (&a)[4]); }; } \ No newline at end of file diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index de803be..b77da17 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -19,7 +19,7 @@ along with RandomX. If not, see. #define MAGIC_DIVISION #include "JitCompilerX86.hpp" -#include "Pcg32.hpp" +#include "Program.hpp" #include #include #ifdef MAGIC_DIVISION @@ -43,7 +43,7 @@ namespace RandomX { //throw std::runtime_error("JIT compiler only supports x86-64 CPUs"); } - void JitCompilerX86::generateProgram(Pcg32& gen) { + void JitCompilerX86::generateProgram(Program& p) { } @@ -87,7 +87,7 @@ namespace RandomX { ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask 0x7fffffffffffffff7fffffffffffffff - ; xmm15 -> unused + ; xmm15 -> sign mask 0x80000000000000008000000000000000 */ @@ -199,35 +199,32 @@ namespace RandomX { memcpy(code + CodeSize - epilogueSize, codeEpilogue, epilogueSize); } - void JitCompilerX86::generateProgram(Pcg32& gen) { - auto addressRegisters = gen(); - int readReg1 = addressRegisters & 1; + void JitCompilerX86::generateProgram(Program& prog) { + auto addressRegisters = prog.getEntropy(12); + uint32_t readReg0 = 0 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg2 = 2 + (addressRegisters & 1); + uint32_t readReg1 = 2 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg3 = 4 + (addressRegisters & 1); + uint32_t readReg2 = 4 + (addressRegisters & 1); addressRegisters >>= 1; - int readReg4 = 6 + (addressRegisters & 1); + uint32_t readReg3 = 6 + (addressRegisters & 1); codePos = prologueSize; emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg1); + emitByte(0xc0 + readReg0); emit(REX_XOR_RAX_R64); - emitByte(0xc0 + readReg2); + emitByte(0xc0 + readReg1); memcpy(code + codePos, codeLoopLoad, loopLoadSize); codePos += loopLoadSize; - Instruction instr; for (unsigned i = 0; i < ProgramLength; ++i) { - for (unsigned j = 0; j < sizeof(instr) / sizeof(Pcg32::result_type); ++j) { - *(((uint32_t*)&instr) + j) = gen(); - } + Instruction& instr = prog(i); instr.src %= RegistersCount; instr.dst %= RegistersCount; generateCode(instr); } emit(REX_MOV_RR); - emitByte(0xc0 + readReg3); + emitByte(0xc0 + readReg2); emit(REX_XOR_EAX); - emitByte(0xc0 + readReg4); + emitByte(0xc0 + readReg3); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; memcpy(code + codePos, codeLoopStore, loopStoreSize); @@ -365,22 +362,12 @@ namespace RandomX { } void JitCompilerX86::h_IMULH_R(Instruction& instr) { - if (instr.src != instr.dst) { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - emit(REX_MUL_R); - emitByte(0xe0 + instr.src); - emit(REX_MOV_R64R); - emitByte(0xc2 + 8 * instr.dst); - } - else { - emitByte(MOV_EAX_I); - emit32(instr.imm32); - emit(REX_MUL_R); - emitByte(0xe0 + instr.dst); - emit(REX_ADD_RM); - emitByte(0xc2 + 8 * instr.dst); - } + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); } void JitCompilerX86::h_IMULH_M(Instruction& instr) { @@ -402,22 +389,12 @@ namespace RandomX { } void JitCompilerX86::h_ISMULH_R(Instruction& instr) { - if (instr.src != instr.dst) { - emit(REX_MOV_RR64); - emitByte(0xc0 + instr.dst); - emit(REX_MUL_R); - emitByte(0xe8 + instr.src); - emit(REX_MOV_R64R); - emitByte(0xc2 + 8 * instr.dst); - } - else { - emitByte(MOV_EAX_I); - emit32(instr.imm32); - emit(REX_MUL_R); - emitByte(0xe8 + instr.dst); - emit(REX_ADD_RM); - emitByte(0xc2 + 8 * instr.dst); - } + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); } void JitCompilerX86::h_ISMULH_M(Instruction& instr) { @@ -648,7 +625,7 @@ namespace RandomX { emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_CFSUM_R(Instruction& instr) { + void JitCompilerX86::h_FNEG_R(Instruction& instr) { instr.dst %= 4; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); @@ -802,7 +779,7 @@ namespace RandomX { INST_HANDLE(FADD_M) INST_HANDLE(FSUB_R) INST_HANDLE(FSUB_M) - INST_HANDLE(CFSUM_R) + INST_HANDLE(FNEG_R) INST_HANDLE(FMUL_R) INST_HANDLE(FMUL_M) INST_HANDLE(FDIV_R) diff --git a/src/JitCompilerX86.hpp b/src/JitCompilerX86.hpp index feba888..e790cfe 100644 --- a/src/JitCompilerX86.hpp +++ b/src/JitCompilerX86.hpp @@ -24,10 +24,9 @@ along with RandomX. If not, see. #include #include -class Pcg32; - namespace RandomX { + class Program; class JitCompilerX86; typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&); @@ -37,7 +36,7 @@ namespace RandomX { class JitCompilerX86 { public: JitCompilerX86(); - void generateProgram(Pcg32&); + void generateProgram(Program&); ProgramFunc getProgramFunc() { return (ProgramFunc)code; } @@ -115,7 +114,7 @@ namespace RandomX { void h_FADD_M(Instruction&); void h_FSUB_R(Instruction&); void h_FSUB_M(Instruction&); - void h_CFSUM_R(Instruction&); + void h_FNEG_R(Instruction&); void h_FMUL_R(Instruction&); void h_FMUL_M(Instruction&); void h_FDIV_R(Instruction&); diff --git a/src/LightClientAsyncWorker.cpp b/src/LightClientAsyncWorker.cpp index 32aa508..f79d03d 100644 --- a/src/LightClientAsyncWorker.cpp +++ b/src/LightClientAsyncWorker.cpp @@ -35,7 +35,7 @@ namespace RandomX { template void LightClientAsyncWorker::prepareBlock(addr_t addr) { #ifdef TRACE - std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr << std::endl; + std::cout << sw.getElapsed() << ": prepareBlock-enter " << addr / CacheLineSize << std::endl; #endif { std::lock_guard lk(mutex); @@ -47,18 +47,24 @@ namespace RandomX { #ifdef TRACE std::cout << sw.getElapsed() << ": prepareBlock-notify " << startBlock << "/" << blockCount << std::endl; #endif - notifier.notify_all(); + notifier.notify_one(); } template const uint64_t* LightClientAsyncWorker::getBlock(addr_t addr) { +#ifdef TRACE + std::cout << sw.getElapsed() << ": getBlock-enter " << addr / CacheLineSize << std::endl; +#endif uint32_t currentBlock = addr / CacheLineSize; if (currentBlock != startBlock || output != currentLine.data()) { - initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)currentLine.data(), currentBlock, cache->getKeys()); } else { sync(); } +#ifdef TRACE + std::cout << sw.getElapsed() << ": getBlock-return " << addr / CacheLineSize << std::endl; +#endif return currentLine.data(); } @@ -73,14 +79,14 @@ namespace RandomX { this->blockCount = blockCount; output = out; hasWork = true; + notifier.notify_one(); } - notifier.notify_all(); } template void LightClientAsyncWorker::getBlocks(void* out, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = 0; i < blockCount; ++i) { - initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)out + CacheLineSize * i, startBlock + i, cache->getKeys()); } } @@ -98,10 +104,17 @@ namespace RandomX { for (;;) { std::unique_lock lk(mutex); notifier.wait(lk, [this] { return hasWork; }); - getBlocks(output, startBlock, blockCount); +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-getBlocks " << startBlock << "/" << blockCount << std::endl; +#endif + //getBlocks(output, startBlock, blockCount); + initBlock(cache->getCache(), (uint8_t*)output, startBlock, cache->getKeys()); hasWork = false; +#ifdef TRACE + std::cout << sw.getElapsed() << ": runWorker-finished " << startBlock << "/" << blockCount << std::endl; +#endif lk.unlock(); - notifier.notify_all(); + notifier.notify_one(); } } diff --git a/src/Pcg32.hpp b/src/Pcg32.hpp deleted file mode 100644 index 906800f..0000000 --- a/src/Pcg32.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -Copyright (c) 2018 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. -*/ - -// Based on: -// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org -// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website) - -#pragma once -#include - -#if defined(_MSC_VER) -#pragma warning (disable : 4146) -#endif - -class Pcg32 { -public: - typedef uint32_t result_type; - static constexpr result_type min() { return 0U; } - static constexpr result_type max() { return UINT32_MAX; } - Pcg32(const void* seed) { - auto* u64seed = (const uint64_t*)seed; - state = *(u64seed + 0); - inc = *(u64seed + 1) | 1ull; - } - Pcg32(uint64_t state, uint64_t inc) : state(state), inc(inc | 1ull) { - } - result_type operator()() { - return next(); - } - result_type getUniform(result_type min, result_type max) { - const result_type range = max - min; - const result_type erange = range + 1; - result_type ret; - - for (;;) { - ret = next(); - if (ret / erange < UINT32_MAX / erange || UINT32_MAX % erange == range) { - ret %= erange; - break; - } - } - return ret + min; - } -private: - uint64_t state; - uint64_t inc; - result_type next() { - uint64_t oldstate = state; - // Advance internal state - state = oldstate * 6364136223846793005ULL + inc; - // Calculate output function (XSH RR), uses old state for max ILP - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << (-rot & 31)); - } -}; diff --git a/src/Program.cpp b/src/Program.cpp index b78a5ee..bb4e086 100644 --- a/src/Program.cpp +++ b/src/Program.cpp @@ -18,15 +18,9 @@ along with RandomX. If not, see. */ #include "Program.hpp" -#include "Pcg32.hpp" +#include "hashAes1Rx4.hpp" namespace RandomX { - void Program::initialize(Pcg32& gen) { - for (unsigned i = 0; i < sizeof(programBuffer) / sizeof(Pcg32::result_type); ++i) { - *(((uint32_t*)&programBuffer) + i) = gen(); - } - } - void Program::print(std::ostream& os) const { for (int i = 0; i < RandomX::ProgramLength; ++i) { auto instr = programBuffer[i]; diff --git a/src/Program.hpp b/src/Program.hpp index 35b45d2..1f695a0 100644 --- a/src/Program.hpp +++ b/src/Program.hpp @@ -24,22 +24,25 @@ along with RandomX. If not, see. #include "common.hpp" #include "Instruction.hpp" -class Pcg32; - namespace RandomX { class Program { public: - Instruction& operator()(uint64_t pc) { + Instruction& operator()(int pc) { return programBuffer[pc]; } - void initialize(Pcg32& gen); friend std::ostream& operator<<(std::ostream& os, const Program& p) { p.print(os); return os; } + uint64_t getEntropy(int i) { + return entropyBuffer[i]; + } private: void print(std::ostream&) const; + uint64_t entropyBuffer[16]; Instruction programBuffer[ProgramLength]; }; + + static_assert(sizeof(Program) % 64 == 0, "Invalid size of class Program"); } diff --git a/src/VirtualMachine.cpp b/src/VirtualMachine.cpp index 01de3d9..2adf4e4 100644 --- a/src/VirtualMachine.cpp +++ b/src/VirtualMachine.cpp @@ -23,32 +23,72 @@ along with RandomX. If not, see. #include "blake2/blake2.h" #include #include +#include "intrinPortable.h" std::ostream& operator<<(std::ostream& os, const RandomX::RegisterFile& rf) { for (int i = 0; i < RandomX::RegistersCount; ++i) - os << std::hex << "r" << i << " = " << rf.r[i].u64 << std::endl << std::dec; + os << std::hex << "r" << i << " = " << rf.r[i] << std::endl << std::dec; for (int i = 0; i < RandomX::RegistersCount; ++i) - os << std::hex << "f" << i << " = " << rf.f[i].hi.u64 << " (" << rf.f[i].hi.f64 << ")" << std::endl - << " = " << rf.f[i].lo.u64 << " (" << rf.f[i].lo.f64 << ")" << std::endl << std::dec; + os << std::hex << "f" << i << " = " << *(uint64_t*)&rf.f[i].hi << " (" << rf.f[i].hi << ")" << std::endl + << " = " << *(uint64_t*)&rf.f[i].lo << " (" << rf.f[i].lo << ")" << std::endl << std::dec; return os; } namespace RandomX { + constexpr int mantissaSize = 52; + constexpr int exponentSize = 11; + constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; + constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; + constexpr int exponentBias = 1023; + + static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) { + auto exponent = entropy >> 59; //0..31 + auto mantissa = entropy & mantissaMask; + exponent += exponentBias; + exponent &= exponentMask; + exponent <<= mantissaSize; + return exponent | mantissa; + } + VirtualMachine::VirtualMachine() { mem.ds.dataset = nullptr; } - void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* out) { - constexpr size_t smallStateLength = sizeof(RegisterFile) / sizeof(uint64_t) + 8; - alignas(16) uint64_t smallState[smallStateLength]; - memcpy(smallState, ®, sizeof(RegisterFile)); - if (scratchpadSize > 0) { - hashAes1Rx4(scratchpad, scratchpadSize, smallState + 24); - } - else { - memset(smallState + 24, 0, 64); - } - blake2b(out, ResultSize, smallState, sizeof(smallState), nullptr, 0); + void VirtualMachine::resetRoundingMode() { + initFpu(); } + + void VirtualMachine::initialize() { + store64(®.a[0].lo, getSmallPositiveFloatBits(program.getEntropy(0))); + store64(®.a[0].hi, getSmallPositiveFloatBits(program.getEntropy(1))); + store64(®.a[1].lo, getSmallPositiveFloatBits(program.getEntropy(2))); + store64(®.a[1].hi, getSmallPositiveFloatBits(program.getEntropy(3))); + store64(®.a[2].lo, getSmallPositiveFloatBits(program.getEntropy(4))); + store64(®.a[2].hi, getSmallPositiveFloatBits(program.getEntropy(5))); + store64(®.a[3].lo, getSmallPositiveFloatBits(program.getEntropy(6))); + store64(®.a[3].hi, getSmallPositiveFloatBits(program.getEntropy(7))); + mem.ma = program.getEntropy(8) & CacheLineAlignMask; + mem.mx = program.getEntropy(10); + auto addressRegisters = program.getEntropy(12); + readReg0 = 0 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg1 = 2 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg2 = 4 + (addressRegisters & 1); + addressRegisters >>= 1; + readReg3 = 6 + (addressRegisters & 1); + } + + template + void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash) { + if (scratchpadSize > 0) { + hashAes1Rx4(scratchpad, scratchpadSize, ®.a); + } + blake2b(outHash, ResultSize, ®, sizeof(RegisterFile), nullptr, 0); + } + + template void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash); + template void VirtualMachine::getResult(void* scratchpad, size_t scratchpadSize, void* outHash); + } \ No newline at end of file diff --git a/src/VirtualMachine.hpp b/src/VirtualMachine.hpp index fe48e13..d1dbe26 100644 --- a/src/VirtualMachine.hpp +++ b/src/VirtualMachine.hpp @@ -20,28 +20,36 @@ along with RandomX. If not, see. #pragma once #include #include "common.hpp" +#include "Program.hpp" namespace RandomX { + + class VirtualMachine { public: VirtualMachine(); virtual ~VirtualMachine() {} virtual void setDataset(dataset_t ds) = 0; - virtual void initializeScratchpad(uint8_t* scratchpad, int32_t index) = 0; void setScratchpad(void* ptr) { - scratchpad = (convertible_t*)ptr; + scratchpad = (uint8_t*)ptr; } - virtual void initializeProgram(const void* seed) = 0; + void resetRoundingMode(); + virtual void initialize(); virtual void execute() = 0; - void getResult(void*, size_t, void*); + template + void getResult(void* scratchpad, size_t scratchpadSize, void* outHash); const RegisterFile& getRegisterFile() { return reg; } + Program* getProgramBuffer() { + return &program; + } protected: - DatasetReadFunc readDataset; + alignas(16) Program program; alignas(16) RegisterFile reg; MemoryRegisters mem; - convertible_t* scratchpad; + uint8_t* scratchpad; + uint32_t readReg0, readReg1, readReg2, readReg3; }; } \ No newline at end of file diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index bd2bbdd..a0acebc 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -12,10 +12,6 @@ mulpd xmm1, xmm5 mulpd xmm2, xmm6 mulpd xmm3, xmm7 - ;# xorpd xmm0, xmm15 - ;# xorpd xmm1, xmm15 - ;# xorpd xmm2, xmm15 - ;# xorpd xmm3, xmm15 movapd xmmword ptr [rcx+0], xmm0 movapd xmmword ptr [rcx+16], xmm1 movapd xmmword ptr [rcx+32], xmm2 diff --git a/src/asm/program_prologue_load.inc b/src/asm/program_prologue_load.inc index 74c2a08..757cf10 100644 --- a/src/asm/program_prologue_load.inc +++ b/src/asm/program_prologue_load.inc @@ -18,5 +18,4 @@ movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - ;# xorpd xmm15, xmm15 - + movapd xmm15, xmmword ptr [signMask] diff --git a/src/blake2/blake2-impl.h b/src/blake2/blake2-impl.h index 60b26fe..f294ba6 100644 --- a/src/blake2/blake2-impl.h +++ b/src/blake2/blake2-impl.h @@ -27,105 +27,10 @@ along with RandomX. If not, see. #define PORTABLE_BLAKE2_IMPL_H #include -#include -#if defined(_MSC_VER) -#define BLAKE2_INLINE __inline -#elif defined(__GNUC__) || defined(__clang__) -#define BLAKE2_INLINE __inline__ -#else -#define BLAKE2_INLINE -#endif +#include "endian.h" - /* Argon2 Team - Begin Code */ - /* - Not an exhaustive list, but should cover the majority of modern platforms - Additionally, the code will always be correct---this is only a performance - tweak. - */ -#if (defined(__BYTE_ORDER__) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ - defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ - defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ - defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ - defined(_M_ARM) -#define NATIVE_LITTLE_ENDIAN -#endif - /* Argon2 Team - End Code */ - -static BLAKE2_INLINE uint32_t load32(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint32_t w = *p++; - w |= (uint32_t)(*p++) << 8; - w |= (uint32_t)(*p++) << 16; - w |= (uint32_t)(*p++) << 24; - return w; -#endif -} - -static BLAKE2_INLINE uint64_t load64(const void *src) { -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - w |= (uint64_t)(*p++) << 48; - w |= (uint64_t)(*p++) << 56; - return w; -#endif -} - -static BLAKE2_INLINE void store32(void *dst, uint32_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE void store64(void *dst, uint64_t w) { -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = (uint8_t *)dst; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; - w >>= 8; - *p++ = (uint8_t)w; -#endif -} - -static BLAKE2_INLINE uint64_t load48(const void *src) { +static FORCE_INLINE uint64_t load48(const void *src) { const uint8_t *p = (const uint8_t *)src; uint64_t w = *p++; w |= (uint64_t)(*p++) << 8; @@ -136,7 +41,7 @@ static BLAKE2_INLINE uint64_t load48(const void *src) { return w; } -static BLAKE2_INLINE void store48(void *dst, uint64_t w) { +static FORCE_INLINE void store48(void *dst, uint64_t w) { uint8_t *p = (uint8_t *)dst; *p++ = (uint8_t)w; w >>= 8; @@ -151,11 +56,11 @@ static BLAKE2_INLINE void store48(void *dst, uint64_t w) { *p++ = (uint8_t)w; } -static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { +static FORCE_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { return (w >> c) | (w << (32 - c)); } -static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { +static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { return (w >> c) | (w << (64 - c)); } diff --git a/src/blake2/blake2b.c b/src/blake2/blake2b.c index e7569b4..329ed3c 100644 --- a/src/blake2/blake2b.c +++ b/src/blake2/blake2b.c @@ -51,29 +51,29 @@ static const unsigned int blake2b_sigma[12][16] = { {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, }; -static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { +static FORCE_INLINE void blake2b_set_lastnode(blake2b_state *S) { S->f[1] = (uint64_t)-1; } -static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { +static FORCE_INLINE void blake2b_set_lastblock(blake2b_state *S) { if (S->last_node) { blake2b_set_lastnode(S); } S->f[0] = (uint64_t)-1; } -static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, +static FORCE_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) { S->t[0] += inc; S->t[1] += (S->t[0] < inc); } -static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { +static FORCE_INLINE void blake2b_invalidate_state(blake2b_state *S) { //clear_internal_memory(S, sizeof(*S)); /* wipe */ blake2b_set_lastblock(S); /* invalidate for further use */ } -static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { +static FORCE_INLINE void blake2b_init0(blake2b_state *S) { memset(S, 0, sizeof(*S)); memcpy(S->h, blake2b_IV, sizeof(S->h)); } diff --git a/src/blake2/blamka-round-ref.h b/src/blake2/blamka-round-ref.h index d7acd68..d087b72 100644 --- a/src/blake2/blamka-round-ref.h +++ b/src/blake2/blamka-round-ref.h @@ -30,7 +30,7 @@ along with RandomX. If not, see. #include "blake2-impl.h" /* designed by the Lyra PHC team */ -static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { +static FORCE_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { const uint64_t m = UINT64_C(0xFFFFFFFF); const uint64_t xy = (x & m) * (y & m); return x + y + 2 * xy; diff --git a/src/blake2/endian.h b/src/blake2/endian.h new file mode 100644 index 0000000..fab1eed --- /dev/null +++ b/src/blake2/endian.h @@ -0,0 +1,99 @@ +#pragma once +#include +#include + +#if defined(_MSC_VER) +#define FORCE_INLINE __inline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif + + /* Argon2 Team - Begin Code */ + /* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. + */ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif + /* Argon2 Team - End Code */ + +static FORCE_INLINE uint32_t load32(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static FORCE_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static FORCE_INLINE void store32(void *dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static FORCE_INLINE void store64(void *dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} diff --git a/src/common.hpp b/src/common.hpp index ea67ff9..8c16825 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -21,13 +21,14 @@ along with RandomX. If not, see. #include #include +#include "blake2/endian.h" namespace RandomX { using addr_t = uint32_t; constexpr int SeedSize = 32; - constexpr int ResultSize = 32; + constexpr int ResultSize = 64; constexpr int ArgonIterations = 3; constexpr uint32_t ArgonMemorySize = 262144; //KiB @@ -36,12 +37,13 @@ namespace RandomX { constexpr int ArgonSaltSize = sizeof(ArgonSalt) - 1; constexpr int CacheLineSize = 64; + constexpr uint32_t CacheLineAlignMask = 0xFFFFFFFF & ~(CacheLineSize - 1); constexpr uint64_t DatasetSize = 4ULL * 1024 * 1024 * 1024; //4 GiB constexpr uint32_t CacheSize = ArgonMemorySize * 1024; constexpr int CacheBlockCount = CacheSize / CacheLineSize; constexpr int BlockExpansionRatio = DatasetSize / CacheSize; constexpr int DatasetBlockCount = BlockExpansionRatio * CacheBlockCount; - constexpr int DatasetIterations = 10; + constexpr int DatasetIterations = 16; #ifdef TRACE @@ -50,35 +52,36 @@ namespace RandomX { constexpr bool trace = false; #endif - union convertible_t { - double f64; - int64_t i64; - uint64_t u64; - int32_t i32; - uint32_t u32; - struct { - int32_t i32lo; - int32_t i32hi; - }; - }; +#ifndef UNREACHABLE +#ifdef __GNUC__ +#define UNREACHABLE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE __assume(false) +#else +#define UNREACHABLE +#endif +#endif + + using int_reg_t = uint64_t; struct fpu_reg_t { - convertible_t lo; - convertible_t hi; + double lo; + double hi; }; constexpr int ProgramLength = 256; constexpr uint32_t InstructionCount = 2048; constexpr uint32_t ScratchpadSize = 2 * 1024 * 1024; - constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(convertible_t); - constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(convertible_t); - constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(convertible_t); + constexpr uint32_t ScratchpadLength = ScratchpadSize / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL1 = ScratchpadSize / 128 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL2 = ScratchpadSize / 8 / sizeof(int_reg_t); + constexpr uint32_t ScratchpadL3 = ScratchpadSize / sizeof(int_reg_t); constexpr int ScratchpadL1Mask = (ScratchpadL1 - 1) * 8; constexpr int ScratchpadL2Mask = (ScratchpadL2 - 1) * 8; constexpr int ScratchpadL1Mask16 = (ScratchpadL1 / 2 - 1) * 16; constexpr int ScratchpadL2Mask16 = (ScratchpadL2 / 2 - 1) * 16; constexpr int ScratchpadL3Mask = (ScratchpadLength - 1) * 8; + constexpr int ScratchpadL3Mask64 = (ScratchpadLength / 8 - 1) * 64; constexpr uint32_t TransformationCount = 90; constexpr int RegistersCount = 8; @@ -117,22 +120,20 @@ namespace RandomX { static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct RandomX::MemoryRegisters"); struct RegisterFile { - convertible_t r[RegistersCount]; + int_reg_t r[RegistersCount]; fpu_reg_t f[RegistersCount / 2]; - fpu_reg_t g[RegistersCount / 2]; + fpu_reg_t e[RegistersCount / 2]; fpu_reg_t a[RegistersCount / 2]; }; static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct RandomX::RegisterFile"); - typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, RegisterFile&); + typedef void(*DatasetReadFunc)(addr_t, MemoryRegisters&, int_reg_t(®)[RegistersCount]); - typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); - - typedef bool(*Condition)(convertible_t&, convertible_t&); + typedef void(*ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); extern "C" { - void executeProgram(RegisterFile&, MemoryRegisters&, convertible_t*, uint64_t); + void executeProgram(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); } } diff --git a/src/dataset.cpp b/src/dataset.cpp index b941a75..5b618f9 100644 --- a/src/dataset.cpp +++ b/src/dataset.cpp @@ -24,11 +24,11 @@ along with RandomX. If not, see. #include "common.hpp" #include "dataset.hpp" -#include "Pcg32.hpp" #include "Cache.hpp" #include "virtualMemory.hpp" #include "softAes.h" #include "squareHash.h" +#include "blake2/endian.h" #if defined(__SSE2__) #include @@ -39,56 +39,38 @@ along with RandomX. If not, see. namespace RandomX { - template - static inline void shuffle(T* buffer, size_t bytes, Pcg32& gen) { - auto count = bytes / sizeof(T); - for (auto i = count - 1; i >= 1; --i) { - int j = gen.getUniform(0, i); - std::swap(buffer[j], buffer[i]); - } - } - - template - void initBlock(const uint8_t* intermediate, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { + void initBlock(const uint8_t* cache, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys) { uint64_t r0, r1, r2, r3, r4, r5, r6, r7; r0 = 4ULL * blockNumber; r1 = r2 = r3 = r4 = r5 = r6 = r7 = 0; - constexpr int mask = (CacheSize - 1) & -64; + constexpr uint32_t mask = (CacheSize - 1) & CacheLineAlignMask; for (auto i = 0; i < DatasetIterations; ++i) { - uint64_t* mix = (uint64_t*)(intermediate + (r0 & mask)); - PREFETCHNTA(mix); + const uint8_t* mixBlock = cache + (r0 & mask); + PREFETCHNTA(mixBlock); r0 = squareHash(r0); - r0 ^= mix[0]; - r1 ^= mix[1]; - r2 ^= mix[2]; - r3 ^= mix[3]; - r4 ^= mix[4]; - r5 ^= mix[5]; - r6 ^= mix[6]; - r7 ^= mix[7]; + r0 ^= load64(mixBlock + 0); + r1 ^= load64(mixBlock + 8); + r2 ^= load64(mixBlock + 16); + r3 ^= load64(mixBlock + 24); + r4 ^= load64(mixBlock + 32); + r5 ^= load64(mixBlock + 40); + r6 ^= load64(mixBlock + 48); + r7 ^= load64(mixBlock + 56); } - uint64_t* out64 = (uint64_t*)out; - - out64[0] = r0; - out64[1] = r1; - out64[2] = r2; - out64[3] = r3; - out64[4] = r4; - out64[5] = r5; - out64[6] = r6; - out64[7] = r7; + store64(out + 0, r0); + store64(out + 8, r1); + store64(out + 16, r2); + store64(out + 24, r3); + store64(out + 32, r4); + store64(out + 40, r5); + store64(out + 48, r6); + store64(out + 56, r7); } - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - - template - void initBlock(const uint8_t*, uint8_t*, uint32_t, const KeysContainer&); - void datasetRead(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { uint64_t* datasetLine = (uint64_t*)(memory.ds.dataset + memory.ma); memory.mx ^= addr; @@ -96,34 +78,27 @@ namespace RandomX { std::swap(memory.mx, memory.ma); PREFETCHNTA(memory.ds.dataset + memory.ma); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; + reg.r[i] ^= datasetLine[i]; } - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t (®)[RegistersCount]) { + memory.mx ^= addr; + memory.mx &= CacheLineAlignMask; //align to cache line Cache* cache = memory.ds.cache; uint64_t datasetLine[CacheLineSize / sizeof(uint64_t)]; - initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); + initBlock(cache->getCache(), (uint8_t*)datasetLine, memory.ma / CacheLineSize, cache->getKeys()); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; - memory.mx ^= addr; - memory.mx &= -64; //align to cache line + reg[i] ^= datasetLine[i]; std::swap(memory.mx, memory.ma); } - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); - - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); - - void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg) { + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]) { ILightClientAsyncWorker* aw = memory.ds.asyncWorker; const uint64_t* datasetLine = aw->getBlock(memory.ma); for (int i = 0; i < RegistersCount; ++i) - reg.r[i].u64 ^= datasetLine[i]; + reg[i] ^= datasetLine[i]; memory.mx ^= addr; - memory.mx &= -64; //align to cache line + memory.mx &= CacheLineAlignMask; //align to cache line std::swap(memory.mx, memory.ma); aw->prepareBlock(memory.ma); } @@ -145,7 +120,7 @@ namespace RandomX { template void datasetInit(Cache* cache, dataset_t ds, uint32_t startBlock, uint32_t blockCount) { for (uint32_t i = startBlock; i < startBlock + blockCount; ++i) { - initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); + initBlock(cache->getCache(), ds.dataset + i * CacheLineSize, i, cache->getKeys()); } } @@ -172,7 +147,7 @@ namespace RandomX { alignas(16) KeysContainer keys; alignas(16) uint8_t buffer[CacheLineSize]; for (uint32_t block = 0; block < blockCount; ++block) { - initBlock(buffer, buffer, 0, keys); + initBlock(buffer, buffer, 0, keys); } } diff --git a/src/dataset.hpp b/src/dataset.hpp index 312b924..77a477d 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -31,7 +31,6 @@ namespace RandomX { template void initBlock(const uint8_t* in, uint8_t* out, uint32_t blockNumber, const KeysContainer& keys); - template void initBlock(const uint8_t* cache, uint8_t* block, uint32_t blockNumber, const KeysContainer& keys); void datasetAlloc(dataset_t& ds, bool largePages); @@ -44,10 +43,9 @@ namespace RandomX { template void datasetInitCache(const void* seed, dataset_t& dataset, bool largePages); - template - void datasetReadLight(addr_t addr, MemoryRegisters& memory, RegisterFile&); + void datasetReadLight(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]); - void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, RegisterFile& reg); + void datasetReadLightAsync(addr_t addr, MemoryRegisters& memory, int_reg_t(®)[RegistersCount]); template void aesBench(uint32_t blockCount); diff --git a/src/executeProgram-win64.asm b/src/executeProgram-win64.asm index ff43578..ac49e50 100644 --- a/src/executeProgram-win64.asm +++ b/src/executeProgram-win64.asm @@ -54,7 +54,7 @@ executeProgram PROC ; xmm12 -> temporary ; xmm13 -> DBL_MIN ; xmm14 -> absolute value mask - ; xmm15 -> unused + ; xmm15 -> sign mask ; store callee-saved registers push rbx @@ -104,7 +104,7 @@ executeProgram PROC movapd xmm11, xmmword ptr [rcx+120] movapd xmm13, xmmword ptr [minDbl] movapd xmm14, xmmword ptr [absMask] - ;# xorps xmm15, xmm15 + movapd xmm15, xmmword ptr [signMask] jmp program_begin diff --git a/src/instructionWeights.hpp b/src/instructionWeights.hpp index 3998d07..c336b29 100644 --- a/src/instructionWeights.hpp +++ b/src/instructionWeights.hpp @@ -37,8 +37,8 @@ along with RandomX. If not, see. #define WT_INEG_R 2 #define WT_IXOR_R 16 #define WT_IXOR_M 4 -#define WT_IROR_R 8 -#define WT_IROL_R 8 +#define WT_IROR_R 10 +#define WT_IROL_R 0 #define WT_ISWAP_R 4 //Common floating point @@ -49,6 +49,7 @@ along with RandomX. If not, see. #define WT_FADD_M 5 #define WT_FSUB_R 20 #define WT_FSUB_M 5 +#define WT_FNEG_R 6 //Floating point group E #define WT_FMUL_R 20 @@ -61,7 +62,6 @@ along with RandomX. If not, see. #define WT_COND_R 7 #define WT_COND_M 1 #define WT_CFROUND 1 -#define WT_CFSUM_R 0 //Store #define WT_ISTORE 16 @@ -74,7 +74,7 @@ WT_ISUB_M + WT_IMUL_9C + WT_IMUL_R + WT_IMUL_M + WT_IMULH_R + \ WT_IMULH_M + WT_ISMULH_R + WT_ISMULH_M + WT_IDIV_C + WT_ISDIV_C + \ WT_INEG_R + WT_IXOR_R + WT_IXOR_M + WT_IROR_R + WT_IROL_R + \ WT_ISWAP_R + WT_FSWAP_R + WT_FADD_R + WT_FADD_M + WT_FSUB_R + WT_FSUB_M + \ -WT_CFSUM_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ +WT_FNEG_R + WT_FMUL_R + WT_FMUL_M + WT_FDIV_R + WT_FDIV_M + \ WT_FSQRT_R + WT_COND_R + WT_COND_M + WT_CFROUND + WT_ISTORE + WT_FSTORE + WT_NOP; static_assert(wtSum == 256, diff --git a/src/instructions.hpp b/src/instructions.hpp deleted file mode 100644 index 6d9a98f..0000000 --- a/src/instructions.hpp +++ /dev/null @@ -1,57 +0,0 @@ -/* -Copyright (c) 2018 tevador - -This file is part of RandomX. - -RandomX is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -RandomX is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with RandomX. If not, see. -*/ - -#include -#include "common.hpp" - -namespace RandomX { - - extern "C" { - void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c); - void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c); - void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c); - void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c); - void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c); - void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c); - void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c); - void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c); - void AND_64(convertible_t& a, convertible_t& b, convertible_t& c); - void AND_32(convertible_t& a, convertible_t& b, convertible_t& c); - void OR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void OR_32(convertible_t& a, convertible_t& b, convertible_t& c); - void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c); - void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c); - void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c); - bool JMP_COND(uint8_t, convertible_t&, int32_t); - void FPINIT(); - void FPROUND(convertible_t, uint8_t); - void FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c); - } -} \ No newline at end of file diff --git a/src/instructionsPortable.cpp b/src/instructionsPortable.cpp index ca85ffc..59d19c5 100644 --- a/src/instructionsPortable.cpp +++ b/src/instructionsPortable.cpp @@ -18,12 +18,14 @@ along with RandomX. If not, see. */ //#define DEBUG #include "intrinPortable.h" +#include "blake2/endian.h" #pragma STDC FENV_ACCESS on #include #include #ifdef DEBUG #include #endif +#include "common.hpp" #if defined(__SIZEOF_INT128__) typedef unsigned __int128 uint128_t; @@ -136,18 +138,18 @@ static inline int32_t safeSub(int32_t a, int32_t b) { #if defined(__has_builtin) #if __has_builtin(__builtin_sub_overflow) - static inline bool subOverflow__(int32_t a, int32_t b) { + static inline bool subOverflow__(uint32_t a, uint32_t b) { int32_t temp; - return __builtin_sub_overflow(a, b, &temp); + return __builtin_sub_overflow(unsigned32ToSigned2sCompl(a), unsigned32ToSigned2sCompl(b), &temp); } #define HAVE_SUB_OVERFLOW #endif #endif #ifndef HAVE_SUB_OVERFLOW - static inline bool subOverflow__(int32_t a, int32_t b) { - auto c = safeSub(a, b); - return (c < a) != (b > 0); + static inline bool subOverflow__(uint32_t a, uint32_t b) { + auto c = unsigned32ToSigned2sCompl(a - b); + return (c < unsigned32ToSigned2sCompl(a)) != (unsigned32ToSigned2sCompl(b) > 0); } #define HAVE_SUB_OVERFLOW #endif @@ -166,40 +168,44 @@ static inline double FlushNaN(double x) { void setRoundMode(uint32_t rcflag) { switch (rcflag & 3) { - case RoundDown: - setRoundMode__(FE_DOWNWARD); - break; - case RoundUp: - setRoundMode__(FE_UPWARD); - break; - case RoundToZero: - setRoundMode__(FE_TOWARDZERO); - break; - default: - setRoundMode__(FE_TONEAREST); - break; + case RoundDown: + setRoundMode__(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode__(FE_UPWARD); + break; + case RoundToZero: + setRoundMode__(FE_TOWARDZERO); + break; + case RoundToNearest: + setRoundMode__(FE_TONEAREST); + break; + default: + UNREACHABLE; } } -bool condition(uint32_t type, int32_t value, int32_t imm32) { +bool condition(uint32_t type, uint32_t value, uint32_t imm32) { switch (type & 7) { - case 0: - return (uint32_t)value <= (uint32_t)imm32; - case 1: - return (uint32_t)value > (uint32_t)imm32; - case 2: - return safeSub(value, imm32) < 0; - case 3: - return safeSub(value, imm32) >= 0; - case 4: - return subOverflow__(value, imm32); - case 5: - return !subOverflow__(value, imm32); - case 6: - return value < imm32; - case 7: - return value >= imm32; + case 0: + return value <= imm32; + case 1: + return value > imm32; + case 2: + return unsigned32ToSigned2sCompl(value - imm32) < 0; + case 3: + return unsigned32ToSigned2sCompl(value - imm32) >= 0; + case 4: + return subOverflow__(value, imm32); + case 5: + return !subOverflow__(value, imm32); + case 6: + return unsigned32ToSigned2sCompl(value) < unsigned32ToSigned2sCompl(imm32); + case 7: + return unsigned32ToSigned2sCompl(value) >= unsigned32ToSigned2sCompl(imm32); + default: + UNREACHABLE; } } @@ -211,100 +217,13 @@ void initFpu() { #endif } -namespace RandomX { +union double_ser_t { + double f; + uint64_t i; +}; - extern "C" { - /*void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { - c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U); - } - - void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) { - if (a.i64 == INT64_MIN && b.i32 == -1) - c.i64 = INT64_MIN; - else - c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1); - } - - void FADD(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_add_pd(ad, bd); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = alo + b.lo.f64; - c.hi.f64 = ahi + b.hi.f64; -#endif - } - - void FSUB(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_sub_pd(ad, bd); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = alo - b.lo.f64; - c.hi.f64 = ahi - b.hi.f64; -#endif - } - - void FMUL(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_mul_pd(ad, bd); - __m128d mask = _mm_cmpeq_pd(cd, cd); - cd = _mm_and_pd(cd, mask); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = FlushNaN(alo * b.lo.f64); - c.hi.f64 = FlushNaN(ahi * b.hi.f64); -#endif - } - - void FDIV(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - __m128d bd = _mm_load_pd(&b.lo.f64); - __m128d cd = _mm_div_pd(ad, bd); - __m128d mask = _mm_cmpeq_pd(cd, cd); - cd = _mm_and_pd(cd, mask); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = FlushDenormalNaN(alo / b.lo.f64); - c.hi.f64 = FlushDenormalNaN(ahi / b.hi.f64); -#endif - } - - void FSQRT(convertible_t& a, fpu_reg_t& b, fpu_reg_t& c) { -#ifdef __SSE2__ - __m128i ai = _mm_loadl_epi64((const __m128i*)&a); - __m128d ad = _mm_cvtepi32_pd(ai); - const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63))); - ad = _mm_and_pd(ad, absmask); - __m128d cd = _mm_sqrt_pd(ad); - _mm_store_pd(&c.lo.f64, cd); -#else - double alo = (double)a.i32lo; - double ahi = (double)a.i32hi; - c.lo.f64 = sqrt(std::abs(alo)); - c.hi.f64 = sqrt(std::abs(ahi)); -#endif - }*/ - - - } -} \ No newline at end of file +double loadDoublePortable(const void* addr) { + double_ser_t ds; + ds.i = load64(addr); + return ds.f; +} diff --git a/src/intrinPortable.h b/src/intrinPortable.h index 3d2136c..2c2e487 100644 --- a/src/intrinPortable.h +++ b/src/intrinPortable.h @@ -33,12 +33,21 @@ along with RandomX. If not, see. #else #include #endif + +inline __m128d _mm_abs(__m128d xd) { + const __m128d absmask = _mm_castsi128_pd(_mm_set1_epi64x(~(1LL << 63))); + return _mm_and_pd(xd, absmask); +} + +#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) + #else #include #include #define _mm_malloc(a,b) malloc(a) #define _mm_free(a) free(a) +#define PREFETCHNTA(x) typedef union { uint64_t u64[2]; @@ -152,10 +161,29 @@ constexpr int RoundDown = 1; constexpr int RoundUp = 2; constexpr int RoundToZero = 3; +constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); +} + +constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { + return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); +} + +constexpr uint64_t signExtend2sCompl(uint32_t x) { + return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); +} + +inline __m128d load_cvt_i32x2(const void* addr) { + __m128i ix = _mm_load_si128((const __m128i*)addr); + return _mm_cvtepi32_pd(ix); +} + +double loadDoublePortable(const void* addr); + uint64_t mulh(uint64_t, uint64_t); int64_t smulh(int64_t, int64_t); uint64_t rotl(uint64_t, int); uint64_t rotr(uint64_t, int); void initFpu(); void setRoundMode(uint32_t); -bool condition(uint32_t, int32_t, int32_t); +bool condition(uint32_t, uint32_t, uint32_t); diff --git a/src/main.cpp b/src/main.cpp index 58b381c..b16b13b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,7 +34,6 @@ along with RandomX. If not, see. #include #include "dataset.hpp" #include "Cache.hpp" -#include "Pcg32.hpp" #include "hashAes1Rx4.hpp" const uint8_t seed[32] = { 191, 182, 222, 175, 249, 89, 134, 104, 241, 68, 191, 62, 162, 166, 61, 64, 123, 191, 227, 193, 118, 60, 188, 53, 223, 133, 175, 24, 123, 230, 55, 74 }; @@ -117,7 +116,7 @@ void printUsage(const char* executable) { } void generateAsm(int nonce) { - uint64_t hash[4]; + uint64_t hash[8]; unsigned char blockTemplate[] = { 0x07, 0x07, 0xf7, 0xa4, 0xf0, 0xd6, 0x05, 0xb3, 0x03, 0x26, 0x08, 0x16, 0xba, 0x3f, 0x10, 0x90, 0x2e, 0x1a, 0x14, 0x5a, 0xc5, 0xfa, 0xd3, 0xaa, 0x3a, 0xf6, 0xea, 0x44, 0xc1, 0x18, 0x69, 0xdc, 0x4f, 0x85, 0x3f, 0x00, 0x2b, 0x2e, @@ -128,7 +127,9 @@ void generateAsm(int nonce) { *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); RandomX::AssemblyGeneratorX86 asmX86; - asmX86.generateProgram(hash); + RandomX::Program p; + fillAes1Rx4(hash, sizeof(p), &p); + asmX86.generateProgram(p); asmX86.printCode(std::cout); } @@ -143,9 +144,8 @@ void generateNative(int nonce) { int* noncePtr = (int*)(blockTemplate + 39); *noncePtr = nonce; blake2b(hash, sizeof(hash), blockTemplate, sizeof(blockTemplate), nullptr, 0); - RandomX::Program prog; - Pcg32 gen(hash); - prog.initialize(gen); + alignas(16) RandomX::Program prog; + fillAes1Rx4((void*)hash, sizeof(prog), &prog); for (int i = 0; i < RandomX::ProgramLength; ++i) { prog(i).dst %= 8; prog(i).src %= 8; @@ -173,12 +173,13 @@ void mine(RandomX::VirtualMachine* vm, std::atomic& atomicNonce, AtomicHash vm->setScratchpad(scratchpad); //dump((char*)((RandomX::CompiledVirtualMachine*)vm)->getProgram(), RandomX::CodeSize, "code-1337-jmp.txt"); for (int chain = 0; chain < 8; ++chain) { - vm->initializeProgram(hash); + fillAes1Rx4((void*)hash, sizeof(RandomX::Program), vm->getProgramBuffer()); + vm->initialize(); vm->execute(); - vm->getResult(nullptr, 0, hash); + vm->getResult(nullptr, 0, hash); } //vm->initializeProgram(hash); - vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); + vm->getResult(scratchpad, RandomX::ScratchpadSize, hash); result.xorWith(hash); if (RandomX::trace) { std::cout << "Nonce: " << nonce << " "; diff --git a/src/program.inc b/src/program.inc index ba4b937..5de4504 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,17 +1,14 @@ ; FMUL_R e0, a2 mulpd xmm4, xmm10 - ; IADD_RC r2, r5, -1621224194 + ; IADD_RC r2, r5, 2673743102 lea r10, [r10+r13-1621224194] ; ISTORE L2[r2], r7 mov eax, r10d and eax, 262136 mov qword ptr [rsi+rax], r15 - ; FSUB_M f2, L1[r2] - mov eax, r10d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 - ; IMUL_9C r6, -1003503212 + ; FNEG_R f2 + xorps xmm2, xmm15 + ; IMUL_9C r6, 3291464084 lea r14, [r14+r14*8-1003503212] ; FSUB_R f1, a0 subpd xmm1, xmm8 @@ -19,11 +16,8 @@ mov eax, r11d and eax, 262136 xor r13, qword ptr [rsi+rax] - ; FSUB_M f2, L1[r4] - mov eax, r12d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm2, xmm12 + ; FNEG_R f2 + xorps xmm2, xmm15 ; FSUB_R f3, a0 subpd xmm3, xmm8 ; ISDIV_C r0, 1400272688 @@ -38,35 +32,37 @@ mov eax, r15d and eax, 16376 imul r11, qword ptr [rsi+rax] - ; IROL_R r2, r3 - mov ecx, r11d - rol r10, cl + ; ISWAP_R r2, r3 + xchg r10, r11 ; IMULH_R r6, r0 mov rax, r14 mul r8 mov r14, rdx ; FMUL_R e0, a2 mulpd xmm4, xmm10 - ; IADD_RC r3, r4, -52260428 + ; IADD_RC r3, r4, 4242706868 lea r11, [r11+r12-52260428] - ; IADD_R r7, -1138617760 + ; IADD_R r7, 3156349536 add r15, -1138617760 ; IXOR_M r2, L1[r6] mov eax, r14d and eax, 16376 xor r10, qword ptr [rsi+rax] - ; FSUB_R f2, a1 - subpd xmm2, xmm9 + ; FSUB_M f2, L1[r5] + mov eax, r13d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 ; IXOR_R r7, r1 xor r15, r9 - ; COND_R r2, lt(r7, -41618808) + ; COND_R r2, lt(r7, 4253348488) xor ecx, ecx cmp r15d, -41618808 setl cl add r10, rcx ; FMUL_R e3, a0 mulpd xmm7, xmm8 - ; COND_R r4, sg(r1, -961190365) + ; COND_R r4, sg(r1, 3333776931) xor ecx, ecx cmp r9d, -961190365 sets cl @@ -122,19 +118,21 @@ addpd xmm1, xmm8 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; FADD_R f0, a1 - addpd xmm0, xmm9 + ; FADD_M f0, L2[r5] + mov eax, r13d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; IMUL_R r5, r6 imul r13, r14 - ; IADD_RC r1, r2, -1263285243 + ; IADD_RC r1, r2, 3031682053 lea r9, [r9+r10-1263285243] ; ISUB_M r4, L1[r6] mov eax, r14d and eax, 16376 sub r12, qword ptr [rsi+rax] - ; IROL_R r7, r2 - mov ecx, r10d - rol r15, cl + ; FSWAP_R e3 + shufpd xmm7, xmm7, 1 ; IMUL_R r0, r7 imul r8, r15 ; IXOR_R r1, r6 @@ -156,9 +154,8 @@ andps xmm12, xmm14 divpd xmm6, xmm12 maxpd xmm6, xmm13 - ; IROL_R r2, r0 - mov ecx, r8d - rol r10, cl + ; ISWAP_R r2, r0 + xchg r10, r8 ; IADD_R r7, r5 add r15, r13 ; FDIV_M e0, L1[r4] @@ -210,8 +207,11 @@ mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r15 - ; FSUB_R f0, a1 - subpd xmm0, xmm9 + ; FSUB_M f0, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 ; FADD_R f3, a1 addpd xmm3, xmm9 ; IXOR_R r5, r4 @@ -220,8 +220,8 @@ mov eax, r15d and eax, 262136 mov qword ptr [rsi+rax], r10 - ; ISWAP_R r6, r7 - xchg r14, r15 + ; FSWAP_R e2 + shufpd xmm6, xmm6, 1 ; FADD_R f3, a2 addpd xmm3, xmm10 ; ISMULH_R r5, r0 @@ -232,7 +232,7 @@ mov eax, r12d and eax, 16376 add r8, qword ptr [rsi+rax] - ; COND_R r7, ge(r6, -1972898485) + ; COND_R r7, ge(r6, 2322068811) xor ecx, ecx cmp r14d, -1972898485 setge cl @@ -242,9 +242,9 @@ ; IROR_R r7, r6 mov ecx, r14d ror r15, cl - ; IADD_RC r2, r4, -117457973 + ; IADD_RC r2, r4, 4177509323 lea r10, [r10+r12-117457973] - ; IMUL_R r0, -1500893068 + ; IMUL_R r0, 2794074228 imul r8, -1500893068 ; IADD_R r2, r3 add r10, r11 @@ -265,19 +265,19 @@ lea r14, [r14+r14+540663146] ; IROR_R r5, 58 ror r13, 58 - ; FSWAP_R f2 - shufpd xmm2, xmm2, 1 - ; FSWAP_R f2 - shufpd xmm2, xmm2, 1 + ; FADD_R f2, a1 + addpd xmm2, xmm9 + ; FADD_R f2, a2 + addpd xmm2, xmm10 ; FMUL_R e1, a2 mulpd xmm5, xmm10 - ; ISWAP_R r5, r6 - xchg r13, r14 + ; FSWAP_R e1 + shufpd xmm5, xmm5, 1 ; IADD_R r5, r3 add r13, r11 - ; IADD_R r7, -1780268176 + ; IADD_R r7, 2514699120 add r15, -1780268176 - ; IADD_RC r7, r0, -1497756854 + ; IADD_RC r7, r0, 2797210442 lea r15, [r15+r8-1497756854] ; ISTORE L2[r0], r7 mov eax, r8d @@ -287,8 +287,11 @@ mov rax, r10 imul r12 mov r10, rdx - ; FSUB_R f0, a2 - subpd xmm0, xmm10 + ; FSUB_M f0, L1[r2] + mov eax, r10d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm0, xmm12 ; ISMULH_R r2, r3 mov rax, r10 imul r11 @@ -301,8 +304,11 @@ addpd xmm2, xmm8 ; FMUL_R e0, a2 mulpd xmm4, xmm10 - ; FADD_R f2, a3 - addpd xmm2, xmm11 + ; FADD_M f2, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm2, xmm12 ; IMUL_R r1, r2 imul r9, r10 ; IMUL_M r7, L1[r5] @@ -313,11 +319,8 @@ imul r11, r10 ; IXOR_R r1, r0 xor r9, r8 - ; FSUB_M f0, L1[r5] - mov eax, r13d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; FNEG_R f0 + xorps xmm0, xmm15 ; IADD_RC r4, r4, 1456841848 lea r12, [r12+r12+1456841848] ; IXOR_R r3, r2 @@ -327,19 +330,16 @@ cmp r12d, 1678513610 seto cl add r8, rcx - ; ISMULH_R r4, -1620573087 - mov rax, -1620573087 + ; ISMULH_R r4, r4 + mov rax, r12 imul r12 - add r12, rdx + mov r12, rdx ; IMUL_R r4, r1 imul r12, r9 - ; FSWAP_R e1 - shufpd xmm5, xmm5, 1 - ; FADD_M f2, L1[r0] - mov eax, r8d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm2, xmm12 + ; FADD_R f1, a2 + addpd xmm1, xmm10 + ; FSUB_R f2, a0 + subpd xmm2, xmm8 ; FMUL_R e1, a2 mulpd xmm5, xmm10 ; FSUB_R f0, a3 @@ -362,29 +362,35 @@ sub r12, qword ptr [rsi+rax] ; FADD_R f2, a2 addpd xmm2, xmm10 - ; FSUB_R f3, a0 - subpd xmm3, xmm8 + ; FSUB_M f3, L2[r4] + mov eax, r12d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 ; IXOR_R r7, r2 xor r15, r10 ; IXOR_R r0, r5 xor r8, r13 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 + ; FADD_R f1, a2 + addpd xmm1, xmm10 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; ISWAP_R r7, r1 - xchg r15, r9 - ; ISWAP_R r1, r4 - xchg r9, r12 - ; COND_R r2, ge(r2, -226330940) + ; FSWAP_R e3 + shufpd xmm7, xmm7, 1 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; COND_R r2, ge(r2, 4068636356) xor ecx, ecx cmp r10d, -226330940 setge cl add r10, rcx ; FMUL_R e2, a3 mulpd xmm6, xmm11 - ; FSUB_R f2, a1 - subpd xmm2, xmm9 + ; FSUB_M f2, L2[r1] + mov eax, r9d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 ; FADD_R f1, a0 addpd xmm1, xmm8 ; ISUB_R r7, r5 @@ -395,12 +401,11 @@ sub r8, qword ptr [rsi+rax] ; FSUB_R f3, a1 subpd xmm3, xmm9 - ; IROL_R r3, r5 - mov ecx, r13d - rol r11, cl + ; ISWAP_R r3, r5 + xchg r11, r13 ; IADD_RC r5, r2, 795784298 lea r13, [r13+r10+795784298] - ; IADD_RC r0, r4, -2050178553 + ; IADD_RC r0, r4, 2244788743 lea r8, [r8+r12-2050178553] ; IMUL_9C r5, 1062534001 lea r13, [r13+r13*8+1062534001] @@ -436,16 +441,15 @@ mov rax, r12 imul r10 mov r12, rdx - ; IROL_R r3, r0 - mov ecx, r8d - rol r11, cl + ; ISWAP_R r3, r0 + xchg r11, r8 ; IXOR_R r2, r0 xor r10, r8 ; IXOR_M r0, L2[r1] mov eax, r9d and eax, 262136 xor r8, qword ptr [rsi+rax] - ; ISDIV_C r7, -935446980 + ; ISDIV_C r7, 3359520316 mov rax, 7859804860668271393 imul r15 xor eax, eax @@ -458,11 +462,8 @@ mov eax, r10d and eax, 16376 imul r14, qword ptr [rsi+rax] - ; FSUB_M f3, L1[r6] - mov eax, r14d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm3, xmm12 + ; FNEG_R f3 + xorps xmm3, xmm15 ; IADD_RC r4, r2, 1704868083 lea r12, [r12+r10+1704868083] ; FADD_R f2, a0 @@ -471,8 +472,11 @@ mov eax, r8d and eax, 16376 mov qword ptr [rsi+rax], r8 - ; FADD_R f0, a3 - addpd xmm0, xmm11 + ; FADD_M f0, L1[r7] + mov eax, r15d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; FMUL_R e0, a3 mulpd xmm4, xmm11 ; FSUB_R f3, a2 @@ -481,8 +485,8 @@ lea r15, [r15+r15+1302457878] ; ISUB_R r1, 1330165941 sub r9, 1330165941 - ; FSUB_R f1, a3 - subpd xmm1, xmm11 + ; FNEG_R f1 + xorps xmm1, xmm15 ; IROR_R r0, r4 mov ecx, r12d ror r8, cl @@ -491,7 +495,7 @@ ; IROR_R r5, r6 mov ecx, r14d ror r13, cl - ; COND_R r0, ab(r1, -310933871) + ; COND_R r0, ab(r1, 3984033425) xor ecx, ecx cmp r9d, -310933871 seta cl @@ -516,22 +520,22 @@ andps xmm12, xmm14 divpd xmm5, xmm12 maxpd xmm5, xmm13 - ; IROL_R r1, 5 - rol r9, 5 - ; IADD_R r7, -1421188024 + ; FSWAP_R f1 + shufpd xmm1, xmm1, 1 + ; IADD_R r7, 2873779272 add r15, -1421188024 - ; FSUB_R f3, a2 - subpd xmm3, xmm10 + ; FSUB_M f3, L2[r2] + mov eax, r10d + and eax, 262136 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm3, xmm12 ; FSUB_R f2, a3 subpd xmm2, xmm11 - ; FADD_M f3, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm3, xmm12 + ; FSUB_R f3, a1 + subpd xmm3, xmm9 ; FMUL_R e1, a3 mulpd xmm5, xmm11 - ; IADD_RC r2, r4, -317832028 + ; IADD_RC r2, r4, 3977135268 lea r10, [r10+r12-317832028] ; IMUL_M r4, L1[r5] mov eax, r13d @@ -575,12 +579,11 @@ sub r12, r9 ; ISUB_R r3, r0 sub r11, r8 - ; IROL_R r7, r5 - mov ecx, r13d - rol r15, cl + ; ISWAP_R r7, r5 + xchg r15, r13 ; IMUL_R r2, r6 imul r10, r14 - ; COND_R r2, ge(r2, -1892157506) + ; COND_R r2, ge(r2, 2402809790) xor ecx, ecx cmp r10d, -1892157506 setge cl @@ -596,7 +599,7 @@ add r9, rdx ; FADD_R f0, a1 addpd xmm0, xmm9 - ; IADD_RC r5, r7, -477591118 + ; IADD_RC r5, r7, 3817376178 lea r13, [r13+r15-477591118] ; FSUB_R f0, a3 subpd xmm0, xmm11 @@ -610,9 +613,12 @@ add r8, r12 ; FSUB_R f3, a1 subpd xmm3, xmm9 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 - ; ISDIV_C r2, -396711688 + ; FSUB_M f2, L1[r4] + mov eax, r12d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + subpd xmm2, xmm12 + ; ISDIV_C r2, 3898255608 mov rax, 5964731804029407733 imul r10 xor eax, eax @@ -621,16 +627,19 @@ sets al add rdx, rax add r10, rdx - ; FSUB_R f2, a2 - subpd xmm2, xmm10 + ; FNEG_R f2 + xorps xmm2, xmm15 ; FSUB_R f3, a2 subpd xmm3, xmm10 ; FADD_R f1, a3 addpd xmm1, xmm11 ; IMUL_R r3, r2 imul r11, r10 - ; FADD_R f0, a3 - addpd xmm0, xmm11 + ; FADD_M f0, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm0, xmm12 ; ISMULH_R r5, r2 mov rax, r13 imul r10 @@ -639,28 +648,30 @@ mov rax, r14 mul r10 mov r14, rdx - ; FADD_R f3, a3 - addpd xmm3, xmm11 + ; FADD_M f3, L1[r3] + mov eax, r11d + and eax, 16376 + cvtdq2pd xmm12, qword ptr [rsi+rax] + addpd xmm3, xmm12 ; IMUL_R r6, r7 imul r14, r15 ; FSUB_R f0, a0 subpd xmm0, xmm8 - ; FSUB_R f2, a0 - subpd xmm2, xmm8 + ; FNEG_R f2 + xorps xmm2, xmm15 ; ISUB_R r6, r4 sub r14, r12 - ; FSWAP_R f1 - shufpd xmm1, xmm1, 1 + ; FADD_R f1, a1 + addpd xmm1, xmm9 ; IXOR_R r0, r5 xor r8, r13 ; FADD_R f2, a1 addpd xmm2, xmm9 - ; IROL_R r7, r5 - mov ecx, r13d - rol r15, cl + ; ISWAP_R r7, r5 + xchg r15, r13 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; IADD_RC r3, r6, -1317630728 + ; IADD_RC r3, r6, 2977336568 lea r11, [r11+r14-1317630728] ; IMUL_R r2, r3 imul r10, r11 @@ -668,11 +679,8 @@ lea r9, [r9+r12+894105694] ; IMUL_9C r7, 504293473 lea r15, [r15+r15*8+504293473] - ; FADD_M f1, L2[r0] - mov eax, r8d - and eax, 262136 - cvtdq2pd xmm12, qword ptr [rsi+rax] - addpd xmm1, xmm12 + ; FSUB_R f1, a0 + subpd xmm1, xmm8 ; IMUL_R r7, r1 imul r15, r9 ; IXOR_R r2, r4 @@ -713,19 +721,16 @@ mov eax, r9d and eax, 16376 mov qword ptr [rsi+rax], r13 - ; FSUB_M f0, L1[r1] - mov eax, r9d - and eax, 16376 - cvtdq2pd xmm12, qword ptr [rsi+rax] - subpd xmm0, xmm12 + ; FNEG_R f0 + xorps xmm0, xmm15 ; FSQRT_R e2 sqrtpd xmm6, xmm6 ; FMUL_R e0, a3 mulpd xmm4, xmm11 ; FMUL_R e3, a2 mulpd xmm7, xmm10 - ; IROL_R r5, r2 + ; IROR_R r5, r2 mov ecx, r10d - rol r13, cl + ror r13, cl ; IADD_R r0, r4 add r8, r12 diff --git a/src/squareHash.h b/src/squareHash.h index f80b492..05939d7 100644 --- a/src/squareHash.h +++ b/src/squareHash.h @@ -17,6 +17,11 @@ You should have received a copy of the GNU General Public License along with RandomX. If not, see. */ +/* + Based on the original idea by SChernykh: + https://github.com/SChernykh/xmr-stak-cpu/issues/1#issuecomment-414336613 +*/ + #include #if !defined(_M_X64) && !defined(__x86_64__) diff --git a/src/t1ha/t1ha.h b/src/t1ha/t1ha.h deleted file mode 100644 index 6b56e16..0000000 --- a/src/t1ha/t1ha.h +++ /dev/null @@ -1,723 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. - * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#pragma once - -/***************************************************************************** - * - * PLEASE PAY ATTENTION TO THE FOLLOWING NOTES - * about macros definitions which controls t1ha behaviour and/or performance. - * - * - * 1) T1HA_SYS_UNALIGNED_ACCESS = Defines the system/platform/CPU/architecture - * abilities for unaligned data access. - * - * By default, when the T1HA_SYS_UNALIGNED_ACCESS not defined, - * it will defined on the basis hardcoded knowledge about of capabilities - * of most common CPU architectures. But you could override this - * default behavior when build t1ha library itself: - * - * // To disable unaligned access at all. - * #define T1HA_SYS_UNALIGNED_ACCESS 0 - * - * // To enable unaligned access, but indicate that it significally slow. - * #define T1HA_SYS_UNALIGNED_ACCESS 1 - * - * // To enable unaligned access, and indicate that it effecient. - * #define T1HA_SYS_UNALIGNED_ACCESS 2 - * - * - * 2) T1HA_USE_FAST_ONESHOT_READ = Controls the data reads at the end of buffer. - * - * When defined to non-zero, t1ha will use 'one shot' method for reading - * up to 8 bytes at the end of data. In this case just the one 64-bit read - * will be performed even when the available less than 8 bytes. - * - * This is little bit faster that switching by length of data tail. - * Unfortunately this will triggering a false-positive alarms from Valgrind, - * AddressSanitizer and other similar tool. - * - * By default, t1ha defines it to 1, but you could override this - * default behavior when build t1ha library itself: - * - * // For little bit faster and small code. - * #define T1HA_USE_FAST_ONESHOT_READ 1 - * - * // For calmness if doubt. - * #define T1HA_USE_FAST_ONESHOT_READ 0 - * - * - * 3) T1HA0_RUNTIME_SELECT = Controls choice fastest function in runtime. - * - * t1ha library offers the t1ha0() function as the fastest for current CPU. - * But actual CPU's features/capabilities and may be significantly different, - * especially on x86 platform. Therefore, internally, t1ha0() may require - * dynamic dispatching for choice best implementation. - * - * By default, t1ha enables such runtime choice and (may be) corresponding - * indirect calls if it reasonable, but you could override this default - * behavior when build t1ha library itself: - * - * // To enable runtime choice of fastest implementation. - * #define T1HA0_RUNTIME_SELECT 1 - * - * // To disable runtime choice of fastest implementation. - * #define T1HA0_RUNTIME_SELECT 0 - * - * When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could - * be used to get actual t1ha0() implementation address at runtime. This is - * useful for two cases: - * - calling by local pointer-to-function usually is little - * bit faster (less overhead) than via a PLT thru the DSO boundary. - * - GNU Indirect functions (see below) don't supported by environment - * and calling by t1ha0_funcptr is not available and/or expensive. - * - * 4) T1HA_USE_INDIRECT_FUNCTIONS = Controls usage of GNU Indirect functions. - * - * In continue of T1HA0_RUNTIME_SELECT the T1HA_USE_INDIRECT_FUNCTIONS - * controls usage of ELF indirect functions feature. In general, when - * available, this reduces overhead of indirect function's calls though - * a DSO-bundary (https://sourceware.org/glibc/wiki/GNU_IFUNC). - * - * By default, t1ha engage GNU Indirect functions when it available - * and useful, but you could override this default behavior when build - * t1ha library itself: - * - * // To enable use of GNU ELF Indirect functions. - * #define T1HA_USE_INDIRECT_FUNCTIONS 1 - * - * // To disable use of GNU ELF Indirect functions. This may be useful - * // if the actual toolchain or the system's loader don't support ones. - * #define T1HA_USE_INDIRECT_FUNCTIONS 0 - * - * 5) T1HA0_AESNI_AVAILABLE = Controls AES-NI detection and dispatching on x86. - * - * In continue of T1HA0_RUNTIME_SELECT the T1HA0_AESNI_AVAILABLE controls - * detection and usage of AES-NI CPU's feature. On the other hand, this - * requires compiling parts of t1ha library with certain properly options, - * and could be difficult or inconvenient in some cases. - * - * By default, t1ha engade AES-NI for t1ha0() on the x86 platform, but - * you could override this default behavior when build t1ha library itself: - * - * // To disable detection and usage of AES-NI instructions for t1ha0(). - * // This may be useful when you unable to build t1ha library properly - * // or known that AES-NI will be unavailable at the deploy. - * #define T1HA0_AESNI_AVAILABLE 0 - * - * // To force detection and usage of AES-NI instructions for t1ha0(), - * // but I don't known reasons to anybody would need this. - * #define T1HA0_AESNI_AVAILABLE 1 - * - * 6) T1HA0_DISABLED, T1HA1_DISABLED, T1HA2_DISABLED = Controls availability of - * t1ha functions. - * - * In some cases could be useful to import/use only few of t1ha functions - * or just the one. So, this definitions allows disable corresponding parts - * of t1ha library. - * - * // To disable t1ha0(), t1ha0_32le(), t1ha0_32be() and all AES-NI. - * #define T1HA0_DISABLED - * - * // To disable t1ha1_le() and t1ha1_be(). - * #define T1HA1_DISABLED - * - * // To disable t1ha2_atonce(), t1ha2_atonce128() and so on. - * #define T1HA2_DISABLED - * - *****************************************************************************/ - -#define T1HA_VERSION_MAJOR 2 -#define T1HA_VERSION_MINOR 1 -#define T1HA_VERSION_RELEASE 0 - -#ifndef __has_attribute -#define __has_attribute(x) (0) -#endif - -#ifndef __has_include -#define __has_include(x) (0) -#endif - -#ifndef __GNUC_PREREQ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) -#define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __GNUC_PREREQ(maj, min) 0 -#endif -#endif /* __GNUC_PREREQ */ - -#ifndef __CLANG_PREREQ -#ifdef __clang__ -#define __CLANG_PREREQ(maj, min) \ - ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) -#else -#define __CLANG_PREREQ(maj, min) (0) -#endif -#endif /* __CLANG_PREREQ */ - -#ifndef __LCC_PREREQ -#ifdef __LCC__ -#define __LCC_PREREQ(maj, min) \ - ((__LCC__ << 16) + __LCC_MINOR__ >= ((maj) << 16) + (min)) -#else -#define __LCC_PREREQ(maj, min) (0) -#endif -#endif /* __LCC_PREREQ */ - -/*****************************************************************************/ - -#ifdef _MSC_VER -/* Avoid '16' bytes padding added after data member 't1ha_context::total' - * and other warnings from std-headers if warning-level > 3. */ -#pragma warning(push, 3) -#endif - -#if defined(__cplusplus) && __cplusplus >= 201103L -#include -#include -#include -#else -#include -#include -#include -#endif - -/*****************************************************************************/ - -#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \ - defined(i486) || defined(__i486) || defined(__i486__) || \ - defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) || \ - defined(__i686) || defined(__i686__) || defined(_M_IX86) || \ - defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) || \ - defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64__) || defined(__amd64) || defined(_M_X64) || \ - defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__) -#ifndef __ia32__ -/* LY: define neutral __ia32__ for x86 and x86-64 archs */ -#define __ia32__ 1 -#endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) -/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ -#define __amd64__ 1 -#endif /* __amd64__ */ -#endif /* all x86 */ - -#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ - !defined(__ORDER_BIG_ENDIAN__) - -/* *INDENT-OFF* */ -/* clang-format off */ - -#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) || \ - defined(HAVE_ENDIAN_H) || __has_include() -#include -#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) || \ - defined(HAVE_MACHINE_ENDIAN_H) || __has_include() -#include -#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include() -#include -#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) || \ - (__has_include() && __has_include()) -#include -#include -#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) || \ - defined(__NETBSD__) || defined(__NetBSD__) || \ - defined(HAVE_SYS_PARAM_H) || __has_include() -#include -#endif /* OS */ - -/* *INDENT-ON* */ -/* clang-format on */ - -#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) -#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN -#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN -#define __BYTE_ORDER__ __BYTE_ORDER -#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN -#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN -#define __BYTE_ORDER__ _BYTE_ORDER -#else -#define __ORDER_LITTLE_ENDIAN__ 1234 -#define __ORDER_BIG_ENDIAN__ 4321 - -#if defined(__LITTLE_ENDIAN__) || \ - (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) || \ - defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ - defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ - defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) || \ - defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) || \ - defined(__BFIN__) || defined(__ia64__) || defined(_IA64) || \ - defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || \ - defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) || \ - defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) || \ - defined(__WINDOWS__) -#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ - -#elif defined(__BIG_ENDIAN__) || \ - (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) || \ - defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) || \ - defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) || \ - defined(__m68k__) || defined(M68000) || defined(__hppa__) || \ - defined(__hppa) || defined(__HPPA__) || defined(__sparc__) || \ - defined(__sparc) || defined(__370__) || defined(__THW_370__) || \ - defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__) -#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ - -#else -#error __BYTE_ORDER__ should be defined. -#endif /* Arch */ - -#endif -#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ - -/*****************************************************************************/ - -#ifndef __dll_export -#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllexport) -#define __dll_export __attribute__((dllexport)) -#elif defined(_MSC_VER) -#define __dll_export __declspec(dllexport) -#else -#define __dll_export -#endif -#elif defined(__GNUC__) || __has_attribute(visibility) -#define __dll_export __attribute__((visibility("default"))) -#else -#define __dll_export -#endif -#endif /* __dll_export */ - -#ifndef __dll_import -#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllimport) -#define __dll_import __attribute__((dllimport)) -#elif defined(_MSC_VER) -#define __dll_import __declspec(dllimport) -#else -#define __dll_import -#endif -#else -#define __dll_import -#endif -#endif /* __dll_import */ - -#ifndef __force_inline -#ifdef _MSC_VER -#define __force_inline __forceinline -#elif __GNUC_PREREQ(3, 2) || __has_attribute(always_inline) -#define __force_inline __inline __attribute__((always_inline)) -#else -#define __force_inline __inline -#endif -#endif /* __force_inline */ - -#ifndef T1HA_API -#if defined(t1ha_EXPORTS) -#define T1HA_API __dll_export -#elif defined(t1ha_IMPORTS) -#define T1HA_API __dll_import -#else -#define T1HA_API -#endif -#endif /* T1HA_API */ - -#if defined(_MSC_VER) && defined(__ia32__) -#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */ -#else -#define T1HA_ALIGN_PREFIX -#endif /* _MSC_VER */ - -#if defined(__GNUC__) && defined(__ia32__) -#define T1HA_ALIGN_SUFFIX \ - __attribute__((aligned(32))) /* required only for SIMD */ -#else -#define T1HA_ALIGN_SUFFIX -#endif /* GCC x86 */ - -#ifndef T1HA_USE_INDIRECT_FUNCTIONS -/* GNU ELF indirect functions usage control. For more info please see - * https://en.wikipedia.org/wiki/Executable_and_Linkable_Format - * and https://sourceware.org/glibc/wiki/GNU_IFUNC */ -#if __has_attribute(ifunc) && \ - defined(__ELF__) /* ifunc is broken on Darwin/OSX */ -/* Use ifunc/gnu_indirect_function if corresponding attribute is available, - * Assuming compiler will generate properly code even when - * the -fstack-protector-all and/or the -fsanitize=address are enabled. */ -#define T1HA_USE_INDIRECT_FUNCTIONS 1 -#elif defined(__ELF__) && !defined(__SANITIZE_ADDRESS__) && \ - !defined(__SSP_ALL__) -/* ifunc/gnu_indirect_function will be used on ELF, but only if both - * -fstack-protector-all and -fsanitize=address are NOT enabled. */ -#define T1HA_USE_INDIRECT_FUNCTIONS 1 -#else -#define T1HA_USE_INDIRECT_FUNCTIONS 0 -#endif -#endif /* T1HA_USE_INDIRECT_FUNCTIONS */ - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility push(hidden) -#endif /* __GNUC_PREREQ(4,0) */ - -#ifdef __cplusplus -extern "C" { -#endif - -typedef union T1HA_ALIGN_PREFIX t1ha_state256 { - uint8_t bytes[32]; - uint32_t u32[8]; - uint64_t u64[4]; - struct { - uint64_t a, b, c, d; - } n; -} t1ha_state256_t T1HA_ALIGN_SUFFIX; - -typedef struct t1ha_context { - t1ha_state256_t state; - t1ha_state256_t buffer; - size_t partial; - uint64_t total; -} t1ha_context_t; - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/****************************************************************************** - * - * Self-testing API. - * - * Unfortunately, some compilers (exactly only Microsoft Visual C/C++) has - * a bugs which leads t1ha-functions to produce wrong results. This API allows - * check the correctness of the actual code in runtime. - * - * All check-functions returns 0 on success, or -1 in case the corresponding - * hash-function failed verification. PLEASE, always perform such checking at - * initialization of your code, if you using MSVC or other troubleful compilers. - */ - -T1HA_API int t1ha_selfcheck__all_enabled(void); - -#ifndef T1HA2_DISABLED -T1HA_API int t1ha_selfcheck__t1ha2_atonce(void); -T1HA_API int t1ha_selfcheck__t1ha2_atonce128(void); -T1HA_API int t1ha_selfcheck__t1ha2_stream(void); -T1HA_API int t1ha_selfcheck__t1ha2(void); -#endif /* T1HA2_DISABLED */ - -#ifndef T1HA1_DISABLED -T1HA_API int t1ha_selfcheck__t1ha1_le(void); -T1HA_API int t1ha_selfcheck__t1ha1_be(void); -T1HA_API int t1ha_selfcheck__t1ha1(void); -#endif /* T1HA1_DISABLED */ - -#ifndef T1HA0_DISABLED -T1HA_API int t1ha_selfcheck__t1ha0_32le(void); -T1HA_API int t1ha_selfcheck__t1ha0_32be(void); -T1HA_API int t1ha_selfcheck__t1ha0(void); - -/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */ -#ifndef T1HA0_AESNI_AVAILABLE -#if defined(__e2k__) || \ - (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800)) -#define T1HA0_AESNI_AVAILABLE 1 -#else -#define T1HA0_AESNI_AVAILABLE 0 -#endif -#endif /* ifndef T1HA0_AESNI_AVAILABLE */ - -#if T1HA0_AESNI_AVAILABLE -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_noavx(void); -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx(void); -#ifndef __e2k__ -T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx2(void); -#endif -#endif /* if T1HA0_AESNI_AVAILABLE */ -#endif /* T1HA0_DISABLED */ - -/****************************************************************************** - * - * t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH. - * - * - The recommended version of "Fast Positive Hash" with good quality - * for checksum, hash tables and fingerprinting. - * - Portable and extremely efficiency on modern 64-bit CPUs. - * Designed for 64-bit little-endian platforms, - * in other cases will runs slowly. - * - Great quality of hashing and still faster than other non-t1ha hashes. - * Provides streaming mode and 128-bit result. - * - * Note: Due performance reason 64- and 128-bit results are completely - * different each other, i.e. 64-bit result is NOT any part of 128-bit. - */ -#ifndef T1HA2_DISABLED - -/* The at-once variant with 64-bit result */ -T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed); - -/* The at-once variant with 128-bit result. - * Argument `extra_result` is NOT optional and MUST be valid. - * The high 64-bit part of 128-bit hash will be always unconditionally - * stored to the address given by `extra_result` argument. */ -T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, - const void *__restrict data, size_t length, - uint64_t seed); - -/* The init/update/final trinity for streaming. - * Return 64 or 128-bit result depentently from `extra_result` argument. */ -T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y); -T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx, - const void *__restrict data, size_t length); - -/* Argument `extra_result` is optional and MAY be NULL. - * - If `extra_result` is NOT NULL then the 128-bit hash will be calculated, - * and high 64-bit part of it will be stored to the address given - * by `extra_result` argument. - * - Otherwise the 64-bit hash will be calculated - * and returned from function directly. - * - * Note: Due performance reason 64- and 128-bit results are completely - * different each other, i.e. 64-bit result is NOT any part of 128-bit. */ -T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, - uint64_t *__restrict extra_result /* optional */); - -#endif /* T1HA2_DISABLED */ - -/****************************************************************************** - * - * t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH: - * - * - Runs faster on 64-bit platforms in other cases may runs slowly. - * - Portable and stable, returns same 64-bit result - * on all architectures and CPUs. - * - Unfortunately it fails the "strict avalanche criteria", - * see test results at https://github.com/demerphq/smhasher. - * - * This flaw is insignificant for the t1ha1() purposes and imperceptible - * from a practical point of view. - * However, nowadays this issue has resolved in the next t1ha2(), - * that was initially planned to providing a bit more quality. - */ -#ifndef T1HA1_DISABLED - -/* The little-endian variant. */ -T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed); - -/* The big-endian variant. */ -T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed); - -#endif /* T1HA1_DISABLED */ - -/****************************************************************************** - * - * t1ha0 = 64-bit, JUST ONLY FASTER: - * - * - Provides fast-as-possible hashing for current CPU, including - * 32-bit systems and engaging the available hardware acceleration. - * - It is a facade that selects most quick-and-dirty hash - * for the current processor. For instance, on IA32 (x86) actual function - * will be selected in runtime, depending on current CPU capabilities - * - * BE CAREFUL!!! THIS IS MEANS: - * - * 1. The quality of hash is a subject for tradeoffs with performance. - * So, the quality and strength of t1ha0() may be lower than t1ha1(), - * especially on 32-bit targets, but then much faster. - * However, guaranteed that it passes all SMHasher tests. - * - * 2. No warranty that the hash result will be same for particular - * key on another machine or another version of libt1ha. - * - * Briefly, such hash-results and their derivatives, should be - * used only in runtime, but should not be persist or transferred - * over a network. - * - * - * When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could - * be used to get actual t1ha0() implementation address at runtime. This is - * useful for two cases: - * - calling by local pointer-to-function usually is little - * bit faster (less overhead) than via a PLT thru the DSO boundary. - * - GNU Indirect functions (see below) don't supported by environment - * and calling by t1ha0_funcptr is not available and/or expensive. - */ - -#ifndef T1HA0_DISABLED - -/* The little-endian variant for 32-bit CPU. */ -uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed); -/* The big-endian variant for 32-bit CPU. */ -uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed); - -/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */ -#ifndef T1HA0_AESNI_AVAILABLE -#if defined(__e2k__) || \ - (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800)) -#define T1HA0_AESNI_AVAILABLE 1 -#else -#define T1HA0_AESNI_AVAILABLE 0 -#endif -#endif /* T1HA0_AESNI_AVAILABLE */ - -/* Define T1HA0_RUNTIME_SELECT to 0 for disable dispatching t1ha0 at runtime. */ -#ifndef T1HA0_RUNTIME_SELECT -#if T1HA0_AESNI_AVAILABLE && !defined(__e2k__) -#define T1HA0_RUNTIME_SELECT 1 -#else -#define T1HA0_RUNTIME_SELECT 0 -#endif -#endif /* T1HA0_RUNTIME_SELECT */ - -#if !T1HA0_RUNTIME_SELECT && !defined(T1HA0_USE_DEFINE) -#if defined(__LCC__) -#define T1HA0_USE_DEFINE 1 -#else -#define T1HA0_USE_DEFINE 0 -#endif -#endif /* T1HA0_USE_DEFINE */ - -#if T1HA0_AESNI_AVAILABLE -uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed); -uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed); -#ifndef __e2k__ -uint64_t t1ha0_ia32aes_avx2(const void *data, size_t length, uint64_t seed); -#endif -#endif /* T1HA0_AESNI_AVAILABLE */ - -#if T1HA0_RUNTIME_SELECT -typedef uint64_t (*t1ha0_function_t)(const void *, size_t, uint64_t); -T1HA_API t1ha0_function_t t1ha0_resolve(void); -#if T1HA_USE_INDIRECT_FUNCTIONS -T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed); -#else -/* Otherwise function pointer will be used. - * Unfortunately this may cause some overhead calling. */ -T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length, - uint64_t seed); -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { - return t1ha0_funcptr(data, length, seed); -} -#endif /* T1HA_USE_INDIRECT_FUNCTIONS */ - -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - -#if T1HA0_USE_DEFINE - -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) -#define t1ha0 t1ha2_atonce -#else -#define t1ha0 t1ha1_be -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ -#define t1ha0 t1ha0_32be -#endif /* 32/64 */ - -#else /* T1HA0_USE_DEFINE */ - -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) - return t1ha2_atonce(data, length, seed); -#else - return t1ha1_be(data, length, seed); -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ - return t1ha0_32be(data, length, seed); -#endif /* 32/64 */ -} - -#endif /* !T1HA0_USE_DEFINE */ - -#else /* !T1HA0_RUNTIME_SELECT && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ - -#if T1HA0_USE_DEFINE - -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) -#define t1ha0 t1ha2_atonce -#else -#define t1ha0 t1ha1_le -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ -#define t1ha0 t1ha0_32le -#endif /* 32/64 */ - -#else - -static __force_inline uint64_t t1ha0(const void *data, size_t length, - uint64_t seed) { -#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) && \ - (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED)) -#if defined(T1HA1_DISABLED) - return t1ha2_atonce(data, length, seed); -#else - return t1ha1_le(data, length, seed); -#endif /* T1HA1_DISABLED */ -#else /* 32/64 */ - return t1ha0_32le(data, length, seed); -#endif /* 32/64 */ -} - -#endif /* !T1HA0_USE_DEFINE */ - -#endif /* !T1HA0_RUNTIME_SELECT */ - -#endif /* T1HA0_DISABLED */ - -#ifdef __cplusplus -} -#endif - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility pop -#endif /* __GNUC_PREREQ(4,0) */ diff --git a/src/t1ha/t1ha2.c b/src/t1ha/t1ha2.c deleted file mode 100644 index b05d64c..0000000 --- a/src/t1ha/t1ha2.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. - * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#ifndef T1HA2_DISABLED -#include "t1ha_bits.h" -//#include "t1ha_selfcheck.h" - -static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x, - uint64_t y) { - s->n.a = x; - s->n.b = y; -} - -static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x, - uint64_t y) { - s->n.c = rot64(y, 23) + ~x; - s->n.d = ~y + rot64(x, 19); -} - -/* TODO: C++ template in the next version */ -#define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v) \ - do { \ - t1ha_state256_t *const s = state; \ - const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0); \ - const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1); \ - const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2); \ - const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3); \ - \ - const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56); \ - const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19); \ - s->n.d ^= s->n.b + rot64(w1, 38); \ - s->n.c ^= s->n.a + rot64(w0, 57); \ - s->n.b ^= prime_6 * (c13 + w2); \ - s->n.a ^= prime_5 * (d02 + w3); \ - } while (0) - -static __always_inline void squash(t1ha_state256_t *s) { - s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23)); - s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d); -} - -/* TODO: C++ template in the next version */ -#define T1HA2_LOOP(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - const void *detent = (const uint8_t *)data + len - 31; \ - do { \ - const uint64_t *v = (const uint64_t *)data; \ - data = (const uint64_t *)data + 4; \ - prefetch(data); \ - T1HA2_UPDATE(le, ALIGNESS, state, v); \ - } while (likely(data < detent)); \ - } while (0) - -/* TODO: C++ template in the next version */ -#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - t1ha_state256_t *const s = state; \ - const uint64_t *v = (const uint64_t *)data; \ - switch (len) { \ - default: \ - mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_4); \ - /* fall through */ \ - case 24: \ - case 23: \ - case 22: \ - case 21: \ - case 20: \ - case 19: \ - case 18: \ - case 17: \ - mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_3); \ - /* fall through */ \ - case 16: \ - case 15: \ - case 14: \ - case 13: \ - case 12: \ - case 11: \ - case 10: \ - case 9: \ - mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_2); \ - /* fall through */ \ - case 8: \ - case 7: \ - case 6: \ - case 5: \ - case 4: \ - case 3: \ - case 2: \ - case 1: \ - mixup64(&s->n.b, &s->n.a, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ - prime_1); \ - /* fall through */ \ - case 0: \ - return final64(s->n.a, s->n.b); \ - } \ - } while (0) - -/* TODO: C++ template in the next version */ -#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, state, data, len) \ - do { \ - t1ha_state256_t *const s = state; \ - const uint64_t *v = (const uint64_t *)data; \ - switch (len) { \ - default: \ - mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_4); \ - /* fall through */ \ - case 24: \ - case 23: \ - case 22: \ - case 21: \ - case 20: \ - case 19: \ - case 18: \ - case 17: \ - mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_3); \ - /* fall through */ \ - case 16: \ - case 15: \ - case 14: \ - case 13: \ - case 12: \ - case 11: \ - case 10: \ - case 9: \ - mixup64(&s->n.c, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++), \ - prime_2); \ - /* fall through */ \ - case 8: \ - case 7: \ - case 6: \ - case 5: \ - case 4: \ - case 3: \ - case 2: \ - case 1: \ - mixup64(&s->n.d, &s->n.c, tail64_##ENDIANNES##_##ALIGNESS(v, len), \ - prime_1); \ - /* fall through */ \ - case 0: \ - return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result); \ - } \ - } while (0) - -static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c, - uint64_t d, uint64_t *h) { - mixup64(&a, &b, rot64(c, 41) ^ d, prime_0); - mixup64(&b, &c, rot64(d, 23) ^ a, prime_6); - mixup64(&c, &d, rot64(a, 19) ^ b, prime_5); - mixup64(&d, &a, rot64(b, 31) ^ c, prime_4); - *h = c + d; - return a ^ b; -} - -//------------------------------------------------------------------------------ - -uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) { - t1ha_state256_t state; - init_ab(&state, seed, length); - -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, unaligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, unaligned, &state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, unaligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, unaligned, &state, data, length); - } else { - if (unlikely(length > 32)) { - init_cd(&state, seed, length); - T1HA2_LOOP(le, aligned, &state, data, length); - squash(&state); - length &= 31; - } - T1HA2_TAIL_AB(le, aligned, &state, data, length); - } -#endif -} - -uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result, - const void *__restrict data, size_t length, - uint64_t seed) { - t1ha_state256_t state; - init_ab(&state, seed, length); - init_cd(&state, seed, length); - -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - if (unlikely(length > 32)) { - T1HA2_LOOP(le, unaligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, unaligned, &state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - if (unlikely(length > 32)) { - T1HA2_LOOP(le, unaligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, unaligned, &state, data, length); - } else { - if (unlikely(length > 32)) { - T1HA2_LOOP(le, aligned, &state, data, length); - length &= 31; - } - T1HA2_TAIL_ABCD(le, aligned, &state, data, length); - } -#endif -} - -//------------------------------------------------------------------------------ - -void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) { - init_ab(&ctx->state, seed_x, seed_y); - init_cd(&ctx->state, seed_x, seed_y); - ctx->partial = 0; - ctx->total = 0; -} - -void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data, - size_t length) { - ctx->total += length; - - if (ctx->partial) { - const size_t left = 32 - ctx->partial; - const size_t chunk = (length >= left) ? left : length; - memcpy(ctx->buffer.bytes + ctx->partial, data, chunk); - ctx->partial += chunk; - if (ctx->partial < 32) { - assert(left >= length); - return; - } - ctx->partial = 0; - data = (const uint8_t *)data + chunk; - length -= chunk; - T1HA2_UPDATE(le, aligned, &ctx->state, ctx->buffer.u64); - } - - if (length >= 32) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT - T1HA2_LOOP(le, unaligned, &ctx->state, data, length); -#else - const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0; - if (misaligned) { - T1HA2_LOOP(le, unaligned, &ctx->state, data, length); - } else { - T1HA2_LOOP(le, aligned, &ctx->state, data, length); - } -#endif - length &= 31; - } - - if (length) - memcpy(ctx->buffer.bytes, data, ctx->partial = length); -} - -uint64_t t1ha2_final(t1ha_context_t *__restrict ctx, - uint64_t *__restrict extra_result) { - uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - bits = bswap64(bits); -#endif - t1ha2_update(ctx, &bits, 8); - - if (likely(!extra_result)) { - squash(&ctx->state); - T1HA2_TAIL_AB(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial); - } - - T1HA2_TAIL_ABCD(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial); -} - -#endif /* T1HA2_DISABLED */ diff --git a/src/t1ha/t1ha_bits.h b/src/t1ha/t1ha_bits.h deleted file mode 100644 index 7c47851..0000000 --- a/src/t1ha/t1ha_bits.h +++ /dev/null @@ -1,1226 +0,0 @@ -/* - * Copyright (c) 2016-2018 Positive Technologies, https://www.ptsecurity.com, - * Fast Positive Hash. - * - * Portions Copyright (c) 2010-2018 Leonid Yuriev , - * The 1Hippeus project (t1h). - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgement in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" } - * by [Positive Technologies](https://www.ptsecurity.ru) - * - * Briefly, it is a 64-bit Hash Function: - * 1. Created for 64-bit little-endian platforms, in predominantly for x86_64, - * but portable and without penalties it can run on any 64-bit CPU. - * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash - * and all others portable hash-functions (which do not use specific - * hardware tricks). - * 3. Not suitable for cryptography. - * - * The Future will Positive. Всё будет хорошо. - * - * ACKNOWLEDGEMENT: - * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев) - * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta! - */ - -#pragma once - -#if defined(_MSC_VER) -#pragma warning(disable : 4201) /* nameless struct/union */ -#if _MSC_VER > 1800 -#pragma warning(disable : 4464) /* relative include path contains '..' */ -#endif /* 1800 */ -#endif /* MSVC */ -#include "t1ha.h" - -#ifndef T1HA_USE_FAST_ONESHOT_READ -/* Define it to 1 for little bit faster code. - * Unfortunately this may triggering a false-positive alarms from Valgrind, - * AddressSanitizer and other similar tool. - * So, define it to 0 for calmness if doubt. */ -#define T1HA_USE_FAST_ONESHOT_READ 1 -#endif /* T1HA_USE_FAST_ONESHOT_READ */ - -/*****************************************************************************/ - -#include /* for assert() */ -#include /* for bool */ -#include /* for memcpy() */ - -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ - __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ -#error Unsupported byte order. -#endif - -#define T1HA_UNALIGNED_ACCESS__UNABLE 0 -#define T1HA_UNALIGNED_ACCESS__SLOW 1 -#define T1HA_UNALIGNED_ACCESS__EFFICIENT 2 - -#ifndef T1HA_SYS_UNALIGNED_ACCESS -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#elif defined(__ia32__) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#elif defined(__e2k__) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__SLOW -#elif defined(__ARM_FEATURE_UNALIGNED) -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT -#else -#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE -#endif -#endif /* T1HA_SYS_UNALIGNED_ACCESS */ - -#define ALIGNMENT_16 2 -#define ALIGNMENT_32 4 -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul -#define ALIGNMENT_64 8 -#else -#define ALIGNMENT_64 4 -#endif - -#ifndef PAGESIZE -#define PAGESIZE 4096 -#endif /* PAGESIZE */ - -/***************************************************************************/ - -#ifndef __has_builtin -#define __has_builtin(x) (0) -#endif - -#ifndef __has_warning -#define __has_warning(x) (0) -#endif - -#ifndef __has_feature -#define __has_feature(x) (0) -#endif - -#ifndef __has_extension -#define __has_extension(x) (0) -#endif - -#if __has_feature(address_sanitizer) -#define __SANITIZE_ADDRESS__ 1 -#endif - -#ifndef __optimize -#if defined(__clang__) && !__has_attribute(optimize) -#define __optimize(ops) -#elif defined(__GNUC__) || __has_attribute(optimize) -#define __optimize(ops) __attribute__((optimize(ops))) -#else -#define __optimize(ops) -#endif -#endif /* __optimize */ - -#ifndef __cold -#if defined(__OPTIMIZE__) -#if defined(__e2k__) -#define __cold __optimize(1) __attribute__((cold)) -#elif defined(__clang__) && !__has_attribute(cold) -/* just put infrequently used functions in separate section */ -#define __cold __attribute__((section("text.unlikely"))) __optimize("Os") -#elif defined(__GNUC__) || __has_attribute(cold) -#define __cold __attribute__((cold)) __optimize("Os") -#else -#define __cold __optimize("Os") -#endif -#else -#define __cold -#endif -#endif /* __cold */ - -#if __GNUC_PREREQ(4, 4) || defined(__clang__) - -#if defined(__ia32__) || defined(__e2k__) -#include -#endif - -#if defined(__ia32__) && !defined(__cpuid_count) -#include -#endif - -#if defined(__e2k__) -#include -#endif - -#ifndef likely -#define likely(cond) __builtin_expect(!!(cond), 1) -#endif - -#ifndef unlikely -#define unlikely(cond) __builtin_expect(!!(cond), 0) -#endif - -#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable) -#define unreachable() __builtin_unreachable() -#endif - -#define bswap64(v) __builtin_bswap64(v) -#define bswap32(v) __builtin_bswap32(v) -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) -#define bswap16(v) __builtin_bswap16(v) -#endif - -#if !defined(__maybe_unused) && (__GNUC_PREREQ(4, 3) || __has_attribute(unused)) -#define __maybe_unused __attribute__((unused)) -#endif - -#if !defined(__always_inline) && \ - (__GNUC_PREREQ(3, 2) || __has_attribute(always_inline)) -#define __always_inline __inline __attribute__((always_inline)) -#endif - -#if defined(__e2k__) - -#if __iset__ >= 3 -#define mul_64x64_high(a, b) __builtin_e2k_umulhd(a, b) -#endif /* __iset__ >= 3 */ - -#if __iset__ >= 5 -static __maybe_unused __always_inline unsigned -e2k_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { - *sum = base + addend; - return (unsigned)__builtin_e2k_addcd_c(base, addend, 0); -} -#define add64carry_first(base, addend, sum) \ - e2k_add64carry_first(base, addend, sum) - -static __maybe_unused __always_inline unsigned -e2k_add64carry_next(unsigned carry, uint64_t base, uint64_t addend, - uint64_t *sum) { - *sum = __builtin_e2k_addcd(base, addend, carry); - return (unsigned)__builtin_e2k_addcd_c(base, addend, carry); -} -#define add64carry_next(carry, base, addend, sum) \ - e2k_add64carry_next(carry, base, addend, sum) - -static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry, - uint64_t base, - uint64_t addend, - uint64_t *sum) { - *sum = __builtin_e2k_addcd(base, addend, carry); -} -#define add64carry_last(carry, base, addend, sum) \ - e2k_add64carry_last(carry, base, addend, sum) -#endif /* __iset__ >= 5 */ - -#define fetch64_be_aligned(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr)) -#define fetch32_be_aligned(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr)) - -#endif /* __e2k__ Elbrus */ - -#elif defined(_MSC_VER) - -#if _MSC_FULL_VER < 190024234 && defined(_M_IX86) -#pragma message( \ - "For AES-NI at least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required.") -#endif -#if _MSC_FULL_VER < 191526730 -#pragma message( \ - "It is recommended to use \"Microsoft C/C++ Compiler\" version 19.15.26730 (Visual Studio 2017 15.8) or newer.") -#endif -#if _MSC_FULL_VER < 180040629 -#error At least "Microsoft C/C++ Compiler" version 18.00.40629 (Visual Studio 2013 Update 5) is required. -#endif - -#pragma warning(push, 1) - -#include -#include -#define likely(cond) (cond) -#define unlikely(cond) (cond) -#define unreachable() __assume(0) -#define bswap64(v) _byteswap_uint64(v) -#define bswap32(v) _byteswap_ulong(v) -#define bswap16(v) _byteswap_ushort(v) -#define rot64(v, s) _rotr64(v, s) -#define rot32(v, s) _rotr(v, s) -#define __always_inline __forceinline - -#if defined(_M_X64) || defined(_M_IA64) -#pragma intrinsic(_umul128) -#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) -#pragma intrinsic(_addcarry_u64) -#define add64carry_first(base, addend, sum) _addcarry_u64(0, base, addend, sum) -#define add64carry_next(carry, base, addend, sum) \ - _addcarry_u64(carry, base, addend, sum) -#define add64carry_last(carry, base, addend, sum) \ - (void)_addcarry_u64(carry, base, addend, sum) -#endif - -#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) -#pragma intrinsic(__umulh) -#define mul_64x64_high(a, b) __umulh(a, b) -#endif - -#if defined(_M_IX86) -#pragma intrinsic(__emulu) -#define mul_32x32_64(a, b) __emulu(a, b) - -#if _MSC_VER >= 1915 /* LY: workaround for SSA-optimizer bug */ -#pragma intrinsic(_addcarry_u32) -#define add32carry_first(base, addend, sum) _addcarry_u32(0, base, addend, sum) -#define add32carry_next(carry, base, addend, sum) \ - _addcarry_u32(carry, base, addend, sum) -#define add32carry_last(carry, base, addend, sum) \ - (void)_addcarry_u32(carry, base, addend, sum) - -static __forceinline char -msvc32_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - return add32carry_next(add32carry_first(base_32l, addend_32l, sum32), - base_32h, addend_32h, sum32 + 1); -} -#define add64carry_first(base, addend, sum) \ - msvc32_add64carry_first(base, addend, sum) - -static __forceinline char msvc32_add64carry_next(char carry, uint64_t base, - uint64_t addend, - uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - return add32carry_next(add32carry_next(carry, base_32l, addend_32l, sum32), - base_32h, addend_32h, sum32 + 1); -} -#define add64carry_next(carry, base, addend, sum) \ - msvc32_add64carry_next(carry, base, addend, sum) - -static __forceinline void msvc32_add64carry_last(char carry, uint64_t base, - uint64_t addend, - uint64_t *sum) { - uint32_t *const sum32 = (uint32_t *)sum; - const uint32_t base_32l = (uint32_t)base; - const uint32_t base_32h = (uint32_t)(base >> 32); - const uint32_t addend_32l = (uint32_t)addend; - const uint32_t addend_32h = (uint32_t)(addend >> 32); - add32carry_last(add32carry_next(carry, base_32l, addend_32l, sum32), base_32h, - addend_32h, sum32 + 1); -} -#define add64carry_last(carry, base, addend, sum) \ - msvc32_add64carry_last(carry, base, addend, sum) -#endif /* _MSC_FULL_VER >= 190024231 */ - -#elif defined(_M_ARM) -#define mul_32x32_64(a, b) _arm_umull(a, b) -#endif - -#pragma warning(pop) -#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ - has been removed */ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4127) /* conditional expression is constant */ -#pragma warning(disable : 4702) /* unreachable code */ -#endif /* Compiler */ - -#ifndef likely -#define likely(cond) (cond) -#endif -#ifndef unlikely -#define unlikely(cond) (cond) -#endif -#ifndef __maybe_unused -#define __maybe_unused -#endif -#ifndef __always_inline -#define __always_inline __inline -#endif -#ifndef unreachable -#define unreachable() \ - do { \ - } while (1) -#endif - -#ifndef bswap64 -#if defined(bswap_64) -#define bswap64 bswap_64 -#elif defined(__bswap_64) -#define bswap64 __bswap_64 -#else -static __always_inline uint64_t bswap64(uint64_t v) { - return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | - ((v << 24) & UINT64_C(0x0000ff0000000000)) | - ((v << 8) & UINT64_C(0x000000ff00000000)) | - ((v >> 8) & UINT64_C(0x00000000ff000000)) | - ((v >> 24) & UINT64_C(0x0000000000ff0000)) | - ((v >> 40) & UINT64_C(0x000000000000ff00)); -} -#endif -#endif /* bswap64 */ - -#ifndef bswap32 -#if defined(bswap_32) -#define bswap32 bswap_32 -#elif defined(__bswap_32) -#define bswap32 __bswap_32 -#else -static __always_inline uint32_t bswap32(uint32_t v) { - return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | - ((v >> 8) & UINT32_C(0x0000ff00)); -} -#endif -#endif /* bswap32 */ - -#ifndef bswap16 -#if defined(bswap_16) -#define bswap16 bswap_16 -#elif defined(__bswap_16) -#define bswap16 __bswap_16 -#else -static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } -#endif -#endif /* bswap16 */ - -#ifndef read_unaligned -#if defined(__GNUC__) || __has_attribute(packed) -typedef struct { - uint8_t unaligned_8; - uint16_t unaligned_16; - uint32_t unaligned_32; - uint64_t unaligned_64; -} __attribute__((packed)) t1ha_unaligned_proxy; -#define read_unaligned(ptr, bits) \ - (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ - t1ha_unaligned_proxy, unaligned_##bits))) \ - ->unaligned_##bits) -#elif defined(_MSC_VER) -#pragma warning( \ - disable : 4235) /* nonstandard extension used: '__unaligned' \ - * keyword not supported on this architecture */ -#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr)) -#else -#pragma pack(push, 1) -typedef struct { - uint8_t unaligned_8; - uint16_t unaligned_16; - uint32_t unaligned_32; - uint64_t unaligned_64; -} t1ha_unaligned_proxy; -#pragma pack(pop) -#define read_unaligned(ptr, bits) \ - (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof( \ - t1ha_unaligned_proxy, unaligned_##bits))) \ - ->unaligned_##bits) -#endif -#endif /* read_unaligned */ - -#ifndef read_aligned -#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits)) -#elif (__GNUC_PREREQ(3, 3) || __has_attribute(aligned)) && !defined(__clang__) -#define read_aligned(ptr, bits) \ - (*(const uint##bits##_t __attribute__((aligned(ALIGNMENT_##bits))) *)(ptr)) -#elif __has_attribute(assume_aligned) - -static __always_inline const - uint16_t *__attribute__((assume_aligned(ALIGNMENT_16))) - cast_aligned_16(const void *ptr) { - return (const uint16_t *)ptr; -} -static __always_inline const - uint32_t *__attribute__((assume_aligned(ALIGNMENT_32))) - cast_aligned_32(const void *ptr) { - return (const uint32_t *)ptr; -} -static __always_inline const - uint64_t *__attribute__((assume_aligned(ALIGNMENT_64))) - cast_aligned_64(const void *ptr) { - return (const uint64_t *)ptr; -} - -#define read_aligned(ptr, bits) (*cast_aligned_##bits(ptr)) - -#elif defined(_MSC_VER) -#define read_aligned(ptr, bits) \ - (*(const __declspec(align(ALIGNMENT_##bits)) uint##bits##_t *)(ptr)) -#else -#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr)) -#endif -#endif /* read_aligned */ - -#ifndef prefetch -#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) && \ - !defined(__ia32__) -#define prefetch(ptr) __builtin_prefetch(ptr) -#elif defined(_M_ARM64) || defined(_M_ARM) -#define prefetch(ptr) __prefetch(ptr) -#else -#define prefetch(ptr) \ - do { \ - (void)(ptr); \ - } while (0) -#endif -#endif /* prefetch */ - -#if __has_warning("-Wconstant-logical-operand") -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wconstant-logical-operand" -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wconstant-logical-operand" -#else -#pragma warning disable "constant-logical-operand" -#endif -#endif /* -Wconstant-logical-operand */ - -#if __has_warning("-Wtautological-pointer-compare") -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wtautological-pointer-compare" -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wtautological-pointer-compare" -#else -#pragma warning disable "tautological-pointer-compare" -#endif -#endif /* -Wtautological-pointer-compare */ - -/***************************************************************************/ - -#if __GNUC_PREREQ(4, 0) -#pragma GCC visibility push(hidden) -#endif /* __GNUC_PREREQ(4,0) */ - -/*---------------------------------------------------------- Little Endian */ - -#ifndef fetch16_le_aligned -static __always_inline uint16_t fetch16_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_16 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 16); -#else - return bswap16(read_aligned(v, 16)); -#endif -} -#endif /* fetch16_le_aligned */ - -#ifndef fetch16_le_unaligned -static __always_inline uint16_t fetch16_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - const uint8_t *p = (const uint8_t *)v; - return p[0] | (uint16_t)p[1] << 8; -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 16); -#else - return bswap16(read_unaligned(v, 16)); -#endif -} -#endif /* fetch16_le_unaligned */ - -#ifndef fetch32_le_aligned -static __always_inline uint32_t fetch32_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_32 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 32); -#else - return bswap32(read_aligned(v, 32)); -#endif -} -#endif /* fetch32_le_aligned */ - -#ifndef fetch32_le_unaligned -static __always_inline uint32_t fetch32_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return fetch16_le_unaligned(v) | - (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16; -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 32); -#else - return bswap32(read_unaligned(v, 32)); -#endif -} -#endif /* fetch32_le_unaligned */ - -#ifndef fetch64_le_aligned -static __always_inline uint64_t fetch64_le_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_64 == 0); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_aligned(v, 64); -#else - return bswap64(read_aligned(v, 64)); -#endif -} -#endif /* fetch64_le_aligned */ - -#ifndef fetch64_le_unaligned -static __always_inline uint64_t fetch64_le_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return fetch32_le_unaligned(v) | - (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32; -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return read_unaligned(v, 64); -#else - return bswap64(read_unaligned(v, 64)); -#endif -} -#endif /* fetch64_le_unaligned */ - -static __always_inline uint64_t tail64_le_aligned(const void *v, size_t tail) { - const uint8_t *const p = (const uint8_t *)v; -#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) - /* We can perform a 'oneshot' read, which is little bit faster. */ - const unsigned shift = ((8 - tail) & 7) << 3; - return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift); -#else - uint64_t r = 0; - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /* For most CPUs this code is better when not needed byte reordering. */ - case 0: - return fetch64_le_aligned(p); - case 7: - r = (uint64_t)p[6] << 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 32; - /* fall through */ - case 4: - return r + fetch32_le_aligned(p); - case 3: - r = (uint64_t)p[2] << 16; - /* fall through */ - case 2: - return r + fetch16_le_aligned(p); - case 1: - return p[0]; -#else - case 0: - r = p[7] << 8; - /* fall through */ - case 7: - r += p[6]; - r <<= 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 8; - /* fall through */ - case 4: - r += p[3]; - r <<= 8; - /* fall through */ - case 3: - r += p[2]; - r <<= 8; - /* fall through */ - case 2: - r += p[1]; - r <<= 8; - /* fall through */ - case 1: - return r + p[0]; -#endif - } -#endif /* T1HA_USE_FAST_ONESHOT_READ */ -} - -#if T1HA_USE_FAST_ONESHOT_READ && \ - T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE && \ - defined(PAGESIZE) && PAGESIZE > 42 && !defined(__SANITIZE_ADDRESS__) -#define can_read_underside(ptr, size) \ - (((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0) -#endif /* T1HA_USE_FAST_ONESHOT_READ */ - -static __always_inline uint64_t tail64_le_unaligned(const void *v, - size_t tail) { - const uint8_t *p = (const uint8_t *)v; -#if defined(can_read_underside) && \ - (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) - /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which - * is little bit faster. Thanks Marcin Żukowski - * for the reminder. */ - const unsigned offset = (8 - tail) & 7; - const unsigned shift = offset << 3; - if (likely(can_read_underside(p, 8))) { - p -= offset; - return fetch64_le_unaligned(p) >> shift; - } - return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift); -#else - uint64_t r = 0; - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \ - __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 0: - return fetch64_le_unaligned(p); - case 7: - r = (uint64_t)p[6] << 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 32; - /* fall through */ - case 4: - return r + fetch32_le_unaligned(p); - case 3: - r = (uint64_t)p[2] << 16; - /* fall through */ - case 2: - return r + fetch16_le_unaligned(p); - case 1: - return p[0]; -#else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. */ - case 0: - r = p[7] << 8; - /* fall through */ - case 7: - r += p[6]; - r <<= 8; - /* fall through */ - case 6: - r += p[5]; - r <<= 8; - /* fall through */ - case 5: - r += p[4]; - r <<= 8; - /* fall through */ - case 4: - r += p[3]; - r <<= 8; - /* fall through */ - case 3: - r += p[2]; - r <<= 8; - /* fall through */ - case 2: - r += p[1]; - r <<= 8; - /* fall through */ - case 1: - return r + p[0]; -#endif - } -#endif /* can_read_underside */ -} - -/*------------------------------------------------------------- Big Endian */ - -#ifndef fetch16_be_aligned -static __maybe_unused __always_inline uint16_t -fetch16_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_16 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 16); -#else - return bswap16(read_aligned(v, 16)); -#endif -} -#endif /* fetch16_be_aligned */ - -#ifndef fetch16_be_unaligned -static __maybe_unused __always_inline uint16_t -fetch16_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - const uint8_t *p = (const uint8_t *)v; - return (uint16_t)p[0] << 8 | p[1]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 16); -#else - return bswap16(read_unaligned(v, 16)); -#endif -} -#endif /* fetch16_be_unaligned */ - -#ifndef fetch32_be_aligned -static __maybe_unused __always_inline uint32_t -fetch32_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_32 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 32); -#else - return bswap32(read_aligned(v, 32)); -#endif -} -#endif /* fetch32_be_aligned */ - -#ifndef fetch32_be_unaligned -static __maybe_unused __always_inline uint32_t -fetch32_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return (uint32_t)fetch16_be_unaligned(v) << 16 | - fetch16_be_unaligned((const uint8_t *)v + 2); -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 32); -#else - return bswap32(read_unaligned(v, 32)); -#endif -} -#endif /* fetch32_be_unaligned */ - -#ifndef fetch64_be_aligned -static __maybe_unused __always_inline uint64_t -fetch64_be_aligned(const void *v) { - assert(((uintptr_t)v) % ALIGNMENT_64 == 0); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_aligned(v, 64); -#else - return bswap64(read_aligned(v, 64)); -#endif -} -#endif /* fetch64_be_aligned */ - -#ifndef fetch64_be_unaligned -static __maybe_unused __always_inline uint64_t -fetch64_be_unaligned(const void *v) { -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE - return (uint64_t)fetch32_be_unaligned(v) << 32 | - fetch32_be_unaligned((const uint8_t *)v + 4); -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return read_unaligned(v, 64); -#else - return bswap64(read_unaligned(v, 64)); -#endif -} -#endif /* fetch64_be_unaligned */ - -static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v, - size_t tail) { - const uint8_t *const p = (const uint8_t *)v; -#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__) - /* We can perform a 'oneshot' read, which is little bit faster. */ - const unsigned shift = ((8 - tail) & 7) << 3; - return fetch64_be_aligned(p) >> shift; -#else - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* For most CPUs this code is better when not byte reordering. */ - case 1: - return p[0]; - case 2: - return fetch16_be_aligned(p); - case 3: - return (uint32_t)fetch16_be_aligned(p) << 8 | p[2]; - case 4: - return fetch32_be_aligned(p); - case 5: - return (uint64_t)fetch32_be_aligned(p) << 8 | p[4]; - case 6: - return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4); - case 7: - return (uint64_t)fetch32_be_aligned(p) << 24 | - (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6]; - case 0: - return fetch64_be_aligned(p); -#else - case 1: - return p[0]; - case 2: - return p[1] | (uint32_t)p[0] << 8; - case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; - case 4: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; - case 5: - return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | - (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; - case 6: - return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | - (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; - case 7: - return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | - (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | - (uint64_t)p[0] << 48; - case 0: - return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | - (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | - (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; -#endif - } -#endif /* T1HA_USE_FAST_ONESHOT_READ */ -} - -static __maybe_unused __always_inline uint64_t -tail64_be_unaligned(const void *v, size_t tail) { - const uint8_t *p = (const uint8_t *)v; -#if defined(can_read_underside) && \ - (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) - /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which - * is little bit faster. Thanks Marcin Żukowski - * for the reminder. */ - const unsigned offset = (8 - tail) & 7; - const unsigned shift = offset << 3; - if (likely(can_read_underside(p, 8))) { - p -= offset; - return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift); - } - return fetch64_be_unaligned(p) >> shift; -#else - switch (tail & 7) { - default: - unreachable(); -/* fall through */ -#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT && \ - __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - /* For most CPUs this code is better when not needed - * copying for alignment or byte reordering. */ - case 1: - return p[0]; - case 2: - return fetch16_be_unaligned(p); - case 3: - return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2]; - case 4: - return fetch32_be(p); - case 5: - return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4]; - case 6: - return (uint64_t)fetch32_be_unaligned(p) << 16 | - fetch16_be_unaligned(p + 4); - case 7: - return (uint64_t)fetch32_be_unaligned(p) << 24 | - (uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6]; - case 0: - return fetch64_be_unaligned(p); -#else - /* For most CPUs this code is better than a - * copying for alignment and/or byte reordering. */ - case 1: - return p[0]; - case 2: - return p[1] | (uint32_t)p[0] << 8; - case 3: - return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16; - case 4: - return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 | - (uint32_t)p[0] << 24; - case 5: - return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 | - (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32; - case 6: - return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 | - (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40; - case 7: - return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 | - (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 | - (uint64_t)p[0] << 48; - case 0: - return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 | - (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 | - (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56; -#endif - } -#endif /* can_read_underside */ -} - -/***************************************************************************/ - -#ifndef rot64 -static __always_inline uint64_t rot64(uint64_t v, unsigned s) { - return (v >> s) | (v << (64 - s)); -} -#endif /* rot64 */ - -#ifndef mul_32x32_64 -static __always_inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { - return a * (uint64_t)b; -} -#endif /* mul_32x32_64 */ - -#ifndef add64carry_first -static __maybe_unused __always_inline unsigned -add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, 0, &carryout); - return (unsigned)carryout; -#else - *sum = base + addend; - return *sum < addend; -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_fist */ - -#ifndef add64carry_next -static __maybe_unused __always_inline unsigned -add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, carry, &carryout); - return (unsigned)carryout; -#else - *sum = base + addend + carry; - return *sum < addend || (carry && *sum == addend); -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_next */ - -#ifndef add64carry_last -static __maybe_unused __always_inline void -add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) { -#if __has_builtin(__builtin_addcll) - unsigned long long carryout; - *sum = __builtin_addcll(base, addend, carry, &carryout); - (void)carryout; -#else - *sum = base + addend + carry; -#endif /* __has_builtin(__builtin_addcll) */ -} -#endif /* add64carry_last */ - -#ifndef mul_64x64_128 -static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a, - uint64_t b, - uint64_t *h) { -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t r = (__uint128_t)a * (__uint128_t)b; - /* modern GCC could nicely optimize this */ - *h = (uint64_t)(r >> 64); - return (uint64_t)r; -#elif defined(mul_64x64_high) - *h = mul_64x64_high(a, b); - return a * b; -#else - /* performs 64x64 to 128 bit multiplication */ - const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); - const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); - const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); - const uint64_t hh = mul_32x32_64(a >> 32, b >> 32); - - /* Few simplification are possible here for 32-bit architectures, - * but thus we would lost compatibility with the original 64-bit - * version. Think is very bad idea, because then 32-bit t1ha will - * still (relatively) very slowly and well yet not compatible. */ - uint64_t l; - add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h); - add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h); - return l; -#endif -} -#endif /* mul_64x64_128() */ - -#ifndef mul_64x64_high -static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a, - uint64_t b) { - uint64_t h; - mul_64x64_128(a, b, &h); - return h; -} -#endif /* mul_64x64_high */ - -/***************************************************************************/ - -/* 'magic' primes */ -static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB); -static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39); -static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B); -static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571); -static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB); -static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345); -static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31); - -/* xor high and low parts of full 128-bit product */ -static __maybe_unused __always_inline uint64_t mux64(uint64_t v, - uint64_t prime) { - uint64_t l, h; - l = mul_64x64_128(v, prime, &h); - return l ^ h; -} - -static __always_inline uint64_t final64(uint64_t a, uint64_t b) { - uint64_t x = (a + rot64(b, 41)) * prime_0; - uint64_t y = (rot64(a, 23) + b) * prime_6; - return mux64(x ^ y, prime_5); -} - -static __always_inline void mixup64(uint64_t *__restrict a, - uint64_t *__restrict b, uint64_t v, - uint64_t prime) { - uint64_t h; - *a ^= mul_64x64_128(*b + v, prime, &h); - *b += h; -} - -/***************************************************************************/ - -typedef union t1ha_uint128 { -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - __uint128_t v; -#endif - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - uint64_t l, h; -#else - uint64_t h, l; -#endif - }; -} t1ha_uint128_t; - -static __always_inline t1ha_uint128_t not128(const t1ha_uint128_t v) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = ~v.v; -#else - r.l = ~v.l; - r.h = ~v.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t left128(const t1ha_uint128_t v, - unsigned s) { - t1ha_uint128_t r; - assert(s < 128); -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = v.v << s; -#else - r.l = (s < 64) ? v.l << s : 0; - r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64); -#endif - return r; -} - -static __always_inline t1ha_uint128_t right128(const t1ha_uint128_t v, - unsigned s) { - t1ha_uint128_t r; - assert(s < 128); -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = v.v >> s; -#else - r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64); - r.h = (s < 64) ? v.h >> s : 0; -#endif - return r; -} - -static __always_inline t1ha_uint128_t or128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v | y.v; -#else - r.l = x.l | y.l; - r.h = x.h | y.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v ^ y.v; -#else - r.l = x.l ^ y.l; - r.h = x.h ^ y.h; -#endif - return r; -} - -static __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v, unsigned s) { - s &= 127; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - v.v = (v.v << (128 - s)) | (v.v >> s); - return v; -#else - return s ? or128(left128(v, 128 - s), right128(v, s)) : v; -#endif -} - -static __always_inline t1ha_uint128_t add128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v + y.v; -#else - add64carry_last(add64carry_first(x.l, y.l, &r.l), x.h, y.h, &r.h); -#endif - return r; -} - -static __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x, - t1ha_uint128_t y) { - t1ha_uint128_t r; -#if defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - r.v = x.v * y.v; -#else - r.l = mul_64x64_128(x.l, y.l, &r.h); - r.h += x.l * y.h + y.l * x.h; -#endif - return r; -} - -/***************************************************************************/ - -#if T1HA0_AESNI_AVAILABLE && defined(__ia32__) -uint64_t t1ha_ia32cpu_features(void); - -static __always_inline bool t1ha_ia32_AESNI_avail(uint64_t ia32cpu_features) { - /* check for AES-NI */ - return (ia32cpu_features & UINT32_C(0x02000000)) != 0; -} - -static __always_inline bool t1ha_ia32_AVX_avail(uint64_t ia32cpu_features) { - /* check for any AVX */ - return (ia32cpu_features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000); -} - -static __always_inline bool t1ha_ia32_AVX2_avail(uint64_t ia32cpu_features) { - /* check for 'Advanced Vector Extensions 2' */ - return ((ia32cpu_features >> 32) & 32) != 0; -} - -#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */