From 8f3b145fe6bee5d6161b9d9733dfab6b5e981dcd Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 11 Nov 2018 13:05:34 +0100 Subject: [PATCH] Added DRAM buffer option to rx2c --- README.md | 8 ++++---- tests/rx2c.py | 46 +++++++++++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 63f549c..8499820 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,10 @@ The VM has access to 4 GiB of external memory in read-only mode. The DRAM memory *The DRAM blob can be generated in 0.1-0.3 seconds using 8 threads with hardware-accelerated AES and dual channel DDR3 or DDR4 memory. Dual channel DDR4 memory has enough bandwidth to support up to 16 mining threads.* #### MMU -The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). When the current block has been consumed, reading jumps to a random block. The address of the next block is calculated 8 reads before the current block is exhausted to enable efficient prefetching. The MMU uses three internal registers: +The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). When the current block has been consumed, reading jumps to a random block. The address of the next block is calculated 16 reads before the current block is exhausted to enable efficient prefetching. The MMU uses three internal registers: * **m0** - Address of the next quadword to be read from memory (32-bit, 8-byte aligned). * **m1** - Address of the next block to be read from memory (32-bit, 256-byte aligned). -* **mx** - Random 32-bit counter that determines the address of the next block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the 24th quadword of the current block is read (the value of the `m0` register ends with `0xC0`), the value of the `mx` register is copied into register `m1` and the last 8 bits of `m1` are cleared. +* **mx** - Random 32-bit counter that determines the address of the next block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the 16th quadword of the current block is read (the value of the `m0` register ends with `0x80`), the value of the `mx` register is copied into register `m1` and the last 8 bits of `m1` are cleared. *When the value of the `m1` register is changed, the memory location can be preloaded into CPU cache using the x86 `PREFETCH` instruction or ARM `PRFM` instruction. Implicit prefetch should ensure that sequentially accessed memory is already in the cache.* @@ -169,7 +169,7 @@ A 32-bit address mask that is used to calculate the write address for the C oper |147-157|ROR_64|no|64|6|A >>> B|64| ##### 32-bit operations -Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations are 32 bits long and bits 32-63 of C are zero. +Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are zero. ##### Multiplication There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result. @@ -246,7 +246,7 @@ The RET instruction behaves like "not taken" when the stack is empty. Taken RET The program is initialized from a 256-bit seed value `S`. 1. A [pcg32](http://www.pcg-random.org/) random number generator is initialized with state `S[63:0]`. 2. The generator is used to generate random 128 bytes `R1`. -3. Integer registers `r0`-`r7` are initialized using bytes 0-63 bytes of `R1`. +3. Integer registers `r0`-`r7` are initialized using bytes 0-63 of `R1`. 4. Floating point registers `f0`-`f7` are initialized using bytes 64-127 of `R1` interpreted as 8 64-bit signed integers converted to a double precision floating point format. 5. The initial value of the `m0` register is set to `S[95:64]` and the the last 8 bits are cleared (256-byte aligned). 6. `S` is expanded into 10 AES round keys `K0`-`K9`. diff --git a/tests/rx2c.py b/tests/rx2c.py index e8be471..5f44ba6 100644 --- a/tests/rx2c.py +++ b/tests/rx2c.py @@ -100,6 +100,14 @@ def getRegister(num, type): return registers.get(type).format(num) def writeInitialValues(file): + file.write("#ifdef RAM\n") + file.write("\tmmu.buffer = (char*)malloc(DRAM_SIZE);\n") + file.write("\tif(!mmu.buffer) {\n") + file.write('\t\tprintf("DRAM buffer allocation failed\\n");\n') + file.write("\t\treturn 1; }\n") + file.write("\t\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)mmu.buffer, DRAM_SIZE);\n") + file.write('\t\tprintf("DRAM buffer initialized successfully\\n");\n') + file.write("#endif\n") file.write("\tclock_t clockStart = clock(), clockEnd;\n") for i in range(8): file.write("\tr{0} = *(uint64_t*)(aesSeed + {1});\n".format(i, i * 8)) @@ -108,7 +116,7 @@ def writeInitialValues(file): file.write("\tmmu.m0 = (aesKey[9] << 8) | (aesKey[10] << 16) | (aesKey[11] << 24);\n") file.write("\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)scratchpad, SCRATCHPAD_SIZE);\n") file.write("\tmmu.mx = 0;\n") - file.write("\tmmu.sp = 0;\n") + file.write("\tsp = 0;\n") file.write("\tic = {0};\n".format(INSTRUCTION_COUNT)) file.write("\tmxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK; //flush denormals to zero, round to nearest\n") file.write("\t_mm_setcsr(mxcsr);\n") @@ -619,7 +627,7 @@ def writeMain(file): file.write(('__attribute__((optimize("Os"))) int main() {\n' " register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n" " register double f0, f1, f2, f3, f4, f5, f6, f7;\n" - " register uint64_t ic;\n" + " register uint64_t ic, sp;\n" " convertible_t scratchpad[SCRATCHPAD_LENGTH];\n" " stack_t stack[STACK_LENGTH];\n" " mmu_t mmu;\n" @@ -637,6 +645,7 @@ def writeProlog(file): "typedef uint32_t addr_t;\n" "typedef unsigned __int128 uint128_t;\n" "typedef __int128 int128_t;\n" + "typedef unsigned char byte;\n" "typedef union {\n" " double f64;\n" " int64_t i64;\n" @@ -652,8 +661,11 @@ def writeProlog(file): " addr_t m0;\n" " addr_t m1;\n" " addr_t mx;\n" - " uint32_t sp;\n" + "#ifdef RAM\n" + " const char* buffer;\n" + "#endif\n" "} mmu_t;\n" + "#define DRAM_SIZE (1UL << 32)\n" "#define SCRATCHPAD_SIZE (256 * 1024)\n" "#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n" "#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n" @@ -661,22 +673,26 @@ def writeProlog(file): "#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK14]\n" "#define SCRATCHPAD_256K(x) scratchpad[(x) & SCRATCHPAD_MASK18]\n" "#define STACK_LENGTH (32 * 1024)\n" - "#define DRAM(x) __rolq(6364136223846793005ULL*(x)+1442695040888963407ULL,32)\n" - "//#define PREFETCH(x) _mm_prefetch(x, _MM_HINT_T0)\n" + "#ifdef RAM\n" + "#define DRAM_READ(mmu) (convertible_t)*(uint64_t*)((mmu)->buffer + (mmu)->m0)\n" + "#define PREFETCH(mmu) _mm_prefetch(((mmu)->buffer + (mmu)->m1), _MM_HINT_T0)\n" + "#else\n" + "#define DRAM_READ(mmu) (convertible_t)(uint64_t)__rolq(6364136223846793005ULL*((mmu)->m0)+1442695040888963407ULL,32)\n" "#define PREFETCH(x)\n" - "#define PUSH_VALUE(x) stack[mmu.sp++].value = x\n" - "#define PUSH_ADDRESS(x) stack[mmu.sp++].address = x\n" - "#define STACK_IS_EMPTY() (mmu.sp == 0)\n" - "#define POP_VALUE() stack[--mmu.sp].value\n" - "#define POP_ADDRESS() stack[--mmu.sp].address\n" + "#endif\n" + "#define PUSH_VALUE(x) stack[sp++].value = x\n" + "#define PUSH_ADDRESS(x) stack[sp++].address = x\n" + "#define STACK_IS_EMPTY() (sp == 0)\n" + "#define POP_VALUE() stack[--sp].value\n" + "#define POP_ADDRESS() stack[--sp].address\n" "static convertible_t readDram(mmu_t* mmu, addr_t addr) {\n" " convertible_t data;\n" - " data.u64 = DRAM(mmu->m0); //TODO\n" + " data = DRAM_READ(mmu);\n" " mmu->m0 += 8;\n" " mmu->mx ^= addr;\n" - " if((mmu->m0 & 255) == 192) {\n" + " if((mmu->m0 & 255) == 128) {\n" " mmu->m1 = mmu->mx & 0xFFFFFF00;\n" - " PREFETCH(mmu->m1); //TODO\n" + " PREFETCH(mmu);\n" " }\n" " if((mmu->m0 & 255) == 0)\n" " mmu->m0 = mmu->m1;\n" @@ -772,8 +788,8 @@ def writeProlog(file): with sys.stdout as file: writeProlog(file) - file.write("const unsigned char aesKey[32] = {{ {0} }};\n".format(genBytes(32))) - file.write("const unsigned char aesSeed[128] = {{ {0} }};\n".format(genBytes(128))) + file.write("const byte aesKey[32] = {{ {0} }};\n".format(genBytes(32))) + file.write("const byte aesSeed[128] = {{ {0} }};\n".format(genBytes(128))) writeMain(file) writeInitialValues(file) for i in range(PROGRAM_SIZE):