From 8f3b145fe6bee5d6161b9d9733dfab6b5e981dcd Mon Sep 17 00:00:00 2001
From: tevador <tevador@gmail.com>
Date: Sun, 11 Nov 2018 13:05:34 +0100
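Subject: [PATCH] Added DRAM buffer option to rx2c

When the generated C program is compiled with RAM defined, it allocates
a DRAM_SIZE buffer, fills it with aesInitialize and serves MMU reads
from that buffer; otherwise reads keep using the computed value.

Also moves the stack pointer out of mmu_t into a local variable of
main() and triggers the next-block prefetch when m0 reaches offset 0x80
of the block instead of 0xC0, updating README.md to match.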
---
 README.md     |  8 ++++----
 tests/rx2c.py | 46 +++++++++++++++++++++++++++++++---------------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 63f549c..8499820 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,10 @@ The VM has access to 4 GiB of external memory in read-only mode. The DRAM memory
 *The DRAM blob can be generated in 0.1-0.3 seconds using 8 threads with hardware-accelerated AES and dual channel DDR3 or DDR4 memory. Dual channel DDR4 memory has enough bandwidth to support up to 16 mining threads.*
 
 #### MMU
-The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). When the current block has been consumed, reading jumps to a random block. The address of the next block is calculated 8 reads before the current block is exhausted to enable efficient prefetching. The MMU uses three internal registers:
+The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). When the current block has been consumed, reading jumps to a random block. The address of the next block is calculated 16 reads before the current block is exhausted to enable efficient prefetching. The MMU uses three internal registers:
 * **m0** - Address of the next quadword to be read from memory (32-bit, 8-byte aligned).
 * **m1** - Address of the next block to be read from memory (32-bit, 256-byte aligned).
-* **mx** - Random 32-bit counter that determines the address of the next block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the 24th quadword of the current block is read (the value of the `m0` register ends with `0xC0`), the value of the `mx` register is copied into register `m1` and the last 8 bits of `m1` are cleared.
+* **mx** - Random 32-bit counter that determines the address of the next block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the 16th quadword of the current block is read (the value of the `m0` register ends with `0x80`), the value of the `mx` register is copied into register `m1` and the last 8 bits of `m1` are cleared.
 
 *When the value of the `m1` register is changed, the memory location can be preloaded into CPU cache using the x86 `PREFETCH` instruction or ARM `PRFM` instruction. Implicit prefetch should ensure that sequentially accessed memory is already in the cache.*
 
@@ -169,7 +169,7 @@ A 32-bit address mask that is used to calculate the write address for the C oper
 |147-157|ROR_64|no|64|6|A >>> B|64|
 
 ##### 32-bit operations
-Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations are 32 bits long and bits 32-63 of C are zero.
+Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations is 32 bits long and bits 32-63 of C are zero.
 
 ##### Multiplication
 There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result.
@@ -246,7 +246,7 @@ The RET instruction behaves like "not taken" when the stack is empty. Taken RET
 The program is initialized from a 256-bit seed value `S`.
 1. A [pcg32](http://www.pcg-random.org/)  random number generator is initialized with state `S[63:0]`.
 2. The generator is used to generate random 128 bytes `R1`.
-3. Integer registers `r0`-`r7` are initialized using bytes 0-63 bytes of `R1`.
+3. Integer registers `r0`-`r7` are initialized using bytes 0-63 of `R1`.
 4. Floating point registers `f0`-`f7` are initialized using bytes 64-127 of `R1` interpreted as 8 64-bit signed integers converted to a double precision floating point format.
 5. The initial value of the `m0` register is set to `S[95:64]` and the the last 8 bits are cleared (256-byte aligned).
 6. `S` is expanded into 10 AES round keys `K0`-`K9`.
diff --git a/tests/rx2c.py b/tests/rx2c.py
index e8be471..5f44ba6 100644
--- a/tests/rx2c.py
+++ b/tests/rx2c.py
@@ -100,6 +100,14 @@ def getRegister(num, type):
     return registers.get(type).format(num)
 
 def writeInitialValues(file):
+    file.write("#ifdef RAM\n")
+    file.write("\tmmu.buffer = (char*)malloc(DRAM_SIZE);\n")
+    file.write("\tif(!mmu.buffer) {\n")
+    file.write('\t\tprintf("DRAM buffer allocation failed\\n");\n')
+    file.write("\t\treturn 1; }\n")
+    file.write("\t\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)mmu.buffer, DRAM_SIZE);\n")
+    file.write('\t\tprintf("DRAM buffer initialized successfully\\n");\n')
+    file.write("#endif\n")
     file.write("\tclock_t clockStart = clock(), clockEnd;\n")
     for i in range(8):
         file.write("\tr{0} = *(uint64_t*)(aesSeed + {1});\n".format(i, i * 8))
@@ -108,7 +116,7 @@ def writeInitialValues(file):
     file.write("\tmmu.m0 = (aesKey[9] << 8) | (aesKey[10] << 16) | (aesKey[11] << 24);\n")
     file.write("\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)scratchpad, SCRATCHPAD_SIZE);\n")
     file.write("\tmmu.mx = 0;\n")
-    file.write("\tmmu.sp = 0;\n")
+    file.write("\tsp = 0;\n")
     file.write("\tic = {0};\n".format(INSTRUCTION_COUNT))
     file.write("\tmxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK; //flush denormals to zero, round to nearest\n")
     file.write("\t_mm_setcsr(mxcsr);\n")
@@ -619,7 +627,7 @@ def writeMain(file):
     file.write(('__attribute__((optimize("Os"))) int main() {\n'
                 "	register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n"
                 "	register double f0, f1, f2, f3, f4, f5, f6, f7;\n"
-                "	register uint64_t ic;\n"
+                "	register uint64_t ic, sp;\n"
                 "	convertible_t scratchpad[SCRATCHPAD_LENGTH];\n"
                 "	stack_t stack[STACK_LENGTH];\n"
                 "	mmu_t mmu;\n"
@@ -637,6 +645,7 @@ def writeProlog(file):
                 "typedef uint32_t addr_t;\n"
                 "typedef unsigned __int128 uint128_t;\n"
                 "typedef __int128 int128_t;\n"
+                "typedef unsigned char byte;\n"
                 "typedef union {\n"
                 "	double f64;\n"
                 "	int64_t i64;\n"
@@ -652,8 +661,11 @@ def writeProlog(file):
                 "	addr_t m0;\n"
                 "	addr_t m1;\n"
                 "	addr_t mx;\n"
-                "	uint32_t sp;\n"
+                "#ifdef RAM\n"
+                "	const char* buffer;\n"
+                "#endif\n"
                 "} mmu_t;\n"
+                "#define DRAM_SIZE (1UL << 32)\n"
                 "#define SCRATCHPAD_SIZE (256 * 1024)\n"
                 "#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n"
                 "#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n"
@@ -661,22 +673,26 @@ def writeProlog(file):
                 "#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK14]\n"
                 "#define SCRATCHPAD_256K(x) scratchpad[(x) & SCRATCHPAD_MASK18]\n"
                 "#define STACK_LENGTH (32 * 1024)\n"
-                "#define DRAM(x) __rolq(6364136223846793005ULL*(x)+1442695040888963407ULL,32)\n"
-                "//#define PREFETCH(x) _mm_prefetch(x, _MM_HINT_T0)\n"
+                "#ifdef RAM\n"
+                "#define DRAM_READ(mmu) (convertible_t)*(uint64_t*)((mmu)->buffer + (mmu)->m0)\n"
+                "#define PREFETCH(mmu) _mm_prefetch(((mmu)->buffer + (mmu)->m1), _MM_HINT_T0)\n"
+                "#else\n"
+                "#define DRAM_READ(mmu) (convertible_t)(uint64_t)__rolq(6364136223846793005ULL*((mmu)->m0)+1442695040888963407ULL,32)\n"
                 "#define PREFETCH(x)\n"
-                "#define PUSH_VALUE(x) stack[mmu.sp++].value = x\n"
-                "#define PUSH_ADDRESS(x) stack[mmu.sp++].address = x\n"
-                "#define STACK_IS_EMPTY() (mmu.sp == 0)\n"
-                "#define POP_VALUE() stack[--mmu.sp].value\n"
-                "#define POP_ADDRESS() stack[--mmu.sp].address\n"
+                "#endif\n"
+                "#define PUSH_VALUE(x) stack[sp++].value = x\n"
+                "#define PUSH_ADDRESS(x) stack[sp++].address = x\n"
+                "#define STACK_IS_EMPTY() (sp == 0)\n"
+                "#define POP_VALUE() stack[--sp].value\n"
+                "#define POP_ADDRESS() stack[--sp].address\n"
                 "static convertible_t readDram(mmu_t* mmu, addr_t addr) {\n"
                 "	convertible_t data;\n"
-                "	data.u64 = DRAM(mmu->m0); //TODO\n"
+                "	data = DRAM_READ(mmu);\n"
                 "	mmu->m0 += 8;\n"
                 "	mmu->mx ^= addr;\n"
-                "	if((mmu->m0 & 255) == 192) {\n"
+                "	if((mmu->m0 & 255) == 128) {\n"
                 "		mmu->m1 = mmu->mx & 0xFFFFFF00;\n"
-                "		PREFETCH(mmu->m1); //TODO\n"
+                "		PREFETCH(mmu);\n"
                 "	}\n"
                 "	if((mmu->m0 & 255) == 0)\n"
                 "		mmu->m0 = mmu->m1;\n"
@@ -772,8 +788,8 @@ def writeProlog(file):
 
 with sys.stdout as file:
     writeProlog(file)
-    file.write("const unsigned char aesKey[32] = {{ {0} }};\n".format(genBytes(32)))
-    file.write("const unsigned char aesSeed[128] = {{ {0} }};\n".format(genBytes(128)))
+    file.write("const byte aesKey[32] = {{ {0} }};\n".format(genBytes(32)))
+    file.write("const byte aesSeed[128] = {{ {0} }};\n".format(genBytes(128)))
     writeMain(file)
     writeInitialValues(file)
     for i in range(PROGRAM_SIZE):