From cf59ced79543920c0ae5eedfb0aed6c78395d05c Mon Sep 17 00:00:00 2001
From: tevador <tevador@gmail.com>
Date: Fri, 9 Nov 2018 21:05:45 +0100
Subject: [PATCH] RandomX C generator Updated specification

---
 README.md     | 151 ++++++----
 tests/rx2c.py | 787 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 876 insertions(+), 62 deletions(-)
 create mode 100644 tests/rx2c.py

diff --git a/README.md b/README.md
index 535da2d..90c954c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 
+
 # RandomX
 RandomX ("random ex") is an experimental proof of work (PoW) algorithm that uses random code execution to achieve ASIC resistance.
 
@@ -17,12 +18,12 @@ The VM has access to 4 GiB of external memory in read-only mode. The DRAM memory
 *The DRAM blob can be generated in 0.1-0.3 seconds using 8 threads with hardware-accelerated AES and dual channel DDR3 or DDR4 memory. Dual channel DDR4 memory has enough bandwidth to support up to 16 mining threads.*
 
 #### MMU
-The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). Blocks are read mostly sequentially apart from occasional random jumps that happen on average every 256 blocks. The address of the next block to be read is determined 1 block ahead of time to enable efficient prefetching. The MMU uses three internal registers:
+The memory management unit (MMU) interfaces the CPU with the DRAM blob. The purpose of the MMU is to translate the random memory accesses generated by the random program into a DRAM-friendly access pattern, where memory reads are not bound by access latency. The MMU accepts a 32-bit address `addr` and outputs a 64-bit value from DRAM. The MMU splits the 4 GiB DRAM blob into 256-byte blocks. Data within one block is always read sequentially in 32 reads (32×8 bytes). When a block has been consumed, reading jumps to a random block. The address of the next block is calculated 8 reads before the current block is exhausted to enable efficient prefetching. The MMU uses three internal registers:
 * **m0** - Address of the next quadword to be read from memory (32-bit, 8-byte aligned).
 * **m1** - Address of the next block to be read from memory (32-bit, 256-byte aligned).
-* **mx** - Random 32-bit counter that determines if reading continues sequentially or jumps to a random block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the last quadword of the current block is read (the value of the `m0` register ends with `0xFF`), the MMU checks if the last 8 bits of `mx` are zero. If yes, the value of the `mx` register is copied into register `m1`.
+* **mx** - Random 32-bit counter that determines the address of the next block. After each read, the read address is mixed with the counter: `mx ^= addr`. When the 24th quadword of the current block is read (the value of the `m0` register ends with `0xC0`), the value of the `mx` register is copied into register `m1` and the last 8 bits of `m1` are cleared.
 
-*When the value of the `m1` register is changed, the memory location can be preloaded into CPU cache using the x86 `PREFETCH` instruction or ARM `PRFM` instruction. The average length of a sequential DRAM read is 64 KiB. Implicit prefetch should ensure that sequentially accessed memory is already in the cache.*
+*When the value of the `m1` register is changed, the memory location can be preloaded into CPU cache using the x86 `PREFETCH` instruction or ARM `PRFM` instruction. Implicit prefetch should ensure that sequentially accessed memory is already in the cache.*
 
 #### Scratchpad
 The VM contains a 256 KiB scratchpad, which is accessed randomly both for reading and writing. The scratchpad is split into two segments (16 KiB and 240 KiB). 75% of accesses are into the first 16 KiB.
@@ -32,7 +33,7 @@ The VM contains a 256 KiB scratchpad, which is accessed randomly both for readin
 #### Program
 The actual program is stored in a 8 KiB ring buffer structure. Each program consists of 1024 random 64-bit instructions. The ring buffer structure makes sure that the program forms a closed infinite loop.
 
-*For high-performance mining, the program should be translated directly into machine code. The whole program will typically fit into the L1 instruction cache and hot execution paths should stay in the µOP cache that is used by newer x86 CPUs. This should limit the number of front-end stalls and keep the CPU busy most of the time.*
+*For high-performance mining, the program should be translated directly into machine code. The whole program should fit into the L1 instruction cache and hot execution paths should stay in the µOP cache that is used by newer x86 CPUs. This should limit the number of front-end stalls and keep the CPU busy most of the time.*
 
 #### Control unit
 The control unit (CU) controls the execution of the program. It reads instructions from the program buffer and sends commands to the other units. The CU contains 3 internal registers:
@@ -43,7 +44,7 @@ The control unit (CU) controls the execution of the program. It reads instructio
 *Fixed number of executed instructions per program should ensure roughly equal runtime of each random program.*
 
 #### Stack
-To simulate function calls, the VM uses a stack structure. The program interacts with the stack using the CALL, DCALL and RET instructions. The stack has unlimited size and each stack element is 64 bits wide.
+To simulate function calls, the VM uses a stack structure. The program interacts with the stack using the CALL and RET instructions. The stack has unlimited size and each stack element is 64 bits wide.
 
 #### Register file
 The VM has 8 integer registers `r0`-`r7` (each 64 bits wide), 8 floating point registers `f0`-`f7` (each 64 bits wide) and 4 memory address registers `g0`-`g3` (each 32 bits wide).
@@ -56,18 +57,21 @@ The arithmetic logic unit (ALU) performs integer operations. The ALU can perform
 #### FPU
 The floating-point unit performs IEEE-754 compliant math using 64-bit double precision floating point numbers.
 
+#### Endianness
+The VM stores and loads all data in little-endian byte order.
+
 ## Instruction set
 The 64-bit instruction is encoded as follows:
 
 ![Imgur](https://i.imgur.com/FwYyKBB.png)
 
 #### Opcode (8 bits)
-There are 256 opcodes, which are distributed between various operations depending on their weight (how often they will occur in the program on average). The distribution of opcodes is following:
+There are 256 opcodes, which are distributed between various operations depending on their weight (how often they will occur in the program on average). The distribution of opcodes is following (TBD):
 
 |operation|number of opcodes||
 |---------|-----------------|----|
-|ALU operations|TBD|TBD|
-|FPU operations|TBD|TBD|
+|ALU operations|158|61.7%|
+|FPU operations|66|25.8%|
 |Control flow |32|12.5%|
 
 #### Operand a (8 bits)
@@ -99,7 +103,7 @@ Pseudocode:
 FUNCTION GET_ADDRESS
 	r(a) ^= g0
 	addr = r(a)
-	r(a) <<< 32
+	r(a) <<<= 32
 	g0 = g1
 	g1 = g2
 	g2 = g3
@@ -108,8 +112,7 @@ FUNCTION GET_ADDRESS
 	return addr
 END FUNCTION
 ```
-*The rotation of registers `g0`-`g3` can be performed with a single `SHUFPS` x86 instruction.*
-
+*The rotation of registers `g0`-`g3` can be performed with a single `PSHUFD` x86 instruction.*
 
 
 #### Operand b (8 bits)
@@ -130,10 +133,10 @@ END FUNCTION
 
 The `x(b)` flag encodes a register. For ALU operations, this is an integer register (`r0`-`r7`) and for FPU operations, it's a floating point register (`f0`-`f7`).
 
-`imm1` is a 32-bit immediate value encoded within the instruction. For ALU instructions that use operands shorter than 32 bits, the value is truncated. For operands larger than 32 bits, the value is zero-extended for unsigned instructions and sign-extended for signed instructions. For FPU instructions, the value is treated as a signed 32-bit integer, first converted to a single precision floating point format and then to a double precision format.
+`imm1` is a 32-bit immediate value encoded within the instruction. For ALU instructions that use operands shorter than 32 bits, the value is truncated. For operands larger than 32 bits, the value is zero-extended for unsigned instructions and sign-extended for signed instructions. For FPU instructions, the value is treated as a signed 32-bit integer and converted to a double precision floating point format.
 
 #### imm0 (8 bits)
-An 8-bit immediate value that is used to calculate the jump offset of the CALL and DCALL instructions.
+An 8-bit immediate value that is used to calculate the jump offset of the CALL instruction.
 
 #### Result writeback
 
@@ -172,51 +175,65 @@ END PROCEDURE
 
 |opcodes|instruction|signed|A width|B width|C|C width|
 |-|-|-|-|-|-|-|
-|TBD|ADD_64|no|64|64|A + B|64|
-|TBD|ADD_32|no|32|32|A + B|32|
-|TBD|ADD_16|no|16|16|A + B|16|
-|TBD|SUB_64|no|64|64|A - B|64|
-|TBD|SUB_32|no|32|32|A - B|32|
-|TBD|SUB_16|no|16|16|A - B|16|
-|TBD|MUL_64|no|64|64|A * B|64|
-|TBD|MUL_32|no|32|32|A * B|64|
-|TBD|MUL_16|no|16|16|A * B|32|
-|TBD|IMUL_32|yes|32|32|A * B|64|
-|TBD|IMUL_16|yes|16|16|A * B|32|
-|TBD|DIV_64|no|64|32|A / B, A % B|64|
-|TBD|IDIV_64|yes|64|32|A / B, A % B|64|
-|TBD|DIV_32|no|32|16|A / B, A % B|32|
-|TBD|IDIV_32|yes|32|16|A / B, A % B|32|
-|TBD|AND_64|no|64|64|A & B|64|
-|TBD|AND_32|no|32|32|A & B|32|
-|TBD|AND_16|no|16|16|A & B|16|
-|TBD|OR_64|no|64|64|A &#124; B|64|
-|TBD|OR_32|no|32|32|A &#124; B|32|
-|TBD|OR_16|no|16|16|A &#124; B|16|
-|TBD|XOR_64|no|64|64|A ^ B|64|
-|TBD|XOR_32|no|32|32|A ^ B|32|
-|TBD|XOR_16|no|16|16|A ^ B|16|
-|TBD|SHL_64|no|64|6|A << B|64|
-|TBD|SHR_64|no|64|6|A >> B|64|
-|TBD|SAR_64|yes|64|6|A >> B|64|
-|TBD|ROL_64|no|64|6|A <<< B|64|
-|TBD|ROR_64|no|64|6|A >>> B|64|
+|0-13|ADD_64|no|64|64|A + B|64|
+|14-20|ADD_32|no|32|32|A + B|32|
+|21-34|SUB_64|no|64|64|A - B|64|
+|35-41|SUB_32|no|32|32|A - B|32|
+|42-45|MUL_64|no|64|64|A * B|64|
+|46-49|MULH_64|no|64|64|A * B|64|
+|50-53|MUL_32|no|32|32|A * B|64|
+|54-57|IMUL_32|yes|32|32|A * B|64|
+|58-61|IMULH_64|yes|64|64|A * B|64|
+|62|DIV_64|no|64|32|A / B|32|
+|63|IDIV_64|yes|64|32|A / B|32|
+|64-76|AND_64|no|64|64|A & B|64|
+|77-82|AND_32|no|32|32|A & B|32|
+|83-95|OR_64|no|64|64|A &#124; B|64|
+|96-101|OR_32|no|32|32|A &#124; B|32|
+|102-115|XOR_64|no|64|64|A ^ B|64|
+|116-121|XOR_32|no|32|32|A ^ B|32|
+|122-128|SHL_64|no|64|6|A << B|64|
+|129-132|SHR_64|no|64|6|A >> B|64|
+|133-135|SAR_64|yes|64|6|A >> B|64|
+|136-146|ROL_64|no|64|6|A <<< B|64|
+|147-157|ROR_64|no|64|6|A >>> B|64|
+
+##### 32-bit operations
+Instructions ADD_32, SUB_32, AND_32, OR_32, XOR_32 only use the low-order 32 bits of the input operands. The result of these operations are 32 bits long and bits 32-63 of C are zero.
+
+##### Multiplication
+There are 5 different multiplication operations. MUL_64 and MULH_64 both take 64-bit unsigned operands, but MUL_64 produces the low 64 bits of the result and MULH_64 produces the high 64 bits. MUL_32 and IMUL_32 use only the low-order 32 bits of the operands and produce a 64-bit result. The signed variant interprets the arguments as signed integers. IMULH_64 takes two 64-bit signed operands and produces the high-order 64 bits of the result.
+
+The following table shows an example of the output of the 5 multiplication instructions for inputs `A = 0xBC550E96BA88A72B` and `B = 0xF5391FA9F18D6273`:
+
+|instruction|A interpreted as|B interpreted as|result C|
+|-|-|-|-|
+|MUL_64|13570769092688258859|17670189427727360627|`0x28723424A9108E51`|
+|MULH_64|13570769092688258859|17670189427727360627|`0xB4676D31D2B34883`|
+|MUL_32|3129517867|4052574835|`0xB001AA5FA9108E51`|
+|IMUL_32|-1165449429|-242392461|`0x03EBA0C1A9108E51`|
+|IMULH_64|-4875974981021292757|-776554645982190989|`0x02D93EF1269D3EE5`|
 
 ##### Division
-For the division instructions, the divisor is half length of the dividend. The result `C` consists of both the quotient and the remainder (remainder is put the upper bits). The result of division by zero is equal to the dividend.
+For the division instructions, the dividend is 64 bits long and the divisor 32 bits long. The IDIV_64 instruction interprets both operands as signed integers. In case of division by zero or signed overflow, the result is equal to the dividend `A`.
+
+*Division by zero can be handled without branching by conditional move (`IF B == 0 THEN B = 1`). Signed overflow happens only for the signed variant when the minimum negative value is divided by -1. In this extremely rare case, ARM produces the "correct" result, but x86 throws a hardware exception, which must be handled.*
+
+##### Shift and rotate
+The shift/rotate instructions use just the bottom 6 bits of the `B` operand. All treat `A` as unsigned except SAR_64, which performs an arithmetic right shift by copying the sign bit.
 
 ### FPU instructions
 
 |opcodes|instruction|C|
 |-|-|-|
-|TBD|FADD|A + B|
-|TBD|FSUB|A - B|
-|TBD|FMUL|A * B|
-|TBD|FDIV|A / B|
-|TBD|FSQRT|sqrt(A)|
-|TBD|FROUND|A|
+|158-175|FADD|A + B|
+|176-193|FSUB|A - B|
+|194-211|FMUL|A * B|
+|212-214|FDIV|A / B|
+|215-221|FSQRT|sqrt(A)|
+|222-223|FROUND|A|
 
-FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is RN (Round to Nearest). Denormal values are treated as zero.
+FPU instructions conform to the IEEE-754 specification, so they must give correctly rounded results. Initial rounding mode is RN (Round to Nearest). Denormal values may not be produced by any operation.
 
 *Denormals can be disabled by setting the FTZ flag in x86 SSE and ARM Neon engines. This is done for performance reasons.*
 
@@ -237,24 +254,23 @@ The FROUND instruction changes the rounding mode for all subsequent FPU operatio
 |10|Round towards Minus Infinity (RM) mode
 |11|Round towards Zero (RZ) mode
 
-*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 22-33 of the ARM `FPSCR` register.*
+*The two-bit flag value exactly corresponds to bits 13-14 of the x86 `MXCSR` register and bits 22-23 of the ARM `FPSCR` register.*
 
 ### Control flow instructions
-The following 3 control flow instructions are supported:
+The following 2 control flow instructions are supported:
 
 |opcodes|instruction|function|
 |-|-|-|
-|TBD|CALL|near procedure call with a static offset|
-|TBD|DCALL|near procedure call with a dynamic offset|
-|TBD|RET|return from procedure|
+|224-240|CALL|near procedure call|
+|241-255|RET|return from procedure|
 
-All three instructions are conditional in 75% of cases. The jump is taken only if `B <= imm1`. For the 25% of cases when `B` is equal to `imm1`, the jump is unconditional. In case the branch is not taken, all three instructions become "arithmetic no-op" `C = A`.
+Both instructions are conditional in 75% of cases. The jump is taken only if `B <= imm1`. For the 25% of cases when `B` is equal to `imm1`, the jump is unconditional. In case the branch is not taken, both instructions become "arithmetic no-op" `C = A`.
 
-##### CALL and DCALL
-Taken CALL and DCALL instructions push the values `A` and `pc` (program counter) onto the stack and then perform a forward jump relative to the value of `pc`. The forward offset is equal to `8 * (imm0 + 1)` for the CALL instruction and `8 * ((imm0 ^ (A >> 56)) + 1)` for the DCALL instruction. Maximum jump distance is therefore 256 instructions forward (this means that at least 4 correctly spaced CALL/DCALL instructions are needed to form a loop in the program).
+##### CALL
+Taken CALL instruction pushes the values `A` and `pc` (program counter) onto the stack and then performs a forward jump relative to the value of `pc`. The forward offset is equal to `8 * (imm0 + 1)`. Maximum jump distance is therefore 256 instructions forward (this means that at least 4 correctly spaced CALL instructions are needed to form a loop in the program).
 
 ##### RET
-Taken RET instruction pops the return address `raddr` from the stack (it's the instruction following the previous CALL or DCALL), then pops a return value `retval` from the stack and sets `C = A ^ retval`. Finally, the instruction jumps back to `raddr`.
+The RET instruction behaves like "not taken" when the stack is empty. Taken RET instruction pops the return address `raddr` from the stack (it's the instruction following the previous CALL), then pops a return value `retval` from the stack and sets `C = A ^ retval`. Finally, the instruction jumps back to `raddr`.
 
 ## Program generation
 The program is initialized from a 256-bit seed value using a [PCG random number generator](http://www.pcg-random.org/). The program is generated in this order:
@@ -263,9 +279,20 @@ The program is initialized from a 256-bit seed value using a [PCG random number
 3. Initial values of all floating point registers `f0`-`f7` are generated as random 64-bit signed integers converted to a double precision floating point format.
 4. Initial values of all memory address registers `g0`-`g3` are generated as random 32-bit integers.
 5. The initial value of the `m0` register is generated as a random 32-bit value with the last 8 bits cleared (256-byte aligned).
-6. The 256 KiB cache is initialized (TBD).
-7. The remaining registers are initialized as `pc = 0`, `sp = 0`, `ic = 65536` (TBD), `m1 = m0 + 256`, `mx = 0`.
+6. A random 128-byte scratchpad seed is generated.
+7. The initial 256-bit seed is used to generate 10 AES round keys.
+6. The 256 KiB scratchpad is initialized by repeated 10-round AES encryption starting with the scratchpad seed.
+7. The remaining registers are initialized as `pc = 0`, `sp = 0`, `ic = 65536` (TBD), `mx = 0`.
 
 
 ## Result
-When the program terminates (the value of `ic` register reaches 0), the scratchpad, the register file and the stack are hashed using the Blake2b hash function to get the final PoW value. The generation/execution can be chained multiple times to discourage mining strategies that search for programs with particular properties.
+When the program terminates (the value of `ic` register reaches 0), the final result is calculated as follows:
+1. The register file is hashed using the Blake2b 256-bit hash function. The order of registers is: `r0`-`r7`, `f0`-`f7`, `g0`-`g3` (total of 144 bytes).
+2. The 256-bit hash is expanded into 10 AES round keys.
+3. The 256 KiB scratchpad is imploded into 128 bytes using 10-round AES decryption.
+4. The 128 byte scratchpad digest is hashed again using the Blake2b 256-bit hash function. This is the result of the PoW.
+
+*The stack is not included in the result calculation to enable platform-specific return addresses.*
+
+### Chaining
+The program generation, execution and result calculation can be chained multiple times to discourage mining strategies that search for programs with particular properties.
diff --git a/tests/rx2c.py b/tests/rx2c.py
new file mode 100644
index 0000000..7d168a2
--- /dev/null
+++ b/tests/rx2c.py
@@ -0,0 +1,787 @@
+import random
+import sys
+import os
+
+PROGRAM_SIZE = 1024
+INSTRUCTION_COUNT = 65536
+        
+def genBytes(count):
+    return ', '.join(str(random.getrandbits(8)) for i in range(count))
+        
+class OperandType:
+    INT32 = 0
+    UINT32 = 1
+    INT64 = 2
+    UINT64 = 3
+    FLOAT = 4
+    SHIFT = 5
+
+def declareType(type):
+    converters = {
+        0: "int32_t",
+        1: "uint32_t",
+        2: "int64_t",
+        3: "uint64_t",
+        4: "double",
+        5: "int32_t"
+    }
+    return converters.get(type)
+    
+def toSigned32(x):
+    return x - ((x & 0x80000000) << 1)
+    
+def toSigned64(x):
+    return x - ((x & 0x8000000000000000) << 1)
+
+def immediateTo(val, type):
+    converters = {
+        0: toSigned32(val),
+        1: val,
+        2: toSigned32(val),
+        3: val,
+        4: float(toSigned32(val) << 32),
+        5: val & 63
+    }
+    return repr(converters.get(type))
+    
+def registerTo(expr, type):
+    converters = {
+        0: "(int64_t){0}",
+        1: "{0}",
+        2: "(int64_t){0}",
+        3: "{0}",
+        4: "{0}",
+        5: "({0} & 63)"
+    }
+    return converters.get(type).format(expr)
+    
+def registerFrom(num, type):
+    converters = {
+        0: "r{0}",
+        1: "r{0}",
+        2: "r{0}",
+        3: "r{0}",
+        4: "((convertible_t)f{0}).u64",
+        5: "r{0}"
+    }
+    return converters.get(type).format(num)
+    
+def convertibleTo(expr, type):
+    converters = {
+        0: "{0}.i32",
+        1: "{0}.u32",
+        2: "{0}.i64",
+        3: "{0}.u64",
+        4: "(double){0}.i64",
+        5: "({0}.u64 & 63)"
+    }
+    return converters.get(type).format(expr)
+    
+def convertibleFrom(expr, type):
+    converters = {
+        0: "{0}.i32",
+        1: "{0}.u32",
+        2: "{0}.i64",
+        3: "{0}.u64",
+        4: "{0}.f64",
+        5: "({0}.u64 & 63)"
+    }
+    return converters.get(type).format(expr)
+
+def getRegister(num, type):
+    registers = {
+        0: "r{0}",
+        1: "r{0}",
+        2: "r{0}",
+        3: "r{0}",
+        4: "f{0}",
+        5: "r{0}"
+    }
+    return registers.get(type).format(num)
+
+def writeInitialValues(file):
+    file.write("\tclock_t clockStart = clock(), clockEnd;\n")
+    for i in range(8):
+        file.write("\tr{0} = {1}ULL;\n".format(i, random.getrandbits(64)))
+    for i in range(8):
+        file.write("\tf{0} = {1};\n".format(i, toSigned64(random.getrandbits(64))))
+    file.write("\tG = _mm_set_epi64x({0}ULL, {1}ULL);\n".format(random.getrandbits(64), random.getrandbits(64)))
+    file.write("\tmmu.m0 = {1};\n".format(i, random.getrandbits(32) & 0xFFFFFF00))
+    file.write("\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)scratchpad, SCRATCHPAD_SIZE);\n")
+    file.write("\tmmu.mx = 0;\n")
+    file.write("\tmmu.sp = 0;\n")
+    file.write("\tic = 65536;\n")
+    file.write("\tmxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK; //flush denormals to zero, round to nearest\n")
+    file.write("\t_mm_setcsr(mxcsr);\n")
+    
+def writeEpilog(file):
+    file.write("\tend:\n")
+    file.write("\t\tclockEnd = clock();\n")
+    for i in range(8):
+        file.write('\t\tprintf("r{0} = %-36llu f{0} = %g\\n", r{0}, f{0});\n'.format(i))
+    file.write(("\t\tuint64_t spadsum = 0;\n"
+		"\t\tfor(int i = 0; i < SCRATCHPAD_LENGTH; ++i) {\n"
+		"\t\t	spadsum += scratchpad[i].u64;\n"
+		"\t\t}\n"
+		'\t\tprintf("scratchpad sum = %llu\\n", spadsum);\n'
+        '\t\tprintf("runtime: %f\\n", (clockEnd - clockStart) / (double)CLOCKS_PER_SEC);\n'))
+    file.write("\t\treturn 0;")
+    file.write("}")
+
+def writeCommon(file, i, symbol, type, name):
+    file.write("\ti_{0}: {{ //{1}\n".format(i, name))
+    file.write("\t\tif(0 == ic--) goto end;\n")
+    file.write("\t\tr{0} ^= (uint32_t)_mm_cvtsi128_si32(G);\n".format(symbol.ra))
+    file.write("\t\taddr_t addr = r{0};\n".format(symbol.ra))
+    file.write("\t\tr{0} = __rolq(r{0}, 32);\n".format(symbol.ra))
+    file.write("\t\tG = _mm_shuffle_epi32(G, _MM_SHUFFLE(1, 2, 3, 0));\n")
+    if symbol.gen == 0:
+        file.write("\t\t__m128i K = _mm_set_epi64x({0}, r{1});\n".format(registerFrom(symbol.xb, type), symbol.ra))
+        file.write("\t\tG = _mm_aesenc_si128(G, K);\n")
+
+def readA(symbol, type):
+    location = {
+        0: "readDram(&mmu, addr)",
+        1: "readDram(&mmu, addr)",
+        2: "readDram(&mmu, addr)",
+        3: "readDram(&mmu, addr)",
+        4: "SCRATCHPAD_256K(addr)",
+        5: "SCRATCHPAD_16K(addr)",
+        6: "SCRATCHPAD_16K(addr)",
+        7: "SCRATCHPAD_16K(addr)",
+    }
+    return convertibleTo(location.get(symbol.loca), type)
+
+def writeC(symbol, type):
+    location = {
+        0: "SCRATCHPAD_256K(addr)",
+        1: "SCRATCHPAD_16K(addr)",
+        2: "",
+        3: "",
+        4: "SCRATCHPAD_16K(addr)",
+        5: "SCRATCHPAD_16K(addr)",
+        6: "",
+        7: ""
+    }
+    c = location.get(symbol.loca)
+    if c == "":
+        c = getRegister(symbol.xb, type)
+    else:
+        c = convertibleFrom(c, type)
+    return c
+
+def readB(symbol, type):
+    if symbol.locb < 6:
+        return registerTo(getRegister(symbol.xb, type), type)
+    else:
+        return immediateTo(symbol.imm1, type)
+
+class CodeSymbol:
+    def __init__(self, qi):
+        self.opcode = qi & 255
+        self.loca = (qi >> 8) & 7
+        self.ra = (qi >> 11) & 7
+        self.gen = (qi >> 14) & 3
+        self.locb = (qi >> 16) & 7
+        self.xb = (qi >> 19) & 7
+        self.imm0 = (qi >> 24) & 255
+        self.imm1 = qi >> 32
+
+def writeOperation(file, i, symbol, type, name, op):
+    writeCommon(file, i, symbol, type, name)
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = A {1} B; }}\n".format(writeC(symbol, type), op))
+
+def write_ADD_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'ADD_64', '+');
+
+def write_ADD_32(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT32, 'ADD_32', '+');
+
+def write_SUB_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'SUB_64', '-');
+
+def write_SUB_32(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT32, 'SUB_32', '-');
+
+def write_MUL_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'MUL_64', '*');
+
+def write_MULH_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'MULH_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = ((uint128_t)A * B) >> 64; }}\n".format(writeC(symbol, type)))
+
+def write_MUL_32(file, i, symbol):
+    type = OperandType.UINT32
+    writeCommon(file, i, symbol, type, 'MUL_32')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = (uint64_t)A * B; }}\n".format(writeC(symbol, OperandType.UINT64)))
+
+def write_IMUL_32(file, i, symbol):
+    type = OperandType.INT32
+    writeCommon(file, i, symbol, type, 'IMUL_32')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = (int64_t)A * B; }}\n".format(writeC(symbol, OperandType.INT64)))
+
+def write_IMULH_64(file, i, symbol):
+    type = OperandType.INT64
+    writeCommon(file, i, symbol, type, 'IMULH_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = ((int128_t)A * B) >> 64; }}\n".format(writeC(symbol, type)))
+
+def write_DIV_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'DIV_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.UINT32), readB(symbol, OperandType.UINT32)))
+    file.write("\t\tif(B == 0) B = 1;\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))
+
+def write_IDIV_64(file, i, symbol):
+    type = OperandType.INT64
+    writeCommon(file, i, symbol, type, 'IDIV_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.INT32), readB(symbol, OperandType.INT32)))
+    file.write("\t\tif(B == 0) B = 1;\n".format(declareType(type), readB(symbol, type)))
+    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))
+
+def write_AND_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'AND_64', '&');
+
+def write_AND_32(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT32, 'AND_32', '&');
+
+def write_OR_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'OR_64', '|');
+
+def write_OR_32(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT32, 'OR_32', '|');
+
+def write_XOR_64(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT64, 'XOR_64', '^');
+
+def write_XOR_32(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.UINT32, 'XOR_32', '^');
+
+def write_SHL_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'SHL_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
+    file.write("\t\t{0} = A << B; }}\n".format(writeC(symbol, type)))
+
+def write_SHR_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'SHR_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
+    file.write("\t\t{0} = A >> B; }}\n".format(writeC(symbol, type)))
+
+def write_SAR_64(file, i, symbol):
+    type = OperandType.INT64
+    writeCommon(file, i, symbol, type, 'SAR_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
+    file.write("\t\t{0} = A >> B; }}\n".format(writeC(symbol, type)))
+
+def write_ROL_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'ROL_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
+    file.write("\t\t{0} = __rolq(A, B); }}\n".format(writeC(symbol, type)))
+
+def write_ROR_64(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'ROR_64')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.SHIFT), readB(symbol, OperandType.SHIFT)))
+    file.write("\t\t{0} = __rorq(A, B); }}\n".format(writeC(symbol, type)))
+
+def write_FADD(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.FLOAT, 'FADD', '+');
+
+def write_FSUB(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.FLOAT, 'FSUB', '-');
+
+def write_FMUL(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.FLOAT, 'FMUL', '*');
+
+def write_FDIV(file, i, symbol):
+    writeOperation(file, i, symbol, OperandType.FLOAT, 'FDIV', '/');
+
+def write_FSQRT(file, i, symbol):
+    type = OperandType.FLOAT
+    writeCommon(file, i, symbol, type, 'FSQRT')
+    file.write("\t\t{0} A = fabs({1});\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\t{0} = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_pd(&A))); }}\n".format(writeC(symbol, type)))
+
+def write_FROUND(file, i, symbol):
+    type = OperandType.FLOAT
+    writeCommon(file, i, symbol, type, 'FROUND')
+    file.write("\t\t{0} A = {1};\n".format(declareType(OperandType.UINT64), readA(symbol, OperandType.UINT64)))
+    file.write("\t\t{0} = A;\n".format(writeC(symbol, type)))
+    file.write("\t\t_mm_setcsr(mxcsr | ((uint32_t)(A << 13) & _MM_ROUND_MASK)); }\n")
+
+def write_CALL(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'CALL')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    if symbol.locb < 6:
+        file.write("\t\tif((uint32_t){0} <= {1}) {{\n".format(getRegister(symbol.xb, type), immediateTo(symbol.imm1, type)))
+    file.write("\t\t\tPUSH_VALUE(A);\n");
+    file.write("\t\t\tPUSH_ADDRESS(&&i_{0});\n".format((i + 1) & (PROGRAM_SIZE - 1)));
+    file.write("\t\t\tgoto i_{0};\n".format((i + 1 + symbol.imm0) & (PROGRAM_SIZE - 1)));
+    if symbol.locb < 6:
+        file.write("\t\t}}\n\t\t{0} = A;".format(writeC(symbol, type)))
+    file.write(" }\n")
+
+def write_RET(file, i, symbol):
+    type = OperandType.UINT64
+    writeCommon(file, i, symbol, type, 'RET')
+    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
+    file.write("\t\tif(!STACK_IS_EMPTY()")
+    if symbol.locb < 6:
+        file.write(" && (uint32_t){0} <= {1}".format(getRegister(symbol.xb, type), immediateTo(symbol.imm1, type)))
+    file.write(") {\n")
+    file.write("\t\t\tvoid* target = POP_ADDRESS();\n")
+    file.write("\t\t\tuint64_t C = POP_VALUE();\n")
+    file.write("\t\t\t{0} = A ^ C;\n".format(writeC(symbol, type)))
+    file.write("\t\t\tgoto *target;\n")
+    file.write("\t\t}}\n\t\t{0} = A; }}\n".format(writeC(symbol, type)))
+
+opcodeMap = {
+    0: write_ADD_64,
+    1: write_ADD_64,
+    2: write_ADD_64,
+    3: write_ADD_64,
+    4: write_ADD_64,
+    5: write_ADD_64,
+    6: write_ADD_64,
+    7: write_ADD_64,
+    8: write_ADD_64,
+    9: write_ADD_64,
+    10: write_ADD_64,
+    11: write_ADD_64,
+    12: write_ADD_64,
+    13: write_ADD_64,
+    14: write_ADD_32,
+    15: write_ADD_32,
+    16: write_ADD_32,
+    17: write_ADD_32,
+    18: write_ADD_32,
+    19: write_ADD_32,
+    20: write_ADD_32,
+    21: write_SUB_64,
+    22: write_SUB_64,
+    23: write_SUB_64,
+    24: write_SUB_64,
+    25: write_SUB_64,
+    26: write_SUB_64,
+    27: write_SUB_64,
+    28: write_SUB_64,
+    29: write_SUB_64,
+    30: write_SUB_64,
+    31: write_SUB_64,
+    32: write_SUB_64,
+    33: write_SUB_64,
+    34: write_SUB_64,
+    35: write_SUB_32,
+    36: write_SUB_32,
+    37: write_SUB_32,
+    38: write_SUB_32,
+    39: write_SUB_32,
+    40: write_SUB_32,
+    41: write_SUB_32,
+    42: write_MUL_64,
+    43: write_MUL_64,
+    44: write_MUL_64,
+    45: write_MUL_64,
+    46: write_MULH_64,
+    47: write_MULH_64,
+    48: write_MULH_64,
+    49: write_MULH_64,
+    50: write_MUL_32,
+    51: write_MUL_32,
+    52: write_MUL_32,
+    53: write_MUL_32,
+    54: write_IMUL_32,
+    55: write_IMUL_32,
+    56: write_IMUL_32,
+    57: write_IMUL_32,
+    58: write_IMULH_64,
+    59: write_IMULH_64,
+    60: write_IMULH_64,
+    61: write_IMULH_64,
+    62: write_DIV_64,
+    63: write_IDIV_64,
+    64: write_AND_64,
+    65: write_AND_64,
+    66: write_AND_64,
+    67: write_AND_64,
+    68: write_AND_64,
+    69: write_AND_64,
+    70: write_AND_64,
+    71: write_AND_64,
+    72: write_AND_64,
+    73: write_AND_64,
+    74: write_AND_64,
+    75: write_AND_64,
+    76: write_AND_64,
+    77: write_AND_32,
+    78: write_AND_32,
+    79: write_AND_32,
+    80: write_AND_32,
+    81: write_AND_32,
+    82: write_AND_32,
+    83: write_OR_64,
+    84: write_OR_64,
+    85: write_OR_64,
+    86: write_OR_64,
+    87: write_OR_64,
+    88: write_OR_64,
+    89: write_OR_64,
+    90: write_OR_64,
+    91: write_OR_64,
+    92: write_OR_64,
+    93: write_OR_64,
+    94: write_OR_64,
+    95: write_OR_64,
+    96: write_OR_32,
+    97: write_OR_32,
+    98: write_OR_32,
+    99: write_OR_32,
+    100: write_OR_32,
+    101: write_OR_32,
+    102: write_XOR_64,
+    103: write_XOR_64,
+    104: write_XOR_64,
+    105: write_XOR_64,
+    106: write_XOR_64,
+    107: write_XOR_64,
+    108: write_XOR_64,
+    109: write_XOR_64,
+    110: write_XOR_64,
+    111: write_XOR_64,
+    112: write_XOR_64,
+    113: write_XOR_64,
+    114: write_XOR_64,
+    115: write_XOR_64,
+    116: write_XOR_32,
+    117: write_XOR_32,
+    118: write_XOR_32,
+    119: write_XOR_32,
+    120: write_XOR_32,
+    121: write_XOR_32,
+    122: write_SHL_64,
+    123: write_SHL_64,
+    124: write_SHL_64,
+    125: write_SHL_64,
+    126: write_SHL_64,
+    127: write_SHL_64,
+    128: write_SHL_64,
+    129: write_SHR_64,
+    130: write_SHR_64,
+    131: write_SHR_64,
+    132: write_SHR_64,
+    133: write_SAR_64,
+    134: write_SAR_64,
+    135: write_SAR_64,
+    136: write_ROL_64,
+    137: write_ROL_64,
+    138: write_ROL_64,
+    139: write_ROL_64,
+    140: write_ROL_64,
+    141: write_ROL_64,
+    142: write_ROL_64,
+    143: write_ROL_64,
+    144: write_ROL_64,
+    145: write_ROL_64,
+    146: write_ROL_64,
+    147: write_ROR_64,
+    148: write_ROR_64,
+    149: write_ROR_64,
+    150: write_ROR_64,
+    151: write_ROR_64,
+    152: write_ROR_64,
+    153: write_ROR_64,
+    154: write_ROR_64,
+    155: write_ROR_64,
+    156: write_ROR_64,
+    157: write_ROR_64,
+    158: write_FADD,
+    159: write_FADD,
+    160: write_FADD,
+    161: write_FADD,
+    162: write_FADD,
+    163: write_FADD,
+    164: write_FADD,
+    165: write_FADD,
+    166: write_FADD,
+    167: write_FADD,
+    168: write_FADD,
+    169: write_FADD,
+    170: write_FADD,
+    171: write_FADD,
+    172: write_FADD,
+    173: write_FADD,
+    174: write_FADD,
+    175: write_FADD,
+    176: write_FSUB,
+    177: write_FSUB,
+    178: write_FSUB,
+    179: write_FSUB,
+    180: write_FSUB,
+    181: write_FSUB,
+    182: write_FSUB,
+    183: write_FSUB,
+    184: write_FSUB,
+    185: write_FSUB,
+    186: write_FSUB,
+    187: write_FSUB,
+    188: write_FSUB,
+    189: write_FSUB,
+    190: write_FSUB,
+    191: write_FSUB,
+    192: write_FSUB,
+    193: write_FSUB,
+    194: write_FMUL,
+    195: write_FMUL,
+    196: write_FMUL,
+    197: write_FMUL,
+    198: write_FMUL,
+    199: write_FMUL,
+    200: write_FMUL,
+    201: write_FMUL,
+    202: write_FMUL,
+    203: write_FMUL,
+    204: write_FMUL,
+    205: write_FMUL,
+    206: write_FMUL,
+    207: write_FMUL,
+    208: write_FMUL,
+    209: write_FMUL,
+    210: write_FMUL,
+    211: write_FMUL,
+    212: write_FDIV,
+    213: write_FDIV,
+    214: write_FDIV,
+    215: write_FSQRT,
+    216: write_FSQRT,
+    217: write_FSQRT,
+    218: write_FSQRT,
+    219: write_FSQRT,
+    220: write_FSQRT,
+    221: write_FSQRT,
+    222: write_FROUND,
+    223: write_FROUND,
+    224: write_CALL,
+    225: write_CALL,
+    226: write_CALL,
+    227: write_CALL,
+    228: write_CALL,
+    229: write_CALL,
+    230: write_CALL,
+    231: write_CALL,
+    232: write_CALL,
+    233: write_CALL,
+    234: write_CALL,
+    235: write_CALL,
+    236: write_CALL,
+    237: write_CALL,
+    238: write_CALL,
+    239: write_CALL,
+    240: write_CALL,
+    241: write_RET,
+    242: write_RET,
+    243: write_RET,
+    244: write_RET,
+    245: write_RET,
+    246: write_RET,
+    247: write_RET,
+    248: write_RET,
+    249: write_RET,
+    250: write_RET,
+    251: write_RET,
+    252: write_RET,
+    253: write_RET,
+    254: write_RET,
+    255: write_RET,
+}
+
+def writeCode(file, i, symbol):
+    opcodeMap.get(symbol.opcode)(file, i, symbol)
+
+def writeMain(file):
+    file.write(("int main() {\n"
+                "	register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n"
+                "	register double f0, f1, f2, f3, f4, f5, f6, f7;\n"
+                "	register __m128i G; //g0-g3\n"
+                "	register uint64_t ic;\n"
+                "	convertible_t scratchpad[SCRATCHPAD_LENGTH];\n"
+                "	stack_t stack[STACK_LENGTH];\n"
+                "	mmu_t mmu;\n"
+                "	uint32_t mxcsr;\n"
+                ))
+
+def writeProlog(file):
+    file.write(("#include <stdint.h>\n"
+                "#include <time.h>\n"
+                "#include <stdio.h>\n"
+                "#include <x86intrin.h>\n"
+                "#include <emmintrin.h>\n"
+                "#include <wmmintrin.h>\n"
+                "#include <math.h>\n"
+                "typedef uint32_t addr_t;\n"
+                "typedef unsigned __int128 uint128_t;\n"
+                "typedef __int128 int128_t;\n"
+                "typedef union {\n"
+                "	double f64;\n"
+                "	int64_t i64;\n"
+                "	uint64_t u64;\n"
+                "	int32_t i32;\n"
+                "	uint32_t u32;\n"
+                "} convertible_t;\n"
+                "typedef union {\n"
+                "	uint64_t value;\n"
+                "	void* address;\n"
+                "} stack_t;\n"
+                "typedef struct {\n"
+                "	addr_t m0;\n"
+                "	addr_t m1;\n"
+                "	addr_t mx;\n"
+                "	uint32_t sp;\n"
+                "} mmu_t;\n"
+                "#define SCRATCHPAD_SIZE (256 * 1024)\n"
+                "#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n"
+                "#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n"
+                "#define SCRATCHPAD_MASK18 (SCRATCHPAD_LENGTH - 1)\n"
+                "#define SCRATCHPAD_16K(x) scratchpad[(x >> 3) & SCRATCHPAD_MASK14]\n"
+                "#define SCRATCHPAD_256K(x) scratchpad[(x >> 3) & SCRATCHPAD_MASK18]\n"
+                "#define STACK_LENGTH (32 * 1024)\n"
+                "#define DRAM(x) __rolq(6364136223846793005*(x)+1442695040888963407,32)\n"
+                "//#define PREFETCH(x) _mm_prefetch(x, _MM_HINT_T0)\n"
+                "#define PREFETCH(x)\n"
+                "#define PUSH_VALUE(x) stack[mmu.sp++].value = x\n"
+                "#define PUSH_ADDRESS(x) stack[mmu.sp++].address = x\n"
+                "#define STACK_IS_EMPTY() (mmu.sp == 0)\n"
+                "#define POP_VALUE() stack[--mmu.sp].value\n"
+                "#define POP_ADDRESS() stack[--mmu.sp].address\n"
+                "static convertible_t readDram(mmu_t* mmu, addr_t addr) {\n"
+                "	convertible_t data;\n"
+                "	data.u64 = DRAM(mmu->m0); //TODO\n"
+                "	mmu->m0 += 8;\n"
+                "	mmu->mx ^= addr;\n"
+                "	if((mmu->m0 & 255) == 192) {\n"
+                "		mmu->m1 = mmu->mx & 0xFFFFFF00;\n"
+                "		PREFETCH(mmu->m1); //TODO\n"
+                "	}\n"
+                "	if((mmu->m0 & 255) == 0)\n"
+                "		mmu->m0 = mmu->m1;\n"
+                "	return data;\n"
+                "}\n"
+                "static inline __m128i sl_xor(__m128i tmp1) {\n"
+                "	__m128i tmp4;\n"
+                "	tmp4 = _mm_slli_si128(tmp1, 0x04);\n"
+                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
+                "	tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
+                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
+                "	tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
+                "	tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
+                "	return tmp1;\n"
+                "}\n"
+                "#define AES_GENKEY_SUB(rcon) do { \\\n"
+                "	__m128i xout1 = _mm_aeskeygenassist_si128(xout2, rcon);	\\\n"
+                "	xout1 = _mm_shuffle_epi32(xout1, 0xFF);	\\\n"
+                "	xout0 = sl_xor(xout0);  \\\n"
+                "	xout0 = _mm_xor_si128(xout0, xout1); \\\n"
+                "	xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); \\\n"
+                "	xout1 = _mm_shuffle_epi32(xout1, 0xAA); \\\n"
+                "	xout2 = sl_xor(xout2); \\\n"
+                "	xout2 = _mm_xor_si128(xout2, xout1); } while(0)\n"
+                "static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) {\n"
+                "	__m128i xout0, xout2;\n"
+                "	xout0 = _mm_load_si128(memory);\n"
+                "	xout2 = _mm_load_si128(memory+1);\n"
+                "	*k0 = xout0;\n"
+                "	*k1 = xout2;\n"
+                "	AES_GENKEY_SUB(0x01);\n"
+                "	*k2 = xout0;\n"
+                "	*k3 = xout2;\n"
+                "	AES_GENKEY_SUB(0x02);\n"
+                "	*k4 = xout0;\n"
+                "	*k5 = xout2;\n"
+                "	AES_GENKEY_SUB(0x04);\n"
+                "	*k6 = xout0;\n"
+                "	*k7 = xout2;\n"
+                "	AES_GENKEY_SUB(0x08);\n"
+                "	*k8 = xout0;\n"
+                "	*k9 = xout2;\n"
+                "}\n"
+                "static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) {\n"
+                "	*x0 = _mm_aesenc_si128(*x0, key);\n"
+                "	*x1 = _mm_aesenc_si128(*x1, key);\n"
+                "	*x2 = _mm_aesenc_si128(*x2, key);\n"
+                "	*x3 = _mm_aesenc_si128(*x3, key);\n"
+                "	*x4 = _mm_aesenc_si128(*x4, key);\n"
+                "	*x5 = _mm_aesenc_si128(*x5, key);\n"
+                "	*x6 = _mm_aesenc_si128(*x6, key);\n"
+                "	*x7 = _mm_aesenc_si128(*x7, key);\n"
+                "}\n"
+                "static void aesInitialize(__m128i* key, __m128i* seed, __m128i* output, size_t count) {\n"
+                "	\n"
+                "	__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;\n"
+                "	__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;\n"
+                "	\n"
+                "	aes_genkey(key, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);\n"
+                "	\n"
+                "	xin0 = _mm_load_si128(seed + 0);\n"
+                "	xin1 = _mm_load_si128(seed + 1);\n"
+                "	xin2 = _mm_load_si128(seed + 2);\n"
+                "	xin3 = _mm_load_si128(seed + 3);\n"
+                "	xin4 = _mm_load_si128(seed + 4);\n"
+                "	xin5 = _mm_load_si128(seed + 5);\n"
+                "	xin6 = _mm_load_si128(seed + 6);\n"
+                "	xin7 = _mm_load_si128(seed + 7);\n"
+                "	\n"
+                "	for (size_t i = 0; i < count / sizeof(__m128i); i += 8)\n"
+                "	{\n"
+                "		aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
+                "		\n"
+                "		_mm_store_si128(output + i + 0, xin0);\n"
+                "		_mm_store_si128(output + i + 1, xin1);\n"
+                "		_mm_store_si128(output + i + 2, xin2);\n"
+                "		_mm_store_si128(output + i + 3, xin3);\n"
+                "		_mm_store_si128(output + i + 4, xin4);\n"
+                "		_mm_store_si128(output + i + 5, xin5);\n"
+                "		_mm_store_si128(output + i + 6, xin6);\n"
+                "		_mm_store_si128(output + i + 7, xin7);\n"
+                "	}\n"
+                "}\n"))
+
+with sys.stdout as file:
+    writeProlog(file)
+    file.write("const unsigned char aesKey[32] = {{ {0} }};\n".format(genBytes(32)))
+    file.write("const unsigned char aesSeed[128] = {{ {0} }};\n".format(genBytes(128)))
+    writeMain(file)
+    writeInitialValues(file)
+    for i in range(PROGRAM_SIZE):
+        writeCode(file, i, CodeSymbol(random.getrandbits(64)))
+    file.write("\t\tgoto i_0;\n")
+    writeEpilog(file)
\ No newline at end of file