Removed some legacy code

This commit is contained in:
tevador 2019-05-03 15:33:51 +02:00
parent 9e5eac8645
commit 1037cc0139
16 changed files with 0 additions and 27702 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,16 +0,0 @@
# Builds the four branch-behaviour test programs.
# -O0 is kept from the original rules.
.PHONY: all clean
all: branch_always branch_predictably branch_randomly branch_mixed
branch_always: branch_always.c
	gcc -O0 branch_always.c -o branch_always
branch_predictably: branch_predictably.c
	gcc -O0 branch_predictably.c -o branch_predictably
branch_randomly: branch_randomly.c
	gcc -O0 branch_randomly.c -o branch_randomly
branch_mixed: branch_mixed.c
	gcc -O0 branch_mixed.c -o branch_mixed
# -f: do not fail when some of the binaries were never built.
clean:
	rm -f branch_always branch_predictably branch_randomly branch_mixed

View file

@ -1,333 +0,0 @@
/*
* Prof
* ====
*
* Self-contained C/C++ profiler library for Linux.
*
* Prof offers a quick way to measure performance events (CPU clock cycles,
* cache misses, branch mispredictions, etc.) of C/C++ code snippets. Prof is
* just a wrapper around the `perf_event_open` system call, its main goal is to
* be easy to setup and painless to use for targeted optimizations, namely, when
* the hot spot has already been identified. In no way Prof is a replacement for
* a fully-fledged profiler like perf, gprof, callgrind, etc.
*
* Please be aware that Prof uses `__attribute__((constructor))` to be as
* straightforward to set up as possible, so it cannot be included more than
* once.
*
* Examples
* --------
*
* ### Minimal
*
* The following snippet prints the rough number of CPU clock cycles spent in
* executing the code between the two Prof calls:
*
* ```c
* #include "prof.h"
*
* int main()
* {
* PROF_START();
* // slow code goes here...
* PROF_STDOUT();
* }
* ```
*
* ### Custom options
*
* The following snippet instead counts both read and write faults of the level
* 1 data cache that occur in the userland code between the two Prof calls:
*
* ```c
* #include <stdio.h>
*
* #define PROF_USER_EVENTS_ONLY
* #define PROF_EVENT_LIST \
* PROF_EVENT_CACHE(L1D, READ, MISS) \
* PROF_EVENT_CACHE(L1D, WRITE, MISS)
* #include "prof.h"
*
* int main()
* {
* uint64_t faults[2] = { 0 };
*
* PROF_START();
* // slow code goes here...
* PROF_DO(faults[index] += counter);
*
* // fast or uninteresting code goes here...
*
* PROF_START();
* // slow code goes here...
* PROF_DO(faults[index] += counter);
*
* printf("Total L1 faults: R = %lu; W = %lu\n", faults[0], faults[1]);
* }
* ```
*
* Installation
* ------------
*
* Just include `prof.h`. Here is a quick way to fetch the latest version:
*
* wget -q https://raw.githubusercontent.com/cyrus-and/prof/master/prof.h
*/
#ifndef PROF_H
#define PROF_H
#include <errno.h>
#include <linux/perf_event.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
/*
* API
* ---
*/
/*
* Reset the counters and (re)start counting the events.
*
* The events to be monitored are specified by setting the `PROF_EVENT_LIST`
* macro before including this file to a list of `PROF_EVENT_*` invocations;
* defaults to counting the number of CPU clock cycles.
*
* If the `PROF_USER_EVENTS_ONLY` macro is defined before including this file
* then kernel and hypervisor events are excluded from the count.
*
* Expands to two checked, group-wide ioctl requests on `prof_fd_`: ENABLE
* first, then RESET. A failing request aborts the program via `PROF_ASSERT_`.
*/
#define PROF_START() \
do { \
PROF_IOCTL_(ENABLE); \
PROF_IOCTL_(RESET); \
} while (0)
/*
* Specify an event to be monitored; `type` and `config` are defined in the
* documentation of the `perf_event_open` system call.
*
* Expands to two cast expressions, each followed by a comma, so that several
* invocations can be written back to back and used as a run of function
* arguments (this is how `PROF_EVENT_LIST` is consumed by `prof_init`).
*/
#define PROF_EVENT(type, config) \
(uint32_t)(type), (uint64_t)(config),
/*
* Same as `PROF_EVENT` but for hardware events; prefix `PERF_COUNT_HW_` must be
* omitted from `config` (it is pasted on by the macro).
*/
#define PROF_EVENT_HW(config) \
PROF_EVENT(PERF_TYPE_HARDWARE, PERF_COUNT_HW_ ## config)
/*
* Same as `PROF_EVENT` but for software events; prefix `PERF_COUNT_SW_` must be
* omitted from `config` (it is pasted on by the macro).
*/
#define PROF_EVENT_SW(config) \
PROF_EVENT(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ ## config)
/*
* Same as `PROF_EVENT` but for cache events; prefixes `PERF_COUNT_HW_CACHE_`,
* `PERF_COUNT_HW_CACHE_OP_` and `PERF_COUNT_HW_CACHE_RESULT_` must be omitted
* from `cache`, `op` and `result`, respectively. Again `cache`, `op` and
* `result` are defined in the documentation of the `perf_event_open` system
* call.
*
* The three values are combined into one `config` word: cache id in the low
* bits, operation shifted left by 8 and result shifted left by 16.
*/
#define PROF_EVENT_CACHE(cache, op, result) \
PROF_EVENT(PERF_TYPE_HW_CACHE, \
(PERF_COUNT_HW_CACHE_ ## cache) | \
(PERF_COUNT_HW_CACHE_OP_ ## op << 8) | \
(PERF_COUNT_HW_CACHE_RESULT_ ## result << 16))
/*
* Stop counting the events. The counter array can then be accessed with
* `PROF_COUNTERS`.
*
* Issues a checked, group-wide DISABLE ioctl and then reads
* `prof_event_cnt_ + 1` 64-bit words from `prof_fd_` into `prof_event_buf_`.
*/
#define PROF_STOP() \
do { \
PROF_IOCTL_(DISABLE); \
PROF_READ_COUNTERS_(prof_event_buf_); \
} while (0)
/*
* Access the counter array. The order of the counters is the same as that of
* the events defined in `PROF_EVENT_LIST`. Elements of this array are 64 bit
* unsigned integers.
*
* The first word of the read buffer is skipped; with `PERF_FORMAT_GROUP` it is
* presumably the number of events in the group (see perf_event_open(2)).
*/
#define PROF_COUNTERS \
(prof_event_buf_ + 1)
/*
* Stop counting the events and execute the code provided by `block` for each
* event. Within `block`: `index` refers to the event position index in the
* counter array defined by `PROF_COUNTERS`; `counter` is the actual value of
* the counter. Both `index` and `counter` are 64 bit unsigned integers.
*
* `block` is pasted verbatim into the loop body, so it may also refer to the
* caller's own variables. The two `(void)` casts only silence unused-variable
* warnings when `block` ignores `index` or `counter`.
*/
#define PROF_DO(block) \
do { \
uint64_t i_; \
PROF_STOP(); \
for (i_ = 0; i_ < prof_event_cnt_; i_++) { \
uint64_t index = i_; \
uint64_t counter = prof_event_buf_[i_ + 1]; \
(void)index; \
(void)counter; \
block; \
} \
} while (0)
/*
* Same as `PROF_DO` except that `callback` is the name of a *callable* object
* (e.g. a function) which, for each event, is called with the two parameters
* `index` and `counter`.
*/
#define PROF_CALL(callback) \
PROF_DO(callback(index, counter))
/*
 * Stop counting the events and write to `file` (a stdio.h `FILE *`) as many
 * lines as there are events in `PROF_EVENT_LIST`. Each line contains `index`
 * and `counter` (as defined by `PROF_DO`) separated by a tabulation character.
 * If there is only one event then `index` is omitted.
 *
 * The values are printed through `unsigned long long` / `%llu` so that the
 * format matches the argument on every data model; `%lu` is only correct for
 * `uint64_t` where `long` is 64 bits wide.
 */
#define PROF_FILE(file)                                                  \
    PROF_DO(if (prof_event_cnt_ > 1) {                                   \
                fprintf((file), "%llu\t%llu\n",                          \
                        (unsigned long long)index,                       \
                        (unsigned long long)counter);                    \
            } else {                                                     \
                fprintf((file), "%llu\n", (unsigned long long)counter);  \
            }                                                            \
    )
/*
 * Same as `PROF_FILE` except that `file` is `stdout`.
 */
#define PROF_STDOUT() \
    PROF_FILE(stdout)
/*
 * Same as `PROF_FILE` except that `file` is `stderr`.
 */
#define PROF_STDERR() \
    PROF_FILE(stderr)
/* DEFAULTS ----------------------------------------------------------------- */
/*
* When the includer did not define `PROF_EVENT_LIST`, monitor a single
* hardware event: REF_CPU_CYCLES if the kernel headers provide it, otherwise
* CPU_CYCLES.
*/
#ifndef PROF_EVENT_LIST
#ifdef PERF_COUNT_HW_REF_CPU_CYCLES /* since Linux 3.3 */
#define PROF_EVENT_LIST PROF_EVENT_HW(REF_CPU_CYCLES)
#else
#define PROF_EVENT_LIST PROF_EVENT_HW(CPU_CYCLES)
#endif
#endif
/* UTILITY ------------------------------------------------------------------ */
/*
 * Abort the program when `x` is false.
 *
 * Before aborting, a one-line diagnostic with the source location is written
 * to stderr; when `errno` is non-zero its textual description is appended.
 * The whole message, including the final newline, goes to stderr so that it
 * is not split between two streams.
 */
#define PROF_ASSERT_(x)                                                 \
    do {                                                                \
        if (!(x)) {                                                     \
            fprintf(stderr, "# %s:%d: PROF error", __FILE__, __LINE__); \
            if (errno) {                                                \
                fprintf(stderr, " (%s)", strerror(errno));              \
            }                                                           \
            fprintf(stderr, "\n");                                      \
            abort();                                                    \
        }                                                               \
    } while (0)
/*
* Issue the perf ioctl `PERF_EVENT_IOC_<mode>` on `prof_fd_` with
* `PERF_IOC_FLAG_GROUP`, aborting through `PROF_ASSERT_` if it returns -1.
*/
#define PROF_IOCTL_(mode) \
do { \
PROF_ASSERT_(ioctl(prof_fd_, \
PERF_EVENT_IOC_ ## mode, \
PERF_IOC_FLAG_GROUP) != -1); \
} while (0)
/*
* Read `prof_event_cnt_ + 1` 64-bit words from `prof_fd_` into `buffer`,
* aborting through `PROF_ASSERT_` unless exactly that many bytes are read.
*/
#define PROF_READ_COUNTERS_(buffer) \
do { \
const ssize_t to_read = sizeof(uint64_t) * (prof_event_cnt_ + 1); \
PROF_ASSERT_(read(prof_fd_, buffer, to_read) == to_read); \
} while (0)
/* SETUP -------------------------------------------------------------------- */
/* Descriptor of the first event opened; later events are opened with it as
 * their group descriptor. -1 until the first event has been opened. */
static int prof_fd_;
/* Number of events opened by prof_init_. */
static uint64_t prof_event_cnt_;
/* Read buffer holding prof_event_cnt_ + 1 64-bit words. */
static uint64_t *prof_event_buf_;

/*
 * Open one perf event per (type, config) pair found in the variadic argument
 * list and allocate the buffer the counters are later read into.
 *
 * The variadic arguments are a sequence of `uint32_t type, uint64_t config`
 * pairs terminated by a single `(uint32_t)-1`. `dummy` only exists because a
 * variadic function needs one named parameter; its value is ignored.
 *
 * Any failure (event cannot be opened, buffer cannot be allocated) aborts the
 * program through PROF_ASSERT_.
 */
static void prof_init_(uint64_t dummy, ...) {
    uint32_t type;
    va_list ap;

    (void)dummy;
    prof_fd_ = -1;
    prof_event_cnt_ = 0;
    va_start(ap, dummy);
    while (type = va_arg(ap, uint32_t), type != (uint32_t)-1) {
        struct perf_event_attr pe;
        uint64_t config;
        int fd;

        config = va_arg(ap, uint64_t);
        memset(&pe, 0, sizeof(struct perf_event_attr));
        pe.size = sizeof(struct perf_event_attr);
        pe.read_format = PERF_FORMAT_GROUP;
        pe.type = type;
        pe.config = config;
#ifdef PROF_USER_EVENTS_ONLY
        pe.exclude_kernel = 1;
        pe.exclude_hv = 1;
#endif
        /* pid 0 / cpu -1; prof_fd_ is -1 for the first event and the first
         * event's descriptor for every following one. */
        fd = syscall(__NR_perf_event_open, &pe, 0, -1, prof_fd_, 0);
        PROF_ASSERT_(fd != -1);
        if (prof_fd_ == -1) {
            prof_fd_ = fd;
        }
        prof_event_cnt_++;
    }
    va_end(ap);
    prof_event_buf_ = (uint64_t *)malloc((prof_event_cnt_ + 1) *
                                         sizeof(uint64_t));
    /* Fail here instead of letting a later read() write through NULL. */
    PROF_ASSERT_(prof_event_buf_ != NULL);
}
/*
* Constructor: opens the events listed in `PROF_EVENT_LIST`.
*
* Every `PROF_EVENT*` invocation expands to "type, config," including the
* trailing comma (hence the commented-out comma below), so the list can be
* followed directly by the `(uint32_t)-1` terminator that `prof_init_` expects.
*/
void __attribute__((constructor)) prof_init()
{
prof_init_(0, PROF_EVENT_LIST /*,*/ (uint32_t)-1);
}
/*
* Destructor: closes `prof_fd_` (aborting if close fails) and frees the
* counter buffer.
*/
void __attribute__((destructor)) prof_fini()
{
PROF_ASSERT_(close(prof_fd_) != -1);
free(prof_event_buf_);
}
#endif
/*
* License
* -------
*
* Copyright (c) 2017 Andrea Cardaci <cyrus.and@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

View file

@ -1,314 +0,0 @@
//RandomX performance test for x86
//https://github.com/tevador/RandomX
//License: GPL v3
#include <cstdint>
#include <random>
#include <iostream>
#include <chrono>
#include <sstream>
#include <cmath>
#include <cstring>
#if defined(_WIN32) || defined(__MINGW32__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
#define WINDOWS
#include <io.h>
#include <fcntl.h>
#endif
#if defined(__GNUC__) && defined(__x86_64__)
#include <x86intrin.h>
using uint128_t = unsigned __int128;
using int128_t = __int128;

// Upper 64 bits of the full 128-bit unsigned product a * b.
static inline uint64_t umulhi64(uint64_t a, uint64_t b) {
    const uint128_t wide = static_cast<uint128_t>(a) * b;
    return static_cast<uint64_t>(wide >> 64);
}

// Upper 64 bits of the full 128-bit signed product a * b, returned as the
// raw 64-bit pattern.
static inline uint64_t imulhi64(int64_t a, int64_t b) {
    const int128_t wide = static_cast<int128_t>(a) * b;
    return static_cast<uint64_t>(wide >> 64);
}
// Portable names used by the benchmark; on this (GCC/clang, x86-64) branch
// the rotates map to __rorq/__rolq and forceinline is plain `inline`.
#define ror64 __rorq
#define rol64 __rolq
#define forceinline inline
#ifdef __clang__
// Rotate `a` left by `b` bits. The count is reduced modulo 64 so that a
// count of 0 never turns into a 64-bit shift (undefined behaviour in C++).
static inline uint64_t __rolq(uint64_t a, int b) {
    const unsigned count = static_cast<unsigned>(b) & 63u;
    return (a << count) | (a >> ((64u - count) & 63u));
}
// Rotate `a` right by `b` bits; same count handling as __rolq.
static inline uint64_t __rorq(uint64_t a, int b) {
    const unsigned count = static_cast<unsigned>(b) & 63u;
    return (a >> count) | (a << ((64u - count) & 63u));
}
#endif
#elif defined(_MSC_VER) && defined(_M_X64)
#include <intrin.h>
#include <stdlib.h>
// MSVC (x64) branch: the unsigned high multiply maps directly to the
// __umulh intrinsic.
#define umulhi64 __umulh
// High half of the signed 128-bit product a * b. _mul128 stores the high
// half through its third argument; its return value is discarded here.
static inline uint64_t imulhi64(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
// Rotates map to the MSVC intrinsics; forceinline maps to __forceinline.
#define ror64 _rotr64
#define rol64 _rotl64
#define forceinline __forceinline
#else
#error "Unsupported platform"
#endif
// One 64-bit machine word viewed as a double, as a signed/unsigned 64-bit
// integer, or as a signed/unsigned 32-bit integer.
union convertible_t {
    double f64;
    int64_t i64;
    uint64_t u64;
    int32_t i32;
    uint32_t u32;
};
// Every operation below has the shape (a, b, c): a and b are the inputs and
// c receives the result. Operations that ignore b still take it so that all
// of them can be driven through the same call site.

// c = a (64-bit copy).
forceinline void NOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t value = a.u64;
    c.u64 = value;
}
// c = a converted from signed 64-bit integer to double.
forceinline void FNOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double value = static_cast<double>(a.i64);
    c.f64 = value;
}
// 64-bit wrapping addition.
forceinline void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t sum = a.u64 + b.u64;
    c.u64 = sum;
}
// 32-bit wrapping addition, zero-extended to 64 bits.
forceinline void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t sum = a.u32 + b.u32;
    c.u64 = sum;
}
// 64-bit wrapping subtraction.
forceinline void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t difference = a.u64 - b.u64;
    c.u64 = difference;
}
// 32-bit wrapping subtraction, zero-extended to 64 bits.
forceinline void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t difference = a.u32 - b.u32;
    c.u64 = difference;
}
// Low 64 bits of the 64x64 product.
forceinline void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t product = a.u64 * b.u64;
    c.u64 = product;
}
// High 64 bits of the unsigned 64x64 product.
forceinline void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t high = umulhi64(a.u64, b.u64);
    c.u64 = high;
}
// Full 64-bit product of the two unsigned 32-bit halves.
forceinline void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t product = static_cast<uint64_t>(a.u32) * b.u32;
    c.u64 = product;
}
// Full 64-bit product of the two signed 32-bit halves.
forceinline void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const int64_t product = static_cast<int64_t>(a.i32) * b.i32;
    c.i64 = product;
}
// High 64 bits of the signed 64x64 product.
forceinline void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t high = imulhi64(a.i64, b.i64);
    c.i64 = high;
}
// Unsigned 64-bit / 32-bit division; a zero divisor is replaced by 1.
forceinline void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t divisor = (b.u32 != 0) ? b.u32 : 1U;
    c.u64 = a.u64 / divisor;
}
// Signed 64-bit / 32-bit division.
//  - A zero divisor is replaced by 1 (as before), so c = a.
//  - A divisor of -1 is handled as a wrapping negation. This gives the same
//    result as a / -1 for every value except INT64_MIN, where the plain
//    division overflows (undefined behaviour; a hardware fault on x86).
forceinline void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const int64_t dividend = a.i64;
    const int32_t divisor = b.i32;
    if (divisor == 0) {
        c.i64 = dividend;
    } else if (divisor == -1) {
        c.u64 = static_cast<uint64_t>(0) - static_cast<uint64_t>(dividend);
    } else {
        c.i64 = dividend / divisor;
    }
}
// Bitwise operations. The *_32 variants combine the low 32-bit halves and
// zero-extend the result to 64 bits.
forceinline void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t bits = a.u64 & b.u64;
    c.u64 = bits;
}
forceinline void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t bits = a.u32 & b.u32;
    c.u64 = bits;
}
forceinline void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t bits = a.u64 | b.u64;
    c.u64 = bits;
}
forceinline void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t bits = a.u32 | b.u32;
    c.u64 = bits;
}
forceinline void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t bits = a.u64 ^ b.u64;
    c.u64 = bits;
}
forceinline void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint32_t bits = a.u32 ^ b.u32;
    c.u64 = bits;
}
// Shifts and rotates of a by the low six bits of b.
forceinline void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t count = b.u64 & 63;
    c.u64 = a.u64 << count;
}
forceinline void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t count = b.u64 & 63;
    c.u64 = a.u64 >> count;
}
// Right shift of the signed view of a.
forceinline void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t count = b.u64 & 63;
    c.i64 = a.i64 >> count;
}
forceinline void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t count = b.u64 & 63;
    c.u64 = rol64(a.u64, count);
}
forceinline void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
    const uint64_t count = b.u64 & 63;
    c.u64 = ror64(a.u64, count);
}
// Floating-point arithmetic: both inputs are read as signed 64-bit integers,
// converted to double, combined, and the double result is stored in c.
forceinline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double lhs = static_cast<double>(a.i64);
    const double rhs = static_cast<double>(b.i64);
    c.f64 = lhs + rhs;
}
forceinline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double lhs = static_cast<double>(a.i64);
    const double rhs = static_cast<double>(b.i64);
    c.f64 = lhs - rhs;
}
forceinline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double lhs = static_cast<double>(a.i64);
    const double rhs = static_cast<double>(b.i64);
    c.f64 = lhs * rhs;
}
forceinline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double lhs = static_cast<double>(a.i64);
    const double rhs = static_cast<double>(b.i64);
    c.f64 = lhs / rhs;
}
// c = sqrt(|a|), where a is read as a signed 64-bit integer and converted to
// double; b is ignored.
//
// The scalar is loaded with _mm_load_sd, which reads exactly one double and
// has no alignment requirement. _mm_load_pd would read two doubles from a
// 16-byte aligned address, i.e. past the end of the 8-byte local `d`.
forceinline void FSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
    const double d = fabs((double)a.i64);
    c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&d)));
}
// MXCSR value installed by init_FPU(): the startup value with flush-to-zero
// switched on and the rounding-control bits cleared.
static uint32_t mxcsr;

// Stores a (read as a signed 64-bit integer) into c as a double, then
// installs new rounding-control bits taken from a shifted left by 13.
// The conversion is done before MXCSR is changed.
forceinline void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
    c.f64 = static_cast<double>(a.i64);
    const uint32_t roundingBits = static_cast<uint32_t>(a.u64 << 13) & _MM_ROUND_MASK;
    _mm_setcsr(mxcsr | roundingBits);
}

// Computes the baseline MXCSR value, remembers it in `mxcsr` and installs it.
inline void init_FPU() {
    const uint32_t withFlushToZero = _mm_getcsr() | _MM_FLUSH_ZERO_ON;
    mxcsr = withFlushToZero & ~_MM_ROUND_MASK;
    _mm_setcsr(mxcsr);
}
// Parses the whole of `buffer` as a value of type T.
//
// Returns true and stores the value in `out` on success. Returns false (after
// printing a diagnostic to standard output) when no value can be extracted or
// when anything other than whitespace follows the value, so that input such
// as "12abc" is rejected instead of being silently read as 12.
template<typename T>
bool tryParse(const char* buffer, T& out) {
    std::istringstream ss(buffer);
    const bool extracted = static_cast<bool>(ss >> out);
    // After skipping trailing whitespace the stream must be exhausted.
    if (!extracted || !(ss >> std::ws).eof()) {
        std::cout << "Invalid value '" << buffer << "'" << std::endl;
        return false;
    }
    return true;
}
//#define ITERATIONS 10000000
// Scratchpad geometry: 16 KiB of convertible_t words. SCRATCHPAD_16K(x)
// indexes the caller's local array `scratchpad`, wrapping x with
// SCRATCHPAD_MASK.
#define SCRATCHPAD_SIZE (16 * 1024)
#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))
#define SCRATCHPAD_MASK (SCRATCHPAD_SIZE / sizeof(convertible_t) - 1)
#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK]
// BENCHMARK(FUNC, TYPE): times `iterations` passes of FUNC and prints one
// table row "| FUNC | seconds | checksum |".
//
// The macro is not hygienic: it uses the caller's locals `scratchpad`,
// `input`, `iterations`, `r0`..`r7`, `tstart` and `tend` by name.
//
// Steps: refill the scratchpad from `input`; for each i apply FUNC eight
// times, taking operand a from slot i+8+k, operand b from register rk and
// writing the result to slot i+k, which is then XORed with register r(7-k);
// finally sum all scratchpad words as the checksum. TYPE is not used by the
// expansion.
#define BENCHMARK(FUNC,TYPE) do { \
memcpy((void*)scratchpad, input, SCRATCHPAD_SIZE); \
tstart = std::chrono::high_resolution_clock::now(); \
for (uint64_t i = 0; i < iterations; ++i) { \
FUNC(SCRATCHPAD_16K(i + 8 + 0), r0, SCRATCHPAD_16K(i + 0)); \
SCRATCHPAD_16K(i + 0).u64 ^= r7.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 1), r1, SCRATCHPAD_16K(i + 1)); \
SCRATCHPAD_16K(i + 1).u64 ^= r6.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 2), r2, SCRATCHPAD_16K(i + 2)); \
SCRATCHPAD_16K(i + 2).u64 ^= r5.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 3), r3, SCRATCHPAD_16K(i + 3)); \
SCRATCHPAD_16K(i + 3).u64 ^= r4.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 4), r4, SCRATCHPAD_16K(i + 4)); \
SCRATCHPAD_16K(i + 4).u64 ^= r3.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 5), r5, SCRATCHPAD_16K(i + 5)); \
SCRATCHPAD_16K(i + 5).u64 ^= r2.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 6), r6, SCRATCHPAD_16K(i + 6)); \
SCRATCHPAD_16K(i + 6).u64 ^= r1.u64;\
FUNC(SCRATCHPAD_16K(i + 8 + 7), r7, SCRATCHPAD_16K(i + 7)); \
SCRATCHPAD_16K(i + 7).u64 ^= r0.u64;\
} \
tend = std::chrono::high_resolution_clock::now(); \
uint64_t acum = 0; \
for (int i = 0; i < SCRATCHPAD_LENGTH; ++i) \
acum += scratchpad[i].u64; \
std::cout << "| " << #FUNC << " | " << std::chrono::duration<double>(tend - tstart).count() << " | " << acum << " |" << std::endl; \
} while(false)
// Entry point of the instruction benchmark.
//
// Usage: <program> [iterations] < 16 KiB of input data
// The optional first argument is the iteration count (default 100000000).
// Exactly sizeof(input) bytes are read from standard input and used both as
// the initial scratchpad contents and as the values of registers r0..r7.
// Returns 1 on an unparsable argument or on short input, 0 otherwise.
int main(int argc, char** argv) {
uint64_t iterations;
if (argc > 1) {
if (!tryParse(argv[1], iterations))
return 1;
}
else {
iterations = 100000000;
}
#ifdef WINDOWS
// Read stdin as raw bytes on Windows.
_setmode(_fileno(stdin), O_BINARY);
#endif
convertible_t input[SCRATCHPAD_LENGTH];
std::cout << "Reading " << sizeof(input) << " bytes from STDIN..." << std::endl;
std::cin.read((char*)input, sizeof(input));
if (!std::cin) {
std::cerr << "Insufficient input" << std::endl;
return 1;
}
// The names below (scratchpad, r0..r7, tstart, tend) are referenced directly
// by the BENCHMARK macro and must not be renamed.
convertible_t scratchpad[SCRATCHPAD_LENGTH];
convertible_t r0, r1, r2, r3, r4, r5, r6, r7;
r0.u64 = input[0].u64;
r1.u64 = input[1].u64;
r2.u64 = input[2].u64;
r3.u64 = input[3].u64;
r4.u64 = input[4].u64;
r5.u64 = input[5].u64;
r6.u64 = input[6].u64;
r7.u64 = input[7].u64;
std::chrono::high_resolution_clock::time_point tstart, tend;
// Header of the result table; each BENCHMARK prints one row.
std::cout << iterations << " iterations:" << std::endl << std::endl;
std::cout << "| operation | time [s] | (result) |" << std::endl;
std::cout << "|-----------|----------|----------|" << std::endl;
// Integer operations.
BENCHMARK(NOOP, u64);
BENCHMARK(ADD_64, u64);
BENCHMARK(ADD_32, u64);
BENCHMARK(SUB_64, u64);
BENCHMARK(SUB_32, u64);
BENCHMARK(MUL_64, u64);
BENCHMARK(MULH_64, u64);
BENCHMARK(MUL_32, u64);
BENCHMARK(IMUL_32, u64);
BENCHMARK(IMULH_64, u64);
BENCHMARK(DIV_64, u64);
BENCHMARK(IDIV_64, u64);
BENCHMARK(AND_64, u64);
BENCHMARK(AND_32, u64);
BENCHMARK(OR_64, u64);
BENCHMARK(OR_32, u64);
BENCHMARK(XOR_64, u64);
BENCHMARK(XOR_32, u64);
BENCHMARK(SHL_64, u64);
BENCHMARK(SHR_64, u64);
BENCHMARK(SAR_64, u64);
BENCHMARK(ROR_64, u64);
BENCHMARK(ROL_64, u64);
// Floating-point operations run after init_FPU() has installed the baseline
// MXCSR value. FROUND modifies MXCSR on every call and is run last.
init_FPU();
BENCHMARK(FNOOP, f64);
BENCHMARK(FADD, f64);
BENCHMARK(FSUB, f64);
BENCHMARK(FMUL, f64);
BENCHMARK(FDIV, f64);
BENCHMARK(FSQRT, f64);
BENCHMARK(FROUND, f64);
return 0;
}

Binary file not shown.

Binary file not shown.

View file

@ -1,595 +0,0 @@
import random
import sys
import os
# Number of instruction slots in a generated program.
PROGRAM_SIZE = 512
# Value the generated program's instruction counter `ic` starts from.
INSTRUCTION_COUNT = 1 << 20
# (instruction name, weight) pairs. Each instruction occupies `weight`
# consecutive opcode values; the weights add up to 256.
INSTRUCTION_WEIGHTS = [
    # integer add / subtract
    ("ADD_64", 16), ("ADD_32", 8),
    ("SUB_64", 16), ("SUB_32", 8),
    # integer multiply
    ("MUL_64", 7), ("MULH_64", 7), ("MUL_32", 7),
    ("IMUL_32", 7), ("IMULH_64", 7),
    # integer divide
    ("DIV_64", 1), ("IDIV_64", 1),
    # bitwise logic
    ("AND_64", 4), ("AND_32", 3),
    ("OR_64", 4), ("OR_32", 3),
    ("XOR_64", 4), ("XOR_32", 3),
    # shifts and rotates
    ("SHL_64", 6), ("SHR_64", 6), ("SAR_64", 6),
    ("ROL_64", 9), ("ROR_64", 9),
    # floating point
    ("FADD", 22), ("FSUB", 22), ("FMUL", 22),
    ("FDIV", 8), ("FSQRT", 6), ("FROUND", 2),
    # control flow
    ("CALL", 17), ("RET", 15),
]
def genBytes(count):
    """Return `count` random byte values (0-255) as a ', '-separated string."""
    values = []
    for _ in range(count):
        values.append(str(random.getrandbits(8)))
    return ', '.join(values)
class OperandType:
    """Numeric codes for the operand kinds understood by the writer helpers."""
    # Sequential codes 0..5, in this order.
    INT32, UINT32, INT64, UINT64, FLOAT, SHIFT = range(6)
def declareType(type):
    """Return the C type name used to declare a local of operand kind `type`.

    Returns None when `type` is not one of the six known codes.
    """
    c_types = dict(enumerate((
        "int32_t",   # INT32
        "uint32_t",  # UINT32
        "int64_t",   # INT64
        "uint64_t",  # UINT64
        "double",    # FLOAT
        "int32_t",   # SHIFT
    )))
    return c_types.get(type)
def toSigned32(x):
    """Subtract 2**32 from `x` when bit 31 is set; otherwise return `x` unchanged."""
    if x & 0x80000000:
        return x - 0x100000000
    return x


def toSigned64(x):
    """Subtract 2**64 from `x` when bit 63 is set; otherwise return `x` unchanged."""
    if x & 0x8000000000000000:
        return x - 0x10000000000000000
    return x
def immediateTo(symbol, type):
    """Return the literal (as source text) of the symbol's immediate for `type`.

    Signed kinds use the sign-converted 32-bit immediate, unsigned kinds use it
    as is, FLOAT uses the signed immediate shifted left by 32 as a float, and
    SHIFT uses the low six bits of `imm0`. Unknown kinds yield 'None'.
    """
    signed_imm = toSigned32(symbol.imm1)
    literals = {
        0: signed_imm,
        1: symbol.imm1,
        2: signed_imm,
        3: symbol.imm1,
        4: float(signed_imm << 32),
        5: symbol.imm0 & 63,
    }
    return repr(literals.get(type))
def registerTo(expr, type):
    """Wrap register expression `expr` so it can be read as operand kind `type`.

    Signed kinds get an (int64_t) cast, SHIFT is masked to six bits, the other
    kinds are passed through unchanged.
    """
    templates = dict(enumerate((
        "(int64_t){0}",  # INT32
        "{0}",           # UINT32
        "(int64_t){0}",  # INT64
        "{0}",           # UINT64
        "{0}",           # FLOAT
        "({0} & 63)",    # SHIFT
    )))
    template = templates.get(type)
    return template.format(expr)
def registerFrom(num, type):
    """Return the expression that reads register number `num` as 64-bit data.

    Integer kinds read `r<num>` directly; FLOAT reads `f<num>` through a
    convertible_t cast and its `.u64` member.
    """
    templates = dict(enumerate((
        "r{0}",                       # INT32
        "r{0}",                       # UINT32
        "r{0}",                       # INT64
        "r{0}",                       # UINT64
        "((convertible_t)f{0}).u64",  # FLOAT
        "r{0}",                       # SHIFT
    )))
    template = templates.get(type)
    return template.format(num)
def convertibleTo(expr, type):
    """Return the expression that reads convertible_t `expr` as kind `type`.

    FLOAT reads the `.i64` member with a (double) cast; SHIFT reads `.u64`
    masked to six bits; the integer kinds read the matching member.
    """
    templates = dict(enumerate((
        "{0}.i32",          # INT32
        "{0}.u32",          # UINT32
        "{0}.i64",          # INT64
        "{0}.u64",          # UINT64
        "(double){0}.i64",  # FLOAT
        "({0}.u64 & 63)",   # SHIFT
    )))
    template = templates.get(type)
    return template.format(expr)
def convertibleFrom(expr, type):
    """Return the member of convertible_t `expr` that a `type` result is stored to.

    FLOAT selects `.f64`; SHIFT selects `.u64` masked to six bits; the integer
    kinds select the matching member.
    """
    templates = dict(enumerate((
        "{0}.i32",         # INT32
        "{0}.u32",         # UINT32
        "{0}.i64",         # INT64
        "{0}.u64",         # UINT64
        "{0}.f64",         # FLOAT
        "({0}.u64 & 63)",  # SHIFT
    )))
    template = templates.get(type)
    return template.format(expr)
def getRegister(num, type):
    """Return the register name for number `num`: `f<num>` for FLOAT, else `r<num>`."""
    names = dict(enumerate((
        "r{0}",  # INT32
        "r{0}",  # UINT32
        "r{0}",  # INT64
        "r{0}",  # UINT64
        "f{0}",  # FLOAT
        "r{0}",  # SHIFT
    )))
    name = names.get(type)
    return name.format(num)
def writeInitialValues(file):
    """Write the C statements that set up the generated program's state.

    Emits, in order: the optional (RAM) DRAM buffer allocation and fill, the
    start-of-run clock, the r0-r7 and f0-f7 loads from `aesSeed`, the
    scratchpad fill, the mmu fields, sp, the instruction counter and MXCSR.
    """
    dram_setup = (
        "#ifdef RAM\n",
        "\tmmu.buffer = (char*)_mm_malloc(DRAM_SIZE, 16);\n",
        "\tif(!mmu.buffer) {\n",
        '\t\tprintf("DRAM buffer allocation failed\\n");\n',
        "\t\treturn 1;\n",
        "\t}\n",
        '\tprintf("Initializing DRAM buffer...\\n");\n',
        "\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)mmu.buffer, DRAM_SIZE);\n",
        "#endif\n",
        "\tclock_t clockStart = clock(), clockEnd;\n",
    )
    for line in dram_setup:
        file.write(line)

    # Integer registers come from the first 64 seed bytes, float registers
    # from the following 64.
    for reg in range(8):
        file.write("\tr{0} = *(uint64_t*)(aesSeed + {1});\n".format(reg, reg * 8))
    for reg in range(8):
        file.write("\tf{0} = *(int64_t*)(aesSeed + {1});\n".format(reg, 64 + reg * 8))

    machine_state = (
        "\taesInitialize((__m128i*)aesKey, (__m128i*)aesSeed, (__m128i*)scratchpad, SCRATCHPAD_SIZE);\n",
        "\tmmu.ma = *(addr_t*)(aesKey + 8) & ~7U;\n",
        "#ifdef PRNTADDR\n",
        '\tprintf("DRAM address = %#010x\\n", mmu.ma);\n',
        "#endif\n",
        "\tmmu.mx = 0;\n",
        "\tsp = 0;\n",
        "\tic = {0};\n".format(INSTRUCTION_COUNT),
        "\tmxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK; //flush denormals to zero, round to nearest\n",
        "\t_mm_setcsr(mxcsr);\n",
    )
    for line in machine_state:
        file.write(line)
def writeEpilog(file):
file.write("\tend:\n")
file.write("\t\tclockEnd = clock();\n")
for i in range(8):
file.write('\t\tprintf("r{0} = %-36" PRIu64 " f{0} = %g\\n", r{0}, f{0});\n'.format(i))
file.write(("\t\tuint64_t spadsum = 0;\n"
"\t\tfor(int i = 0; i < SCRATCHPAD_LENGTH; ++i) {\n"
"\t\t spadsum += scratchpad[i].u64;\n"
"\t\t}\n"
'\t\tprintf("scratchpad sum = %" PRIu64 "\\n", spadsum);\n'
'\t\tprintf("runtime: %f\\n", (clockEnd - clockStart) / (double)CLOCKS_PER_SEC);\n'
"#ifdef RAM\n"
"\t\t_mm_free((void*)mmu.buffer);\n"
"#endif\n"))
file.write("\t\treturn 0;")
file.write("}")
def writeCommon(file, i, symbol, type, name):
    """Write the opening lines shared by every generated instruction.

    Emits the `i_<i>:` label with `name` as a comment, the instruction-counter
    check that jumps to `end`, the XOR of register `rega` with `addr0`, and
    the declaration of `addr`. `type` is not used here.
    """
    register = symbol.rega
    header = [
        "\ti_{0}: {{ //{1}\n".format(i, name),
        "\t\tif(0 == ic--) goto end;\n",
        "\t\tr{0} ^= {1};\n".format(register, symbol.addr0),
        "\t\taddr_t addr = r{0};\n".format(register),
    ]
    for line in header:
        file.write(line)
def readA(symbol, type):
    """Return the C expression that reads operand A as kind `type`.

    `symbol.loca` selects the source: 0-3 read DRAM, 4 reads the 256K
    scratchpad window, 5-7 read the 16K scratchpad window.
    """
    loca = symbol.loca
    if loca in (0, 1, 2, 3):
        source = "readDram(&mmu, addr)"
    elif loca == 4:
        source = "SCRATCHPAD_256K(addr)"
    elif loca in (5, 6, 7):
        source = "SCRATCHPAD_16K(addr)"
    else:
        source = None
    return convertibleTo(source, type)
def writeC(symbol, type):
    """Return the C lvalue that receives result C for kind `type`.

    `symbol.locc` selects the destination: 0 is the 256K scratchpad window,
    1-3 the 16K window (both addressed by r<regc> ^ addr1), and 4-7 the
    register `regc` itself.
    """
    locc = symbol.locc
    if locc == 0:
        template = "SCRATCHPAD_256K(r{0} ^ {1})"
    elif locc in (1, 2, 3):
        template = "SCRATCHPAD_16K(r{0} ^ {1})"
    elif locc in (4, 5, 6, 7):
        template = ""
    else:
        template = None
    if template == "":
        return getRegister(symbol.regc, type)
    target = template.format(symbol.regc, symbol.addr1)
    return convertibleFrom(target, type)
def readB(symbol, type):
    """Return the C expression for operand B as kind `type`.

    A `locb` of 6 or more selects the immediate; lower values select register
    `regb`.
    """
    if symbol.locb >= 6:
        return immediateTo(symbol, type)
    register = getRegister(symbol.regb, type)
    return registerTo(register, type)
class CodeSymbol:
    """One instruction word split into its fields."""

    def __init__(self, qi):
        """Decode the integer `qi`.

        Bits 0-7 hold the opcode. The next six bytes each carry a 3-bit field
        in their low bits (loca, rega, locb, regb, locc, regc). Bits 56-63
        hold imm0, bits 64-95 addr0, and everything from bit 96 upwards is
        shared by addr1 and imm1.
        """
        self.opcode = qi & 0xFF
        three_bit_fields = ("loca", "rega", "locb", "regb", "locc", "regc")
        for position, field in enumerate(three_bit_fields):
            setattr(self, field, (qi >> (8 * (position + 1))) & 7)
        self.imm0 = (qi >> 56) & 0xFF
        self.addr0 = (qi >> 64) & 0xFFFFFFFF
        upper = qi >> 96
        self.imm1 = upper
        self.addr1 = upper
def writeOperation(file, i, symbol, type, name, op):
    """Write a complete binary instruction of the form `C = A <op> B`.

    Both operands are declared with the C type of `type`, and the result goes
    to the destination chosen by writeC.
    """
    writeCommon(file, i, symbol, type, name)
    ctype = declareType(type)
    operand_line = "\t\t{0} {1} = {2};\n"
    file.write(operand_line.format(ctype, "A", readA(symbol, type)))
    file.write(operand_line.format(ctype, "B", readB(symbol, type)))
    destination = writeC(symbol, type)
    file.write("\t\t{0} = A {1} B; }}\n".format(destination, op))
# Add / subtract / multiply writers: thin wrappers around writeOperation that
# only choose the operand kind, the label comment and the C operator.

def write_ADD_64(file, i, symbol):
    """Emit a 64-bit unsigned addition."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='ADD_64', op='+')


def write_ADD_32(file, i, symbol):
    """Emit a 32-bit unsigned addition."""
    writeOperation(file, i, symbol, type=OperandType.UINT32, name='ADD_32', op='+')


def write_SUB_64(file, i, symbol):
    """Emit a 64-bit unsigned subtraction."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='SUB_64', op='-')


def write_SUB_32(file, i, symbol):
    """Emit a 32-bit unsigned subtraction."""
    writeOperation(file, i, symbol, type=OperandType.UINT32, name='SUB_32', op='-')


def write_MUL_64(file, i, symbol):
    """Emit a 64-bit unsigned multiplication."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='MUL_64', op='*')
def _writeMultiply(file, i, symbol, type, name, resultType, expression):
    """Emit a multiply whose operands are `type` and whose result is `resultType`."""
    writeCommon(file, i, symbol, type, name)
    ctype = declareType(type)
    file.write("\t\t{0} A = {1};\n".format(ctype, readA(symbol, type)))
    file.write("\t\t{0} B = {1};\n".format(ctype, readB(symbol, type)))
    file.write("\t\t{0} = {1}; }}\n".format(writeC(symbol, resultType), expression))


def write_MULH_64(file, i, symbol):
    """Emit the high 64 bits of an unsigned 64x64 multiplication."""
    _writeMultiply(file, i, symbol, OperandType.UINT64, 'MULH_64',
                   OperandType.UINT64, "((uint128_t)A * B) >> 64")


def write_MUL_32(file, i, symbol):
    """Emit an unsigned 32x32 multiplication with a 64-bit result."""
    _writeMultiply(file, i, symbol, OperandType.UINT32, 'MUL_32',
                   OperandType.UINT64, "(uint64_t)A * B")


def write_IMUL_32(file, i, symbol):
    """Emit a signed 32x32 multiplication with a 64-bit result."""
    _writeMultiply(file, i, symbol, OperandType.INT32, 'IMUL_32',
                   OperandType.INT64, "(int64_t)A * B")


def write_IMULH_64(file, i, symbol):
    """Emit the high 64 bits of a signed 64x64 multiplication."""
    _writeMultiply(file, i, symbol, OperandType.INT64, 'IMULH_64',
                   OperandType.INT64, "((int128_t)A * B) >> 64")
def write_DIV_64(file, i, symbol):
    """Emit an unsigned 64-bit by 32-bit division.

    The generated code replaces a zero divisor by 1 before dividing. The
    zero-check line is a fixed string, so it is written as is instead of
    going through a `.format()` call whose arguments were never used.
    """
    type = OperandType.UINT64
    writeCommon(file, i, symbol, type, 'DIV_64')
    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.UINT32), readB(symbol, OperandType.UINT32)))
    file.write("\t\tif(B == 0) B = 1;\n")
    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))


def write_IDIV_64(file, i, symbol):
    """Emit a signed 64-bit by 32-bit division.

    The generated code replaces a zero divisor by 1 before dividing. The
    zero-check line is a fixed string, so it is written as is instead of
    going through a `.format()` call whose arguments were never used.
    """
    type = OperandType.INT64
    writeCommon(file, i, symbol, type, 'IDIV_64')
    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
    file.write("\t\t{0} B = {1};\n".format(declareType(OperandType.INT32), readB(symbol, OperandType.INT32)))
    file.write("\t\tif(B == 0) B = 1;\n")
    file.write("\t\t{0} = A / B; }}\n".format(writeC(symbol, type)))
# Bitwise writers: thin wrappers around writeOperation that only choose the
# operand width, the label comment and the C operator.

def write_AND_64(file, i, symbol):
    """Emit a 64-bit bitwise AND."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='AND_64', op='&')


def write_AND_32(file, i, symbol):
    """Emit a 32-bit bitwise AND."""
    writeOperation(file, i, symbol, type=OperandType.UINT32, name='AND_32', op='&')


def write_OR_64(file, i, symbol):
    """Emit a 64-bit bitwise OR."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='OR_64', op='|')


def write_OR_32(file, i, symbol):
    """Emit a 32-bit bitwise OR."""
    writeOperation(file, i, symbol, type=OperandType.UINT32, name='OR_32', op='|')


def write_XOR_64(file, i, symbol):
    """Emit a 64-bit bitwise XOR."""
    writeOperation(file, i, symbol, type=OperandType.UINT64, name='XOR_64', op='^')


def write_XOR_32(file, i, symbol):
    """Emit a 32-bit bitwise XOR."""
    writeOperation(file, i, symbol, type=OperandType.UINT32, name='XOR_32', op='^')
def _writeShift(file, i, symbol, type, name, expression):
    """Emit a shift or rotate: A has kind `type`, B is a SHIFT operand."""
    writeCommon(file, i, symbol, type, name)
    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
    count_kind = OperandType.SHIFT
    file.write("\t\t{0} B = {1};\n".format(declareType(count_kind), readB(symbol, count_kind)))
    file.write("\t\t{0} = {1}; }}\n".format(writeC(symbol, type), expression))


def write_SHL_64(file, i, symbol):
    """Emit a 64-bit left shift."""
    _writeShift(file, i, symbol, OperandType.UINT64, 'SHL_64', "A << B")


def write_SHR_64(file, i, symbol):
    """Emit a 64-bit right shift of an unsigned value."""
    _writeShift(file, i, symbol, OperandType.UINT64, 'SHR_64', "A >> B")


def write_SAR_64(file, i, symbol):
    """Emit a 64-bit right shift of a signed value."""
    _writeShift(file, i, symbol, OperandType.INT64, 'SAR_64', "A >> B")


def write_ROL_64(file, i, symbol):
    """Emit a 64-bit left rotate via __rolq."""
    _writeShift(file, i, symbol, OperandType.UINT64, 'ROL_64', "__rolq(A, B)")


def write_ROR_64(file, i, symbol):
    """Emit a 64-bit right rotate via __rorq."""
    _writeShift(file, i, symbol, OperandType.UINT64, 'ROR_64', "__rorq(A, B)")
# Floating-point arithmetic writers: thin wrappers around writeOperation with
# FLOAT operands.

def write_FADD(file, i, symbol):
    """Emit a double-precision addition."""
    writeOperation(file, i, symbol, type=OperandType.FLOAT, name='FADD', op='+')


def write_FSUB(file, i, symbol):
    """Emit a double-precision subtraction."""
    writeOperation(file, i, symbol, type=OperandType.FLOAT, name='FSUB', op='-')


def write_FMUL(file, i, symbol):
    """Emit a double-precision multiplication."""
    writeOperation(file, i, symbol, type=OperandType.FLOAT, name='FMUL', op='*')


def write_FDIV(file, i, symbol):
    """Emit a double-precision division."""
    writeOperation(file, i, symbol, type=OperandType.FLOAT, name='FDIV', op='/')
def write_FSQRT(file, i, symbol):
    """Emit C = sqrt(|A|) computed with the SSE2 scalar square root.

    The generated code loads the local double `A` with `_mm_load_sd`, which
    reads exactly one double and has no alignment requirement. The previous
    `_mm_load_pd(&A)` reads two doubles from a 16-byte aligned address, which
    an 8-byte local does not provide.
    """
    type = OperandType.FLOAT
    writeCommon(file, i, symbol, type, 'FSQRT')
    file.write("\t\t{0} A = fabs({1});\n".format(declareType(type), readA(symbol, type)))
    file.write("\t\t{0} = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&A))); }}\n".format(writeC(symbol, type)))
def write_FROUND(file, i, symbol):
    """Emit FROUND: store A into a FLOAT destination, then update MXCSR.

    A is read as a signed 64-bit integer; the generated code takes the new
    rounding-control bits from A shifted left by 13.
    """
    result_kind = OperandType.FLOAT
    source_kind = OperandType.INT64
    writeCommon(file, i, symbol, result_kind, 'FROUND')
    source = readA(symbol, source_kind)
    file.write("\t\t{0} A = {1};\n".format(declareType(source_kind), source))
    destination = writeC(symbol, result_kind)
    file.write("\t\t{0} = A;\n".format(destination))
    file.write("\t\t_mm_setcsr(mxcsr | ((uint32_t)(A << 13) & _MM_ROUND_MASK)); }\n")
def write_CALL(file, i, symbol):
    """Emit a CALL instruction.

    The generated code pushes A and the address of the next instruction, then
    jumps forward by 1 + (imm0 masked to a quarter of the program size),
    wrapping at PROGRAM_SIZE. When `locb` is below 6 the call is guarded by
    `(uint32_t)r<regb> <= imm1`, and on the not-taken path A is stored to C.
    """
    type = OperandType.UINT64
    conditional = symbol.locb < 6
    wrap = PROGRAM_SIZE - 1
    return_slot = (i + 1) & wrap
    jump_slot = (i + 1 + (symbol.imm0 & ((PROGRAM_SIZE >> 2) - 1))) & wrap

    writeCommon(file, i, symbol, type, 'CALL')
    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))
    if conditional:
        file.write("\t\tif((uint32_t)r{0} <= {1}) {{\n".format(symbol.regb, symbol.imm1))
    file.write("\t\t\tPUSH_VALUE(A);\n")
    file.write("\t\t\tPUSH_ADDRESS(&&i_{0});\n".format(return_slot))
    file.write("\t\t\tgoto i_{0};\n".format(jump_slot))
    if conditional:
        file.write("\t\t}}\n\t\t{0} = A;".format(writeC(symbol, type)))
    file.write("\t\t}\n")
def write_RET(file, i, symbol):
    """Emit a RET instruction.

    The generated code returns only when the stack is not empty (and, when
    `locb` is below 6, `(uint32_t)r<regb> <= imm1` holds). On return it pops
    the target address and a value, stores A ^ value to C and jumps to the
    target. Otherwise it stores A to C and falls through.
    """
    type = OperandType.UINT64
    writeCommon(file, i, symbol, type, 'RET')
    file.write("\t\t{0} A = {1};\n".format(declareType(type), readA(symbol, type)))

    condition = "\t\tif(!STACK_IS_EMPTY()"
    file.write(condition)
    if symbol.locb < 6:
        file.write(" && (uint32_t)r{0} <= {1}".format(symbol.regb, symbol.imm1))
    file.write(") {\n")

    for line in ("\t\t\tvoid* target = POP_ADDRESS();\n",
                 "\t\t\tuint64_t C = POP_VALUE();\n"):
        file.write(line)
    file.write("\t\t\t{0} = A ^ C;\n".format(writeC(symbol, type)))
    file.write("\t\t\tgoto *target;\n")
    file.write("\t\t}}\n\t\t{0} = A; }}\n".format(writeC(symbol, type)))
# Opcode value (0-255) -> writer function; filled in by buildOpcodeMap().
opcodeMap = {}


def buildOpcodeMap():
    """Fill `opcodeMap` from INSTRUCTION_WEIGHTS.

    Each instruction gets `weight` consecutive opcode values, all mapped to
    the module-level function named `write_<instruction>`. The weights must
    add up to exactly 256.
    """
    namespace = globals()
    next_opcode = 0
    for instruction, weight in INSTRUCTION_WEIGHTS:
        writer = namespace['write_' + instruction]
        for _ in range(weight):
            opcodeMap[next_opcode] = writer
            next_opcode += 1
    assert next_opcode == 256
def writeCode(file, i, symbol):
    """Write instruction number `i` using the writer registered for its opcode."""
    writer = opcodeMap.get(symbol.opcode)
    writer(file, i, symbol)
def writeMain(file):
    """Emit the opening of the generated C main() and its local declarations.

    Declares the integer and floating point registers, the instruction counter
    and stack pointer, the stack, the scratchpad, the mmu state and the saved
    mxcsr value. The function body is left open for the code that follows.
    """
    declarations = [
        '__attribute__((optimize("Os"))) int main() {\n',
        " register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n",
        " register double f0, f1, f2, f3, f4, f5, f6, f7;\n",
        " register uint64_t ic, sp;\n",
        " stack_t stack[STACK_LENGTH];\n",
        " convertible_t scratchpad[SCRATCHPAD_LENGTH] __attribute__ ((aligned (16)));\n",
        " mmu_t mmu;\n",
        " uint32_t mxcsr;\n",
    ]
    file.write("".join(declarations))
# writeProlog(file): write the fixed preamble of the generated C program.
# The preamble consists of, in order:
#   - the standard and intrinsic headers the generated code relies on,
#   - the convertible_t / stack_t / mmu_t types,
#   - scratchpad, stack and DRAM helper macros (DRAM_READ reads from a real
#     buffer when RAM is defined, otherwise it computes a value from the address),
#   - readDram(), which returns the current DRAM word, advances mmu->ma by 8 and
#     jumps to mmu->mx whenever the low 13 bits of mx become zero,
#   - AES key expansion (aes_genkey) and aesInitialize(), which fills `output`
#     by repeatedly running ten AES rounds over eight 128-bit lanes.
# Everything is emitted through a single file.write call.
def writeProlog(file):
file.write(("#include <stdint.h>\n"
"#include <time.h>\n"
"#include <stdio.h>\n"
"#include <x86intrin.h>\n"
"#include <emmintrin.h>\n"
"#include <wmmintrin.h>\n"
"#include <math.h>\n"
"#include <inttypes.h>\n"
"typedef uint32_t addr_t;\n"
"typedef unsigned __int128 uint128_t;\n"
"typedef __int128 int128_t;\n"
"typedef unsigned char byte;\n"
"typedef union {\n"
" double f64;\n"
" int64_t i64;\n"
" uint64_t u64;\n"
" int32_t i32;\n"
" uint32_t u32;\n"
"} convertible_t;\n"
"typedef union {\n"
" uint64_t value;\n"
" void* address;\n"
"} stack_t;\n"
"typedef struct {\n"
" addr_t ma;\n"
" addr_t mx;\n"
"#ifdef RAM\n"
" const char* buffer;\n"
"#endif\n"
"} mmu_t;\n"
"#define DRAM_SIZE (1ULL << 32)\n"
"#define SCRATCHPAD_SIZE (256 * 1024)\n"
"#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n"
"#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n"
"#define SCRATCHPAD_MASK18 (SCRATCHPAD_LENGTH - 1)\n"
"#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK14]\n"
"#define SCRATCHPAD_256K(x) scratchpad[(x) & SCRATCHPAD_MASK18]\n"
"#define STACK_LENGTH (128 * 1024)\n"
"#ifdef RAM\n"
"#define DRAM_READ(mmu) (convertible_t)*(uint64_t*)((mmu)->buffer + (mmu)->ma)\n"
"#define PREFETCH(mmu) _mm_prefetch(((mmu)->buffer + (mmu)->ma), _MM_HINT_T0)\n"
"#else\n"
"#define DRAM_READ(mmu) (convertible_t)(uint64_t)__rolq(6364136223846793005ULL*((mmu)->ma)+1442695040888963407ULL,32)\n"
"#define PREFETCH(mmu)\n"
"#endif\n"
"#define PUSH_VALUE(x) stack[sp++].value = x\n"
"#define PUSH_ADDRESS(x) stack[sp++].address = x\n"
"#define STACK_IS_EMPTY() (sp == 0)\n"
"#define POP_VALUE() stack[--sp].value\n"
"#define POP_ADDRESS() stack[--sp].address\n"
"static convertible_t readDram(mmu_t* mmu, addr_t addr) {\n"
" convertible_t data;\n"
" data = DRAM_READ(mmu);\n"
" mmu->ma += 8;\n"
" mmu->mx ^= addr;\n"
" if((mmu->mx & 0x1FFF) == 0) {\n"
"#ifdef PRNTADDR\n"
' printf("DRAM jump %#010x -> %#010x\\n", mmu->ma, mmu->mx);\n'
"#endif\n"
" mmu->ma = mmu->mx;\n"
"#ifdef PREF\n"
" PREFETCH(mmu);\n"
"#endif\n"
" }\n"
" return data;\n"
"}\n"
"static inline __m128i sl_xor(__m128i tmp1) {\n"
" __m128i tmp4;\n"
" tmp4 = _mm_slli_si128(tmp1, 0x04);\n"
" tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
" tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
" tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
" tmp4 = _mm_slli_si128(tmp4, 0x04);\n"
" tmp1 = _mm_xor_si128(tmp1, tmp4);\n"
" return tmp1;\n"
"}\n"
"#define AES_GENKEY_SUB(rcon) do { \\\n"
" __m128i xout1 = _mm_aeskeygenassist_si128(xout2, rcon); \\\n"
" xout1 = _mm_shuffle_epi32(xout1, 0xFF); \\\n"
" xout0 = sl_xor(xout0); \\\n"
" xout0 = _mm_xor_si128(xout0, xout1); \\\n"
" xout1 = _mm_aeskeygenassist_si128(xout0, 0x00); \\\n"
" xout1 = _mm_shuffle_epi32(xout1, 0xAA); \\\n"
" xout2 = sl_xor(xout2); \\\n"
" xout2 = _mm_xor_si128(xout2, xout1); } while(0)\n"
"static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) {\n"
" __m128i xout0, xout2;\n"
" xout0 = _mm_load_si128(memory);\n"
" xout2 = _mm_load_si128(memory+1);\n"
" *k0 = xout0;\n"
" *k1 = xout2;\n"
" AES_GENKEY_SUB(0x01);\n"
" *k2 = xout0;\n"
" *k3 = xout2;\n"
" AES_GENKEY_SUB(0x02);\n"
" *k4 = xout0;\n"
" *k5 = xout2;\n"
" AES_GENKEY_SUB(0x04);\n"
" *k6 = xout0;\n"
" *k7 = xout2;\n"
" AES_GENKEY_SUB(0x08);\n"
" *k8 = xout0;\n"
" *k9 = xout2;\n"
"}\n"
"static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) {\n"
" *x0 = _mm_aesenc_si128(*x0, key);\n"
" *x1 = _mm_aesenc_si128(*x1, key);\n"
" *x2 = _mm_aesenc_si128(*x2, key);\n"
" *x3 = _mm_aesenc_si128(*x3, key);\n"
" *x4 = _mm_aesenc_si128(*x4, key);\n"
" *x5 = _mm_aesenc_si128(*x5, key);\n"
" *x6 = _mm_aesenc_si128(*x6, key);\n"
" *x7 = _mm_aesenc_si128(*x7, key);\n"
"}\n"
"static void aesInitialize(__m128i* key, __m128i* seed, __m128i* output, size_t count) {\n"
" \n"
" __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;\n"
" __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;\n"
" \n"
" aes_genkey(key, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);\n"
" \n"
" xin0 = _mm_load_si128(seed + 0);\n"
" xin1 = _mm_load_si128(seed + 1);\n"
" xin2 = _mm_load_si128(seed + 2);\n"
" xin3 = _mm_load_si128(seed + 3);\n"
" xin4 = _mm_load_si128(seed + 4);\n"
" xin5 = _mm_load_si128(seed + 5);\n"
" xin6 = _mm_load_si128(seed + 6);\n"
" xin7 = _mm_load_si128(seed + 7);\n"
" \n"
" for (size_t i = 0; i < count / sizeof(__m128i); i += 8)\n"
" {\n"
" aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);\n"
" \n"
" _mm_store_si128(output + i + 0, xin0);\n"
" _mm_store_si128(output + i + 1, xin1);\n"
" _mm_store_si128(output + i + 2, xin2);\n"
" _mm_store_si128(output + i + 3, xin3);\n"
" _mm_store_si128(output + i + 4, xin4);\n"
" _mm_store_si128(output + i + 5, xin5);\n"
" _mm_store_si128(output + i + 6, xin6);\n"
" _mm_store_si128(output + i + 7, xin7);\n"
" }\n"
"}\n"))
# Script driver: writes one complete generated C program to standard output.
# Order of emission: prolog, AES key and seed byte arrays (from genBytes),
# the opening of main(), the initial values, PROGRAM_SIZE instructions each
# decoded from 128 fresh random bits, a jump back to the first instruction
# (only when the program is not empty) and finally the epilog.
# NOTE(review): indentation was lost in this listing; presumably every statement
# below belongs to the `with` body - confirm against the original file.
with sys.stdout as file:
buildOpcodeMap()
writeProlog(file)
file.write("const byte aesKey[32] = {{ {0} }};\n".format(genBytes(32)))
file.write("const byte aesSeed[128] = {{ {0} }};\n".format(genBytes(128)))
writeMain(file)
writeInitialValues(file)
for i in range(PROGRAM_SIZE):
writeCode(file, i, CodeSymbol(random.getrandbits(128)))
if PROGRAM_SIZE > 0:
file.write("\t\tgoto i_0;\n")
writeEpilog(file)

View file

@ -1,69 +0,0 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
// Include guard: this header defines constants, a typedef and inline functions,
// so including it twice in one translation unit would otherwise be an error.
#ifndef RANDOMX_INSTRUCTIONS_H
#define RANDOMX_INSTRUCTIONS_H
#include <cstdint>
namespace RandomX {
	// Rounding mode selectors understood by FROUND (taken from the low two bits of its operand a).
	constexpr int RoundToNearest = 0;
	constexpr int RoundDown = 1;
	constexpr int RoundUp = 2;
	constexpr int RoundToZero = 3;
	// One 64-bit VM value that can be viewed as a double or as a signed/unsigned 64- or 32-bit integer.
	typedef union {
		double f64;
		int64_t i64;
		uint64_t u64;
		int32_t i32;
		uint32_t u32;
	} convertible_t;
	extern "C" {
		// Integer instructions: read operands a and b, write the result to c.
		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c);
		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c);
		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c);
		// Resets the floating point environment used by the instructions below.
		void FPINIT();
		// Floating point instructions: a is converted from its signed integer view,
		// b is passed as an already converted double, the result is stored in c.f64.
		void FADD_64(convertible_t& a, double b, convertible_t& c);
		void FSUB_64(convertible_t& a, double b, convertible_t& c);
		void FMUL_64(convertible_t& a, double b, convertible_t& c);
		void FDIV_64(convertible_t& a, double b, convertible_t& c);
		void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c);
		void FROUND(convertible_t& a, convertible_t& b, convertible_t& c);
		// Convenience wrappers with the uniform (a, b, c) signature: b is
		// converted from its signed 64-bit integer view before the call.
		inline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
			FADD_64(a, (double)b.i64, c);
		}
		inline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
			FSUB_64(a, (double)b.i64, c);
		}
		inline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
			FMUL_64(a, (double)b.i64, c);
		}
		inline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
			FDIV_64(a, (double)b.i64, c);
		}
	}
}
#endif // RANDOMX_INSTRUCTIONS_H

View file

@ -1,247 +0,0 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include "Instructions.h"
#include <cfenv>
#include <cmath>
#if defined(__SIZEOF_INT128__)
// The compiler offers a native 128-bit integer type, so the high half of a
// 64x64-bit product is obtained by widening, multiplying and shifting.
typedef unsigned __int128 uint128_t;
typedef __int128 int128_t;
// High 64 bits of the unsigned 128-bit product a * b.
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
	const uint128_t product = static_cast<uint128_t>(a) * b;
	return static_cast<uint64_t>(product >> 64);
}
// High 64 bits of the signed 128-bit product a * b, returned as a bit pattern.
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
	const int128_t product = static_cast<int128_t>(a) * b;
	return static_cast<uint64_t>(product >> 64);
}
#define umulhi64 __umulhi64
#define imulhi64 __imulhi64
#endif
// Microsoft compiler: map the helper names onto compiler intrinsics where available.
#if defined(_MSC_VER)
// EVAL_DEFINE(X) pastes a 0 after the expansion of X so the result can be used in #if.
// NOTE(review): presumably the __MACHINE* macros from <intrin.h> expand to their
// argument on the matching architecture and to nothing otherwise - confirm.
#define HAS_VALUE(X) X ## 0
#define EVAL_DEFINE(X) HAS_VALUE(X)
#include <intrin.h>
#include <stdlib.h>
#define ror64 _rotr64
#define rol64 _rotl64
#if EVAL_DEFINE(__MACHINEARM64_X64(1))
#define umulhi64 __umulh
#endif
#if EVAL_DEFINE(__MACHINEX64(1))
// High 64 bits of the signed product a * b; the low half returned by _mul128 is discarded.
static inline uint64_t __imulhi64(int64_t a, int64_t b) {
int64_t hi;
_mul128(a, b, &hi);
return hi;
}
#define imulhi64 __imulhi64
#endif
#endif
#ifndef ror64
// Portable 64-bit rotate right, used when no compiler intrinsic was selected above.
// The count is reduced modulo 64. A count of 0 must return `a` unchanged:
// the naive form `a << (64 - b)` would shift by the full width, which is undefined.
static inline uint64_t __ror64(uint64_t a, int b) {
	const unsigned count = static_cast<unsigned>(b) & 63u;
	return (a >> count) | (a << ((64u - count) & 63u));
}
#define ror64 __ror64
#endif
#ifndef rol64
// Portable 64-bit rotate left, used when no compiler intrinsic was selected above.
// The count is reduced modulo 64. A count of 0 must return `a` unchanged:
// the naive form `a >> (64 - b)` would shift by the full width, which is undefined.
static inline uint64_t __rol64(uint64_t a, int b) {
	const unsigned count = static_cast<unsigned>(b) & 63u;
	return (a << count) | (a >> ((64u - count) & 63u));
}
#define rol64 __rol64
#endif
#ifndef sar64
#include <type_traits>
// Plain built-in right shift; usable in constant expressions so the compiler's
// behaviour for negative values can be probed at compile time.
constexpr int64_t builtintShr64(int64_t value, int shift) noexcept {
	return value >> shift;
}
// True when the built-in >> on a negative value replicates the sign bit.
struct UsesArithmeticShift : std::integral_constant<bool, builtintShr64(-1LL, 1) == -1LL> {
};
// Arithmetic (sign-propagating) right shift of a by b bits.
static inline int64_t __sar64(int64_t a, int b) {
	if (UsesArithmeticShift::value) {
		return builtintShr64(a, b);
	}
	// Emulation: shifting the complement of a negative value keeps the
	// vacated bits zero, and complementing again turns them into ones.
	if (a < 0) {
		return ~(~a >> b);
	}
	return a >> b;
}
#define sar64 __sar64
#endif
#ifndef umulhi64
#define LO(x) ((x)&0xffffffff)
#define HI(x) ((x)>>32)
// High 64 bits of the unsigned product a * b, computed from four 32x32-bit
// partial products. m1, m2 and m3 accumulate bits 32-63, 64-95 and 96-127 of the
// product; the upper half of each accumulator is the carry into the next one.
static inline uint64_t __umulhi64(uint64_t a, uint64_t b) {
	uint64_t ah = HI(a), al = LO(a);
	uint64_t bh = HI(b), bl = LO(b);
	uint64_t x00 = al * bl;
	uint64_t x01 = al * bh;
	uint64_t x10 = ah * bl;
	uint64_t x11 = ah * bh;
	uint64_t m1 = LO(x10) + LO(x01) + HI(x00);
	uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1);
	uint64_t m3 = HI(x11) + HI(m2);
	return (m3 << 32) + LO(m2);
}
#define umulhi64 __umulhi64
#endif
#ifndef imulhi64
// High 64 bits of the signed product a * b, derived from the unsigned high half:
// for each negative operand the other operand is subtracted. The correction is
// carried out in unsigned arithmetic because it is only valid modulo 2^64 and
// the intermediate values can exceed the int64_t range (e.g. a = -1, b = INT64_MIN).
static inline int64_t __imulhi64(int64_t a, int64_t b) {
	uint64_t hi = umulhi64(static_cast<uint64_t>(a), static_cast<uint64_t>(b));
	if (a < 0LL) hi -= static_cast<uint64_t>(b);
	if (b < 0LL) hi -= static_cast<uint64_t>(a);
	return static_cast<int64_t>(hi);
}
#define imulhi64 __imulhi64
#endif
// Software flush-to-zero: replaces a subnormal value with zero and returns every
// other value unchanged. The zero keeps the sign of the input so that results
// are bit-identical to hardware flush-to-zero, which also preserves the sign.
static double FlushDenormal(double x) {
	if (std::fpclassify(x) == FP_SUBNORMAL) {
		return std::copysign(0.0, x);
	}
	return x;
}
#define FTZ(x) FlushDenormal(x)
namespace RandomX {
	extern "C" {
		// Portable implementations of the VM instructions declared in Instructions.h.
		// Every instruction reads operands a and b and writes its result to c.
		// The *_32 variants operate on the 32-bit views of the operands and store
		// a 64-bit result with the upper half determined by the operation.

		void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 + b.u64;
		}

		// 32-bit sum wraps around and is zero-extended.
		void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 + b.u32;
		}

		void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 - b.u64;
		}

		// 32-bit difference wraps around and is zero-extended.
		void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 - b.u32;
		}

		// Low 64 bits of the product.
		void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 * b.u64;
		}

		// High 64 bits of the unsigned 128-bit product.
		void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = umulhi64(a.u64, b.u64);
		}

		// Full 64-bit product of the unsigned 32-bit views.
		void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = (uint64_t)a.u32 * b.u32;
		}

		// Full 64-bit product of the signed 32-bit views.
		void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.i64 = (int64_t)a.i32 * b.i32;
		}

		// High 64 bits of the signed 128-bit product.
		void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.i64 = imulhi64(a.i64, b.i64);
		}

		// Unsigned division by the 32-bit view of b; a zero divisor is replaced by 1.
		void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
		}

		// Signed division by the signed 32-bit view of b; a zero divisor is replaced by 1.
		// INT64_MIN / -1 does not fit in int64_t and would trap, so it is defined to
		// yield INT64_MIN. The guard tests b.i32 because that is the actual divisor:
		// the upper half of b plays no role in the division.
		void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			if (a.i64 == INT64_MIN && b.i32 == -1)
				c.i64 = INT64_MIN;
			else
				c.i64 = a.i64 / (b.i32 != 0 ? b.i32 : 1);
		}

		void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 & b.u64;
		}

		void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 & b.u32;
		}

		void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 | b.u64;
		}

		void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 | b.u32;
		}

		void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 ^ b.u64;
		}

		void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u32 ^ b.u32;
		}

		// Shifts and rotations use only the low 6 bits of b as the count.
		void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 << (b.u64 & 63);
		}

		void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = a.u64 >> (b.u64 & 63);
		}

		void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = sar64(a.i64, b.u64 & 63);
		}

		void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = rol64(a.u64, (b.u64 & 63));
		}

		void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.u64 = ror64(a.u64, (b.u64 & 63));
		}

		// Selects round-to-nearest as the initial rounding mode.
		void FPINIT() {
			fesetround(FE_TONEAREST);
		}

		// Floating point arithmetic: a is converted from its signed integer view,
		// b arrives as a double, subnormal results are flushed to zero.
		void FADD_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 + b);
		}

		void FSUB_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 - b);
		}

		void FMUL_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 * b);
		}

		void FDIV_64(convertible_t& a, double b, convertible_t& c) {
			c.f64 = FTZ((double)a.i64 / b);
		}

		// Square root of the absolute value of a; b is not used.
		void FABSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
			double d = fabs((double)a.i64);
			c.f64 = FTZ(sqrt(d));
		}

		// Stores a converted to double (under the rounding mode still in effect),
		// then switches the rounding mode according to the low two bits of a; b is not used.
		void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
			c.f64 = (double)a.i64;
			switch (a.u64 & 3) {
				case RoundDown:
					fesetround(FE_DOWNWARD);
					break;
				case RoundUp:
					fesetround(FE_UPWARD);
					break;
				case RoundToZero:
					fesetround(FE_TOWARDZERO);
					break;
				default:
					fesetround(FE_TONEAREST);
					break;
			}
		}
	}
}

View file

@ -1,276 +0,0 @@
;RandomX ALU + FPU test
;https://github.com/tevador/RandomX
;License: GPL v3
PUBLIC ADD_64
PUBLIC ADD_32
PUBLIC SUB_64
PUBLIC SUB_32
PUBLIC MUL_64
PUBLIC MULH_64
PUBLIC MUL_32
PUBLIC IMUL_32
PUBLIC IMULH_64
PUBLIC DIV_64
PUBLIC IDIV_64
PUBLIC AND_64
PUBLIC AND_32
PUBLIC OR_64
PUBLIC OR_32
PUBLIC XOR_64
PUBLIC XOR_32
PUBLIC SHL_64
PUBLIC SHR_64
PUBLIC SAR_64
PUBLIC ROL_64
PUBLIC ROR_64
PUBLIC FPINIT
PUBLIC FADD_64
PUBLIC FSUB_64
PUBLIC FMUL_64
PUBLIC FDIV_64
PUBLIC FABSQRT
PUBLIC FROUND
CONST SEGMENT
__XMMABS DB 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 0ffH, 07fH
CONST ENDS
.code
; ADD_64: [r8] = [rcx] + [rdx] as 64-bit integers.
; Arguments follow the Microsoft x64 convention: rcx = &a, rdx = &b, r8 = &c.
ADD_64 PROC
mov rax, QWORD PTR [rcx]
add rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_64 ENDP
; ADD_32: adds the low dwords of a and b; the 32-bit result in eax is
; zero-extended into rax before the 64-bit store to c.
ADD_32 PROC
mov eax, DWORD PTR [rcx]
add eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
ADD_32 ENDP
; SUB_64: [r8] = [rcx] - [rdx] as 64-bit integers.
SUB_64 PROC
mov rax, QWORD PTR [rcx]
sub rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_64 ENDP
; SUB_32: subtracts the low dwords of a and b; the 32-bit result is
; zero-extended before the 64-bit store to c.
SUB_32 PROC
mov eax, DWORD PTR [rcx]
sub eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
SUB_32 ENDP
; MUL_64: stores the low 64 bits of a * b (two-operand imul keeps only the low half).
MUL_64 PROC
mov rax, QWORD PTR [rcx]
imul rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
MUL_64 ENDP
; MULH_64: stores the high 64 bits of the unsigned product a * b.
; One-operand mul leaves the 128-bit product in rdx:rax; rdx (the pointer to b)
; is read before it is overwritten.
MULH_64 PROC
mov rax, QWORD PTR [rdx]
mul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
MULH_64 ENDP
; MUL_32: 64-bit product of the low dwords of a and b, both zero-extended
; by the 32-bit loads.
MUL_32 PROC
mov r9d, DWORD PTR [rcx]
mov eax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
MUL_32 ENDP
; IMUL_32: 64-bit product of the low dwords of a and b, both sign-extended (movsxd).
IMUL_32 PROC
movsxd r9, DWORD PTR [rcx]
movsxd rax, DWORD PTR [rdx]
imul r9, rax
mov QWORD PTR [r8], r9
ret 0
IMUL_32 ENDP
; IMULH_64: stores the high 64 bits of the signed product a * b.
; One-operand imul leaves the 128-bit product in rdx:rax.
IMULH_64 PROC
mov rax, QWORD PTR [rdx]
imul QWORD PTR [rcx]
mov QWORD PTR [r8], rdx
ret 0
IMULH_64 ENDP
; DIV_64: unsigned division of the 64-bit a by the low dword of b.
; A zero divisor is replaced by 1 (cmovne keeps the preset 1 when r9d is zero).
; rdx is cleared so the dividend rdx:rax equals a; only the quotient is stored.
DIV_64 PROC
mov r9d, DWORD PTR [rdx]
mov eax, 1
test r9d, r9d
cmovne eax, r9d
xor edx, edx
mov r9d, eax
mov rax, QWORD PTR [rcx]
div r9
mov QWORD PTR [r8], rax
ret 0
DIV_64 ENDP
; IDIV_64: signed division of the 64-bit a by the sign-extended low dword of b.
; A zero divisor is replaced by 1. INT64_MIN / -1 overflows and would raise a
; divide error, so that case stores INT64_MIN directly. The guard compares only
; the low dword of b because that dword is the actual divisor; comparing the
; whole qword would miss operands whose upper half is not all ones.
IDIV_64 PROC
	mov rax, QWORD PTR [rcx]
	mov rcx, -9223372036854775808
	cmp rax, rcx
	jne SHORT SAFE_IDIV_64
	cmp DWORD PTR [rdx], -1
	jne SHORT SAFE_IDIV_64
	mov QWORD PTR [r8], rcx
	ret 0
SAFE_IDIV_64:
	mov ecx, DWORD PTR [rdx]
	test ecx, ecx
	mov edx, 1				; mov does not affect the flags set by test
	cmovne edx, ecx
	movsxd rcx, edx
	cqo					; sign-extend rax into rdx:rax
	idiv rcx
	mov QWORD PTR [r8], rax
	ret 0
IDIV_64 ENDP
; AND_64: [r8] = [rcx] AND [rdx] on 64 bits.
AND_64 PROC
mov rax, QWORD PTR [rcx]
and rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_64 ENDP
; AND_32: bitwise AND of the low dwords; the result is zero-extended to 64 bits.
AND_32 PROC
mov eax, DWORD PTR [rcx]
and eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
AND_32 ENDP
; OR_64: [r8] = [rcx] OR [rdx] on 64 bits.
OR_64 PROC
mov rax, QWORD PTR [rcx]
or rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_64 ENDP
; OR_32: bitwise OR of the low dwords; the result is zero-extended to 64 bits.
OR_32 PROC
mov eax, DWORD PTR [rcx]
or eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
OR_32 ENDP
; XOR_64: [r8] = [rcx] XOR [rdx] on 64 bits.
XOR_64 PROC
mov rax, QWORD PTR [rcx]
xor rax, QWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_64 ENDP
; XOR_32: bitwise XOR of the low dwords; the result is zero-extended to 64 bits.
XOR_32 PROC
mov eax, DWORD PTR [rcx]
xor eax, DWORD PTR [rdx]
mov QWORD PTR [r8], rax
ret 0
XOR_32 ENDP
; SHL_64: logical left shift of a by the count in b.
; The count is taken from cl; for 64-bit operands the CPU uses only its low 6 bits.
SHL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shl rax, cl
mov QWORD PTR [r8], rax
ret 0
SHL_64 ENDP
; SHR_64: logical right shift of a by the count in b (low 6 bits of cl are used).
SHR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
shr rax, cl
mov QWORD PTR [r8], rax
ret 0
SHR_64 ENDP
; SAR_64: arithmetic (sign-propagating) right shift of a by the count in b
; (low 6 bits of cl are used).
SAR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
sar rax, cl
mov QWORD PTR [r8], rax
ret 0
SAR_64 ENDP
; ROL_64: rotates a left by the count in b (low 6 bits of cl are used).
ROL_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
rol rax, cl
mov QWORD PTR [r8], rax
ret 0
ROL_64 ENDP
; ROR_64: rotates a right by the count in b (low 6 bits of cl are used).
ROR_64 PROC
mov rax, QWORD PTR [rcx]
mov rcx, QWORD PTR [rdx]
ror rax, cl
mov QWORD PTR [r8], rax
ret 0
ROR_64 ENDP
; FPINIT: loads MXCSR with 40896 (0x9FC0): flush-to-zero (bit 15), all six
; exception masks (bits 7-12) and denormals-are-zero (bit 6) set, rounding
; field (bits 13-14) zero, i.e. round to nearest.
; [rsp+8] is used as scratch memory because ldmxcsr needs a memory operand;
; presumably this is the caller-provided home space of the x64 convention.
FPINIT PROC
mov DWORD PTR [rsp+8], 40896
ldmxcsr DWORD PTR [rsp+8]
ret 0
FPINIT ENDP
; FADD_64: c.f64 = (double)a.i64 + b. The double argument b arrives in xmm1;
; rcx = &a and r8 = &c as for the integer routines.
FADD_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
addsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FADD_64 ENDP
; FSUB_64: c.f64 = (double)a.i64 - b, with b passed in xmm1.
FSUB_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
subsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FSUB_64 ENDP
; FMUL_64: c.f64 = (double)a.i64 * b, with b passed in xmm1.
FMUL_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
mulsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FMUL_64 ENDP
; FDIV_64: c.f64 = (double)a.i64 / b, with b passed in xmm1.
FDIV_64 PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
divsd xmm0, xmm1
movsd QWORD PTR [r8], xmm0
ret 0
FDIV_64 ENDP
; FABSQRT: c.f64 = sqrt(|(double)a.i64|). The absolute value is obtained by
; masking off the sign bit with the __XMMABS constant; operand b is not read.
FABSQRT PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
andps xmm0, XMMWORD PTR __XMMABS
sqrtsd xmm1, xmm0
movsd QWORD PTR [r8], xmm1
ret 0
FABSQRT ENDP
; FROUND: stores (double)a.i64 to c, then reloads MXCSR with a new rounding mode.
; The low two bits of a are moved to bits 13-14 (mask 24576 = 0x6000), which is
; the MXCSR rounding field, and merged with the base value 40896 used by FPINIT.
; The conversion happens before the switch, so it uses the previous rounding mode.
FROUND PROC
cvtsi2sd xmm0, QWORD PTR [rcx]
movsd QWORD PTR [r8], xmm0
mov rax, QWORD PTR [rcx]
shl rax, 13
and eax, 24576
or eax, 40896
mov DWORD PTR [rsp+8], eax
ldmxcsr DWORD PTR [rsp+8]
ret 0
FROUND ENDP
END

View file

@ -1,283 +0,0 @@
//RandomX ALU + FPU test
//https://github.com/tevador/RandomX
//License: GPL v3
#include <iostream>
#include <iomanip>
#include <limits>
#include "Instructions.h"
using namespace RandomX;
typedef void(*VmOperation)(convertible_t&, convertible_t&, convertible_t&);
double rxRound(uint32_t mode, int64_t x, int64_t y, VmOperation op) {
convertible_t a, b, c;
a.u64 = mode;
FROUND(a, b, c);
a.i64 = x;
b.i64 = y;
op(a, b, c);
return c.f64;
}
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
// Test helpers: load va and vb into the locals `a` and `b` of the enclosing
// test case and run INST(a, b, c). The locals a, b and c must already be
// declared as convertible_t. RX_EXECUTE_U64 assigns the unsigned 64-bit views.
#define RX_EXECUTE_U64(va, vb, INST) do { \
a.u64 = va; \
b.u64 = vb; \
INST(a, b, c); \
} while(false)
// Same as RX_EXECUTE_U64 but assigns the signed 64-bit views.
#define RX_EXECUTE_I64(va, vb, INST) do { \
a.i64 = va; \
b.i64 = vb; \
INST(a, b, c); \
} while(false)
// ADD_64: a carry out of the low 32 bits is kept; a carry out of bit 63 wraps to zero.
TEST_CASE("Integer addition (64-bit)", "[ADD_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_64);
REQUIRE(c.u64 == 0x100000000);
RX_EXECUTE_U64(0x8000000000000000, 0x8000000000000000, ADD_64);
REQUIRE(c.u64 == 0x0);
}
// ADD_32: the sum wraps at 32 bits and the upper halves of the operands are ignored.
TEST_CASE("Integer addition (32-bit)", "[ADD_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xFFFFFFFF, 0x1, ADD_32);
REQUIRE(c.u64 == 0);
RX_EXECUTE_U64(0xFF00000000000001, 0x0000000100000001, ADD_32);
REQUIRE(c.u64 == 2);
}
// SUB_64: a negative difference wraps around modulo 2^64.
TEST_CASE("Integer subtraction (64-bit)", "[SUB_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_64);
REQUIRE(c.u64 == 0xFFFFFFFF00000002);
}
// SUB_32: the difference wraps modulo 2^32 and is zero-extended to 64 bits.
TEST_CASE("Integer subtraction (32-bit)", "[SUB_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(1, 0xFFFFFFFF, SUB_32);
REQUIRE(c.u64 == 2);
}
// MUL_64: low 64 bits of the product of a fixed operand pair
// (the same pair is reused by the other multiplication tests).
TEST_CASE("Unsigned multiplication (64-bit, low half)", "[MUL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_64);
REQUIRE(c.u64 == 0x28723424A9108E51);
}
// MULH_64: high 64 bits of the unsigned 128-bit product of the same operand pair.
TEST_CASE("Unsigned multiplication (64-bit, high half)", "[MULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MULH_64);
REQUIRE(c.u64 == 0xB4676D31D2B34883);
}
// MUL_32: full 64-bit product of the operands' low 32 bits, treated as unsigned.
TEST_CASE("Unsigned multiplication (32-bit x 32-bit -> 64-bit)", "[MUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, MUL_32);
REQUIRE(c.u64 == 0xB001AA5FA9108E51);
}
// IMUL_32: full 64-bit product of the operands' low 32 bits, treated as signed
// (both low halves have their top bit set, so both factors are negative).
TEST_CASE("Signed multiplication (32-bit x 32-bit -> 64-bit)", "[IMUL_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMUL_32);
REQUIRE(c.u64 == 0x03EBA0C1A9108E51);
}
// IMULH_64: high 64 bits of the signed 128-bit product; both operands are
// negative when read as signed, so the result differs from MULH_64.
TEST_CASE("Signed multiplication (64-bit, high half)", "[IMULH_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xBC550E96BA88A72B, 0xF5391FA9F18D6273, IMULH_64);
REQUIRE(c.u64 == 0x02D93EF1269D3EE5);
}
// DIV_64: covers a regular division, a zero divisor (treated as 1, so the
// dividend is returned unchanged) and a divisor wider than 32 bits (only its
// low 32 bits are used).
TEST_CASE("Unsigned division (64-bit / 32-bit -> 32-bit)", "[DIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, DIV_64);
REQUIRE(c.u64 == 2911087818);
RX_EXECUTE_U64(8774217225983458895, 0, DIV_64);
REQUIRE(c.u64 == 8774217225983458895);
RX_EXECUTE_U64(3014068202, 8774217225983458895, DIV_64);
REQUIRE(c.u64 == 2);
}
// IDIV_64: covers a divisor that is negative as a signed 32-bit value
// (3014068202 = 0xB3A707EA), a zero divisor (treated as 1), the overflowing
// case INT64_MIN / -1 (defined to yield INT64_MIN) and a negative dividend
// with a divisor wider than 32 bits.
TEST_CASE("Signed division (64-bit / 32-bit -> 32-bit)", "[IDIV_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(8774217225983458895, 3014068202, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFE67B4994E);
RX_EXECUTE_U64(8774217225983458895, 0, IDIV_64);
REQUIRE(c.u64 == 8774217225983458895);
RX_EXECUTE_U64(0x8000000000000000, 0xFFFFFFFFFFFFFFFF, IDIV_64);
REQUIRE(c.u64 == 0x8000000000000000);
RX_EXECUTE_U64(0xFFFFFFFFB3A707EA, 8774217225983458895, IDIV_64);
REQUIRE(c.u64 == 0xFFFFFFFFFFFFFFFF);
}
// AND_64: the patterns 1100 and 1010 repeated over 64 bits combine to 1000.
TEST_CASE("Bitwise AND (64-bit)", "[AND_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_64);
REQUIRE(c.u64 == 0x8888888888888888);
}
// AND_32: same patterns as the 64-bit test; the upper 32 bits of the result are zero.
TEST_CASE("Bitwise AND (32-bit)", "[AND_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA, AND_32);
REQUIRE(c.u64 == 0x88888888);
}
// OR_64: the patterns 0100 and 1010 repeated over 64 bits combine to 1110.
TEST_CASE("Bitwise OR (64-bit)", "[OR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_64);
REQUIRE(c.u64 == 0xEEEEEEEEEEEEEEEE);
}
// OR_32: same patterns as the 64-bit test; the upper 32 bits of the result are zero.
TEST_CASE("Bitwise OR (32-bit)", "[OR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x4444444444444444, 0xAAAAAAAAAAAAAAAA, OR_32);
REQUIRE(c.u64 == 0xEEEEEEEE);
}
// XOR_64: the patterns 1000 and 1010 repeated over 64 bits combine to 0010.
TEST_CASE("Bitwise XOR (64-bit)", "[XOR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_64);
REQUIRE(c.u64 == 0x2222222222222222);
}
// XOR_32: same patterns as the 64-bit test; the upper 32 bits of the result are zero.
TEST_CASE("Bitwise XOR (32-bit)", "[XOR_32]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x8888888888888888, 0xAAAAAAAAAAAAAAAA, XOR_32);
REQUIRE(c.u64 == 0x22222222);
}
// SHL_64: the second vector uses a count far above 63; only its low 6 bits
// (here 33) take effect. The third vector shifts the top bit out.
TEST_CASE("Logical left shift (64-bit)", "[SHL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHL_64);
REQUIRE(c.u64 == 0x30000000000000);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHL_64);
REQUIRE(c.u64 == 6978065200108797952);
RX_EXECUTE_U64(0x8000000000000000, 1, SHL_64);
REQUIRE(c.u64 == 0);
}
// SHR_64: zeros are shifted in from the left even when the top bit is set;
// the oversized count in the second vector is reduced to its low 6 bits (33).
TEST_CASE("Logical right shift (64-bit)", "[SHR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, SHR_64);
REQUIRE(c.u64 == 0);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, SHR_64);
REQUIRE(c.u64 == 110985711);
RX_EXECUTE_U64(0x8000000000000000, 1, SHR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
// SAR_64: the sign bit is replicated, so negative values round toward negative
// infinity (-9 >> 2 == -3). The oversized count in the last vector is reduced
// to its low 6 bits (62).
TEST_CASE("Arithmetic right shift (64-bit)", "[SAR_64]") {
convertible_t a, b, c;
RX_EXECUTE_I64(-9, 2, SAR_64);
REQUIRE(c.i64 == -3);
RX_EXECUTE_I64(INT64_MIN, 63, SAR_64);
REQUIRE(c.i64 == -1);
RX_EXECUTE_I64(INT64_MAX, 163768499474606398, SAR_64);
REQUIRE(c.i64 == 1);
}
// ROL_64: bits shifted out on the left re-enter on the right (third vector);
// the oversized count in the second vector is reduced to its low 6 bits (33).
TEST_CASE("Circular left shift (64-bit)", "[ROL_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROL_64);
REQUIRE(c.u64 == 0x30000000000000);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROL_64);
REQUIRE(c.u64 == 6978065200552740799);
RX_EXECUTE_U64(0x8000000000000000, 1, ROL_64);
REQUIRE(c.u64 == 1);
}
// ROR_64: bits shifted out on the right re-enter on the left (first vector);
// the oversized count in the second vector is reduced to its low 6 bits (33).
TEST_CASE("Circular right shift (64-bit)", "[ROR_64]") {
convertible_t a, b, c;
RX_EXECUTE_U64(0x3, 52, ROR_64);
REQUIRE(c.u64 == 12288);
RX_EXECUTE_U64(953360005391419562, 4569451684712230561, ROR_64);
REQUIRE(c.u64 == 0xD835C455069D81EF);
RX_EXECUTE_U64(0x8000000000000000, 1, ROR_64);
REQUIRE(c.u64 == 0x4000000000000000);
}
// FTZ: 1 divided by the largest finite double is a subnormal number, which the
// implementation must replace by zero.
TEST_CASE("Denormal numbers are flushed to zero", "[FTZ]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
FDIV_64(a, std::numeric_limits<double>::max(), c);
REQUIRE(c.f64 == 0.0);
}
// FPU: division by zero must produce a correctly signed infinity, and addition,
// subtraction, division and square root must each honour the four rounding
// modes. Every group below runs one operation on the same operands under
// RoundToNearest, RoundDown, RoundUp and RoundToZero via rxRound.
TEST_CASE("IEEE-754 compliance", "[FPU]") {
FPINIT();
convertible_t a, c;
a.i64 = 1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == std::numeric_limits<double>::infinity());
a.i64 = -1;
FDIV_64(a, 0, c);
REQUIRE(c.f64 == -std::numeric_limits<double>::infinity());
REQUIRE(rxRound(RoundToNearest, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundDown, 33073499373184121, -37713516328519941, &FADD) == -4640016955335824.0);
REQUIRE(rxRound(RoundUp, 33073499373184121, -37713516328519941, &FADD) == -4640016955335812.0);
REQUIRE(rxRound(RoundToZero, 33073499373184121, -37713516328519941, &FADD) == -4640016955335816.0);
REQUIRE(rxRound(RoundToNearest, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107858e+18);
REQUIRE(rxRound(RoundDown, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107868e+18);
REQUIRE(rxRound(RoundUp, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToZero, -8570200862721897289, -1111111111111111119, &FSUB) == -7.4590897516107848e+18);
REQUIRE(rxRound(RoundToNearest, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundDown, 1, -10, &FDIV) == -0.10000000000000001);
REQUIRE(rxRound(RoundUp, 1, -10, &FDIV) == -0.099999999999999992);
REQUIRE(rxRound(RoundToZero, 1, -10, &FDIV) == -0.099999999999999992);
REQUIRE(rxRound(RoundToNearest, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundDown, -2, 0, &FABSQRT) == 1.4142135623730949);
REQUIRE(rxRound(RoundUp, -2, 0, &FABSQRT) == 1.4142135623730951);
REQUIRE(rxRound(RoundToZero, -2, 0, &FABSQRT) == 1.4142135623730949);
}

File diff suppressed because it is too large Load diff

View file

@ -1,10 +0,0 @@
# Builds the ALU/FPU test binary from the portable instruction implementation.
# NOTE(review): -O0 presumably keeps the compiler from folding floating point
# operations across rounding mode changes - confirm before raising it.
CXXFLAGS=-Wall -std=c++17 -O0

# `clean` names no file; declaring it phony keeps it working even if a file
# called "clean" appears in the directory.
.PHONY: clean

TestAluFpu: TestAluFpu.o InstructionsPortable.o
	$(CXX) TestAluFpu.o InstructionsPortable.o -o $@

# Both objects are compiled by the implicit rule; listing the shared header
# makes them rebuild when the instruction declarations change.
TestAluFpu.o: TestAluFpu.cpp Instructions.h

InstructionsPortable.o: InstructionsPortable.cpp Instructions.h

clean:
	rm -f TestAluFpu TestAluFpu.o InstructionsPortable.o