#ifndef PERF_H
# define PERF_H 1

#include <inttypes.h>
#include <stdlib.h>

/* Pointer-sized signed integer.  It is the return type of every
   generated test function and the type of the scratch variables
   declared by NTLS_TEST_MANYVARS_BEFORE.  */
typedef intptr_t ntr;

/* Flags describing one test variant.  Every property is a pair of
   alternatives sharing a single bit: the first alternative is the
   zero value, the second one sets the bit.  */
typedef enum ntls_test_type {
  /* Bit 0: test the expression itself, or its address.  */
  NTT_LOAD = 0,
  NTT_ADDR = 1 << 0,

  /* Bit 1: selected by the 'S' / 'D' character of the test name.  */
  NTT_TVAL = 0,
  NTT_LTVAL = 1 << 1,

  /* Bit 2: the test times itself, or the caller times it.  */
  NTT_SELF_TIMING = 0,
  NTT_CALLER_TIMING = 1 << 2,

  /* Bit 3: plain test, or test surrounded by the many-variables code.  */
  NTT_NO_OVERHEAD = 0,
  NTT_VARS_OVERHEAD = 1 << 3,
} ntt;

/* Character of a test name that tells address tests from load tests:
   index 2, where NTLS_TESTS_AL puts its `A' or `L' prefix.  NAME is
   parenthesized so that a pointer expression such as `p + 1' is
   indexed as a whole.  */
#define NTT_ADDRLOAD_CHAR(name) ((name)[2])
/* Map that character to NTT_LOAD ('L') or NTT_ADDR ('A'); any other
   character calls abort ().  NAME may be evaluated twice.  */
#define NTT_ADDRLOAD(name)				\
  (NTT_ADDRLOAD_CHAR (name) == 'L' ? NTT_LOAD		\
   : NTT_ADDRLOAD_CHAR (name) == 'A' ? NTT_ADDR		\
   : (abort (), 0))

/* Character of a test name that selects the NTT_TVAL / NTT_LTVAL
   flag: index 3, i.e. the first character following the prefixes
   added by NTLS_TESTS_AL, NTLS_TESTS_IX and NTLS_TESTS_NV.  NAME is
   parenthesized so that a pointer expression such as `p + 1' is
   indexed as a whole.  */
#define NTT_TVAL_CHAR(name) ((name)[3])
/* Map that character to NTT_TVAL ('S') or NTT_LTVAL ('D'); any other
   character calls abort ().  NAME may be evaluated twice.  */
#define NTT_TVAL(name)					\
  (NTT_TVAL_CHAR (name) == 'S' ? NTT_TVAL		\
   : NTT_TVAL_CHAR (name) == 'D' ? NTT_LTVAL		\
   : (abort (), 0))

/* Character of a test name that tells self-timing tests from
   caller-timed ones: index 1, where NTLS_TESTS_IX puts its `I' or `X'
   prefix.  NAME is parenthesized so that a pointer expression such as
   `p + 1' is indexed as a whole.  */
#define NTT_TIMING_CHAR(name) ((name)[1])
/* Map that character to NTT_SELF_TIMING ('I') or NTT_CALLER_TIMING
   ('X'); any other character calls abort ().  NAME may be evaluated
   twice.  */
#define NTT_TIMING(name)				\
  (NTT_TIMING_CHAR (name) == 'I' ? NTT_SELF_TIMING	\
   : NTT_TIMING_CHAR (name) == 'X' ? NTT_CALLER_TIMING	\
   : (abort (), 0))

/* Character of a test name that tells plain tests from those wrapped
   in the many-variables code: index 0, where NTLS_TESTS_NV puts its
   `N' or `V' prefix.  NAME is parenthesized so that a pointer
   expression such as `p + 1' is indexed as a whole.  */
#define NTT_OVERHEAD_CHAR(name) ((name)[0])
/* Map that character to NTT_NO_OVERHEAD ('N') or NTT_VARS_OVERHEAD
   ('V'); any other character calls abort ().  NAME may be evaluated
   twice.  */
#define NTT_OVERHEAD(name)				\
  (NTT_OVERHEAD_CHAR (name) == 'N' ? NTT_NO_OVERHEAD	\
   : NTT_OVERHEAD_CHAR (name) == 'V' ? NTT_VARS_OVERHEAD \
   : (abort (), 0))

/* Combine the address/load, TVAL/LTVAL and timing flags decoded from
   a test name into a single ntt value.  Any unrecognized character in
   positions 1..3 of NAME ends in abort ().
   NOTE(review): NTT_OVERHEAD (name) is not part of the result, so the
   `N' and `V' variants of a test yield the same value -- confirm with
   the users of this macro that this is intended.  */
#define NTT(name) (NTT_ADDRLOAD (name) | NTT_TVAL (name) | NTT_TIMING (name))

/* Function type of a test generated by NTLS_TEST_SELF: it returns the
   value of the tested expression and, when DIFF is non-null, stores
   the measured interval through it.  hp_timing_t is not declared in
   this header; it must be visible before this point.  */
typedef ntr (self_timing_t)(hp_timing_t *diff);
/* Function type of a test generated by NTLS_TEST_CALLER: it takes no
   arguments and returns the value of the tested expression.  */
typedef ntr (caller_timing_t)(void);

/* Pointer to a test function, viewed through either of the two test
   signatures.  Nothing in the union records which member is the valid
   one; presumably users decide from the test name (see NTT_TIMING) --
   TODO confirm against the callers.  */
typedef union test_function_type {
  self_timing_t *self_timing;	/* Test that does its own timing.  */
  caller_timing_t *caller_timing;	/* Test timed by its caller.  */
} tft;

/* Descriptor of one test.  NTLS_TEST creates one of these for every
   generated function.  */
typedef struct ntls_perf_test {
  /* Test name.  NTLS_TEST stores the stringified function suffix here,
     i.e. a string literal that must not be written to.  */
  char *name;
  /* The function that runs the test.  */
  tft func;
} npt;

/* Define one test function and register it.

   FUNC    suffix of the generated identifiers; also the test name.
   PROTO   parenthesized parameter list of the generated function.
   EXPR    expression under test; its value is what the function
	   returns.
   BEFORE  statements placed ahead of the evaluation of EXPR (may be
	   empty).
   AFTER   statements placed after it (may be empty).

   The expansion defines:
   - static ntr f_FUNC PROTO, the test function itself;
   - static npt test_desc_FUNC, holding the string "FUNC" and f_FUNC
     cast to the tft union (a GNU C extension);
   - a pointer test_FUNC to that descriptor, placed in the
     ".ntls.tests" section and marked `used' so that it is emitted
     even though nothing refers to it by name.

   The expansion does not end in a semicolon; the user supplies it.  */

/* We force ret into the register that holds the return value, such
   that the generated code for internal timing is as similar as
   possible to what we'd get without internal timing.  This avoids
   pipeline stalls on AMD64, where using =r instead of =a in the asm
   statement below causes GCC to copy the value returned by the TLS
   descriptor function to %esi and then add %fs:0 to it, and then
   immediately run rdtsc, a sequence that introduces a big delay.  In
   practice, this would pretty much never happen, since we can expect
   the compiler to do a good job of register allocation and
   scheduling.  */
#define NTLS_TEST(func, proto, expr, before, after)			\
  static ntr f_ ## func proto {						\
    ntr ret;								\
    before;								\
    asm volatile ("" : "=a" (ret) : "0" (expr));			\
    after;								\
    return ret;								\
  }									\
  static npt test_desc_ ## func = { #func, (tft)(f_ ## func) };		\
  static npt __attribute((section (".ntls.tests"), used))		\
    *test_ ## func = & test_desc_ ## func

/* Create one variable for each general-purpose register available for
   use, leaving one alone to hold the return value.

   NTLS_TEST_MANYVARS_BEFORE declares the variables and passes each of
   them to an asm statement as a read-write register operand ("+r");
   NTLS_TEST_MANYVARS_AFTER passes all of them to a second asm
   statement as register inputs ("r").  Whatever is placed between the
   two therefore runs while the compiler has to keep every one of
   these values available.  The variables are deliberately left
   uninitialized: only their presence matters, not their contents.

   Each variable has its own declaration, separated by semicolons.  A
   top-level comma (as in `ntr a1, a2;') would be taken as a macro
   argument separator when this text is forwarded to NTLS_TEST; the
   commas of the asm operand lists are protected by the enclosing
   parentheses.

   Both macros end in a semicolon and must be used together, in the
   same scope.  x86-64 gets 14 variables, any other target 5.  */
#if __x86_64__
# define NTLS_TEST_MANYVARS_BEFORE					\
  ntr a1; ntr a2; ntr a3; ntr a4; ntr a5; ntr a6; ntr a7;		\
  ntr a8; ntr a9; ntr a10; ntr a11; ntr a12; ntr a13; ntr a14;		\
  asm volatile (" " : "+r" (a1), "+r" (a2), "+r" (a3),			\
		"+r" (a4), "+r" (a5), "+r" (a6), "+r" (a7),		\
		"+r" (a8), "+r" (a9), "+r" (a10), "+r" (a11),		\
		"+r" (a12), "+r" (a13), "+r" (a14));
# define NTLS_TEST_MANYVARS_AFTER					\
  asm volatile (" " : : "r" (a1), "r" (a2), "r" (a3),			\
		"r" (a4), "r" (a5), "r" (a6), "r" (a7),			\
		"r" (a8), "r" (a9), "r" (a10), "r" (a11),		\
		"r" (a12), "r" (a13), "r" (a14));
#else
/* NOTE(review): the count of 5 looks chosen for 32-bit x86; confirm
   before building for any other architecture.  */
# define NTLS_TEST_MANYVARS_BEFORE					\
  ntr a1; ntr a2; ntr a3; ntr a4; ntr a5;				\
  asm volatile (" " : "+r" (a1), "+r" (a2), "+r" (a3),			\
		"+r" (a4), "+r" (a5));
# define NTLS_TEST_MANYVARS_AFTER					\
  asm volatile (" " : : "r" (a1), "r" (a2), "r" (a3),			\
		"r" (a4), "r" (a5));
#endif

/* Define a test that measures itself.  The generated function has the
   self_timing_t signature: it takes `hp_timing_t *diff', returns the
   value of EXPR and, when DIFF is non-null, stores the interval
   between the two HP_TIMING_NOW readings through it.

   The first reading is taken ahead of CODE_BEFORE and the second one
   after CODE_AFTER, so both are inside the measured interval.

   CODE_AFTER is followed directly by a declaration, with no separator:
   it must be empty or end in a semicolon (NTLS_TEST_MANYVARS_AFTER
   does).  The generated function declares locals named `before' and
   `after'; EXPR and the code arguments must not use those names for
   anything else.

   hp_timing_t, HP_TIMING_NOW and HP_TIMING_DIFF are not defined in
   this header; they must be visible where the macro is used.  */
#define NTLS_TEST_SELF(name, expr, code_before, code_after)		\
  NTLS_TEST (name, (hp_timing_t *diff), (expr),			\
	    hp_timing_t before; HP_TIMING_NOW (before);			\
	    code_before,						\
	    code_after							\
	    hp_timing_t after; HP_TIMING_NOW (after);			\
	    if (diff) HP_TIMING_DIFF (*diff, before, after))

/* Define a test that does no timing of its own.  The generated
   function takes no arguments and returns the value of EXPR;
   CODE_BEFORE and CODE_AFTER are handed to NTLS_TEST unchanged.  */
#define NTLS_TEST_CALLER(func, expr, code_before, code_after)	\
  NTLS_TEST (func, (void), (expr), code_before, code_after)

/* Instantiate NTLS_TEST_<TIMING> (TIMING is SELF or CALLER) twice for
   VALUE: once as N<TAG>, with nothing around the expression, and once
   as V<TAG>, with the expression placed between
   NTLS_TEST_MANYVARS_BEFORE and NTLS_TEST_MANYVARS_AFTER.  */
#define NTLS_TESTS_NV(timing, tag, value)		\
  NTLS_TEST_ ## timing (N ## tag, (value), ,);		\
  NTLS_TEST_ ## timing (V ## tag, (value),		\
			NTLS_TEST_MANYVARS_BEFORE,	\
			NTLS_TEST_MANYVARS_AFTER)

/* Instantiate both timing flavours of a test for VALUE: the
   self-timing ones under the name I<TAG> and the caller-timed ones
   under X<TAG>.  Each flavour goes through NTLS_TESTS_NV.  */
#define NTLS_TESTS_IX(tag, value)		\
  NTLS_TESTS_NV (SELF, I ## tag, value);	\
  NTLS_TESTS_NV (CALLER, X ## tag, value)

/* Define the whole family of tests for EXPR: the A<NAME> tests use
   the address of the expression, &(EXPR), and the L<NAME> tests use
   EXPR itself.  Each half goes through NTLS_TESTS_IX.  The expansion
   does not end in a semicolon; the user supplies it.

   The last line carries no line continuation, so text added after
   the macro cannot be absorbed into its definition.  */
#define NTLS_TESTS_AL(name, expr)		\
  NTLS_TESTS_IX (A ## name, &(expr));	\
  NTLS_TESTS_IX (L ## name, expr)

#endif
