#include <par2/gf16/gf16_global.h>
#include <par2/osinfo/platform.h>
#include <par2/gf16/gf_add_common.h>

/*
 * Template parameters for the AVX2 (256-bit) build of the shared x86 code in
 * gf_add_x86.h. They are defined immediately before the include and removed
 * right after it so they do not leak into the rest of this file.
 *
 *   _mword     - vector register type (__m256i)
 *   _MM(f)     - expands to the intrinsic name _mm256_<f>
 *   _MMI(f)    - expands to the intrinsic name _mm256_<f>_si256
 *   _FNSUFFIX  - suffix token (_avx2); presumably appended to the names the
 *                header generates, e.g. gf_add_x_avx2 used further down
 *                (NOTE(review): confirm against gf_add_x86.h)
 *   _AVAILABLE - defined only when the compiler targets AVX2 (__AVX2__)
 */
#define _mword __m256i
#define _MM(f) _mm256_ ## f
#define _MMI(f) _mm256_ ## f ## _si256
#define _FNSUFFIX _avx2
#ifdef __AVX2__
# define _AVAILABLE
#endif

#include <par2/gf16/gf_add_x86.h>

/* Drop the template parameters again; _AVAILABLE may never have been set. */
#ifdef _AVAILABLE
# undef _AVAILABLE
#endif
#undef _FNSUFFIX
#undef _MMI
#undef _MM
#undef _mword


#ifdef PARPAR_INCLUDE_BASIC_OPS
/*
 * AVX2 entry point for the multi-region add, compiled only when
 * PARPAR_INCLUDE_BASIC_OPS is defined.
 *
 * With AVX2 enabled, the work is delegated to gf16_muladd_multi() using the
 * gf_add_x_avx2 kernel, and the upper halves of the YMM registers are cleared
 * before returning. Without AVX2 the symbol still exists but does nothing.
 *
 * regions, offset, dst, src and len are forwarded unchanged.
 * NOTE(review): the (void*)1 "scratch" argument and the literal 6 are
 * interpreted by gf16_muladd_multi()/gf_add_x_avx2, which are not visible
 * here - presumably a vector stride of 1 and 6 regions per kernel call;
 * confirm against their definitions.
 */
void gf_add_multi_avx2(
	unsigned regions, size_t offset,
	void *HEDLEY_RESTRICT dst, const void* const*HEDLEY_RESTRICT src,
	size_t len
) {
#ifndef __AVX2__
	/* Built without AVX2: keep the compiler quiet about the unused parameters. */
	UNUSED(regions);
	UNUSED(offset);
	UNUSED(dst);
	UNUSED(src);
	UNUSED(len);
#else
	gf16_muladd_multi(
		(void*)1, &gf_add_x_avx2, 6,
		regions, offset, dst, src, len,
		NULL
	);
	/* Clear the upper YMM halves before returning to the caller. */
	_mm256_zeroupper();
#endif
}
#endif

/*
 * PACKED_FUNC(vs, il, it) generates the AVX2 "packed" add entry points named
 * gf_add_multi_packed_v<vs>i<il>_avx2 and gf_add_multi_packpf_v<vs>i<il>_avx2.
 *
 *   - Without AVX2, generation is handed to PACKED_STUB(avx2, vs, il, it).
 *   - With AVX2 and PARPAR_INCLUDE_BASIC_OPS, both the plain and the
 *     prefetching (packpf) variants are emitted.
 *   - With AVX2 only, just the prefetching variant is emitted.
 *
 * Each generated function forwards to gf16_muladd_multi_packed() or
 * gf16_muladd_multi_packpf() with the gf_add_x_avx2 kernel, passing vs as the
 * scratch pointer and sizeof(__m256i)*vs as a size argument, then calls
 * _mm256_zeroupper(). The packpf variant additionally passes vs>1 and the two
 * prefetch pointers.
 * NOTE(review): the exact meaning of il, it and the vs>1 flag is defined by
 * the gf16_muladd_multi_pack* helpers, which are not visible here.
 */
#if !defined(__AVX2__)
# define PACKED_FUNC(vs, il, it) PACKED_STUB(avx2, vs, il, it)
#elif defined(PARPAR_INCLUDE_BASIC_OPS)
# define PACKED_FUNC(vs, il, it) \
void gf_add_multi_packed_v##vs##i##il##_avx2( \
	unsigned packedRegions, unsigned regions, \
	void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len \
) { \
	gf16_muladd_multi_packed( \
		(void*)vs, &gf_add_x_avx2, il, it, \
		packedRegions, regions, dst, src, len, \
		sizeof(__m256i)*vs, NULL \
	); \
	_mm256_zeroupper(); \
} \
void gf_add_multi_packpf_v##vs##i##il##_avx2( \
	unsigned packedRegions, unsigned regions, \
	void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, \
	const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut \
) { \
	gf16_muladd_multi_packpf( \
		(void*)vs, &gf_add_x_avx2, il, it, \
		packedRegions, regions, dst, src, len, \
		sizeof(__m256i)*vs, NULL, vs>1, prefetchIn, prefetchOut \
	); \
	_mm256_zeroupper(); \
}
#else
# define PACKED_FUNC(vs, il, it) \
void gf_add_multi_packpf_v##vs##i##il##_avx2( \
	unsigned packedRegions, unsigned regions, \
	void *HEDLEY_RESTRICT dst, const void* HEDLEY_RESTRICT src, size_t len, \
	const void* HEDLEY_RESTRICT prefetchIn, const void* HEDLEY_RESTRICT prefetchOut \
) { \
	gf16_muladd_multi_packpf( \
		(void*)vs, &gf_add_x_avx2, il, it, \
		packedRegions, regions, dst, src, len, \
		sizeof(__m256i)*vs, NULL, vs>1, prefetchIn, prefetchOut \
	); \
	_mm256_zeroupper(); \
}
#endif

/*
 * Instantiate the packed add entry points for AVX2.
 *
 * PACKED_FUNC(vs, il, it) arguments follow the macro defined above; the
 * generated names carry vs and il (e.g. ..._v2i3_avx2 for PACKED_FUNC(2, 3, 12)).
 * PACKED_FUNC_NOTSLIM is not defined in this file (presumably it comes from
 * gf_add_common.h) and takes the ISA suffix as its first argument.
 * NOTE(review): presumably it emits the same functions only in non-"slim"
 * builds - confirm against its definition.
 */
PACKED_FUNC_NOTSLIM(avx2, 1, 2, 8)
#ifdef PLATFORM_AMD64
/* Variants emitted only when PLATFORM_AMD64 is defined. */
PACKED_FUNC_NOTSLIM(avx2, 1, 6, 18)
PACKED_FUNC(16, 1, 6)
#else
/* Variant emitted only when PLATFORM_AMD64 is not defined. */
PACKED_FUNC_NOTSLIM(avx2, 1, 1, 6)
#endif
PACKED_FUNC(2, 1, 6)
PACKED_FUNC(2, 3, 12)

/* PACKED_FUNC is local to this file; remove it once all variants are emitted. */
#undef PACKED_FUNC
