#include "hip/hip_runtime.h"
/*
    -- MAGMA (version 2.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date

       @author Mark Gates
       @author Tingxing Dong
       @author Azzam Haidar

       @generated from magmablas_hip/zgemv_fermi.hip.cpp, normal z -> s, Fri Sep 19 13:52:35 2025
*/
#include "magma_internal.h"
#include "commonblas_s.h"
#include "magma_templates.h"

#define PRECISION_s

#include "gemv_template_device.hip.hpp"

#include "gemv_config/gemvn_param.h"
#include "gemv_config/gemvt_param.h"

// Token-pastes s, the literal _V_, and v into one identifier, e.g.
// version(N, 106) -> N_V_106 and version(T, 189) -> T_V_189 (the two uses in
// magmablas_sgemv below). The resulting identifier is used as the template
// argument list of the host drivers; it is presumably a macro supplied by the
// gemv_config/gemv{n,t}_param.h headers that expands to the
// DIM_X, DIM_Y, TILE_SIZE values -- confirm against those headers.
#define version(s,v) s ## _V_ ## v


/******************************************************************************/
// NoTrans kernel
// Device entry point used by magmablas_sgemv for trans == MagmaNoTrans.
// The body is a single forwarding call to gemvn_template_device, instantiated
// for float with the compile-time parameters DIM_X, DIM_Y and TILE_SIZE.
// The host driver sgemvn_template_fermi launches this kernel with
// DIM_X x DIM_Y threads per block and ceildiv(m, TILE_SIZE) blocks along x.
// The forwarding call is compiled only for HIP builds or when
// __CUDA_ARCH__ >= 200; in any other compilation pass the kernel is empty.
template<const int DIM_X, const int DIM_Y, const int TILE_SIZE>
__global__ void
sgemvn_template_kernel_fermi(
    int m, int n, float alpha,
    const float * __restrict__ A, int lda,
    const float * __restrict__ x, int incx, float beta,
    float       * __restrict__ y, int incy)
{
#if defined(MAGMA_HAVE_HIP) || (__CUDA_ARCH__ >= 200)
    gemvn_template_device<float, DIM_X, DIM_Y, TILE_SIZE>(
        m, n, alpha,
        A, lda,
        x, incx,
        beta,
        y, incy);
#endif /* defined(MAGMA_HAVE_HIP) || (__CUDA_ARCH__ >= 200) */
}


/******************************************************************************/
// Trans/ConjTrans kernel
// Device entry point used by magmablas_sgemv for trans == MagmaTrans or
// MagmaConjTrans. The transpose mode is a template parameter, so each mode
// gets its own instantiation; the body is a single forwarding call to
// gemvc_template_device for float with DIM_X, DIM_Y, TILE_SIZE and trans.
// The host driver sgemvc_template_fermi launches this kernel with
// DIM_X x DIM_Y threads per block and ceildiv(n, TILE_SIZE) blocks along x.
// The forwarding call is compiled only for HIP builds or when
// __CUDA_ARCH__ >= 200; in any other compilation pass the kernel is empty.
template<const int DIM_X, const int DIM_Y, const int TILE_SIZE, magma_trans_t trans>
__global__ void
sgemvc_template_kernel_fermi(
    int m, int n, float alpha,
    const float * __restrict__ A, int lda,
    const float * __restrict__ x, int incx, float beta,
    float       * __restrict__ y, int incy)
{
#if defined(MAGMA_HAVE_HIP) || (__CUDA_ARCH__ >= 200)
    gemvc_template_device<float, DIM_X, DIM_Y, TILE_SIZE, trans>(
        m, n, alpha,
        A, lda,
        x, incx,
        beta,
        y, incy);
#endif /* defined(MAGMA_HAVE_HIP) || (__CUDA_ARCH__ >= 200) */
}


/******************************************************************************/
// NoTrans CPU driver
// Host-side launcher for sgemvn_template_kernel_fermi.
// Builds a DIM_X x DIM_Y thread block and a one-dimensional grid of
// ceildiv(m, TILE_SIZE) blocks, then enqueues the kernel on the HIP stream
// owned by `queue` with no dynamic shared memory. All scalar and pointer
// arguments are forwarded to the kernel unchanged (the kernel takes the
// integer arguments as int). The launch is not checked for errors here.
template<const int DIM_X, const int DIM_Y, const int TILE_SIZE>
void
sgemvn_template_fermi(
    magma_int_t m, magma_int_t n, float alpha,
    const float * __restrict__ A, magma_int_t lda,
    const float * __restrict__ x, magma_int_t incx, float beta,
    float       * __restrict__ y, magma_int_t incy,
    magma_queue_t queue)
{
    // Launch geometry: block shape comes from the template parameters,
    // grid extent from the row count m.
    const dim3 block_shape( DIM_X, DIM_Y );
    const dim3 grid_shape( magma_ceildiv(m, TILE_SIZE), 1 );

    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(sgemvn_template_kernel_fermi<DIM_X, DIM_Y, TILE_SIZE>),
        grid_shape, block_shape, 0, queue->hip_stream(),
        m, n, alpha, A, lda, x, incx, beta, y, incy);
}


/******************************************************************************/
// Trans/ConjTrans CPU driver
// Host-side launcher for sgemvc_template_kernel_fermi.
// Builds a DIM_X x DIM_Y thread block and a one-dimensional grid of
// ceildiv(n, TILE_SIZE) blocks, then enqueues the kernel on the HIP stream
// owned by `queue` with no dynamic shared memory. The runtime value of
// `trans` picks the kernel instantiation: MagmaConjTrans selects the
// MagmaConjTrans instantiation, and every other value selects the MagmaTrans
// one. The launch is not checked for errors here.
template<const int DIM_X, const int DIM_Y, const int TILE_SIZE>
void
sgemvc_template_fermi(
    magma_trans_t trans, magma_int_t m, magma_int_t n, float alpha,
    const float * __restrict__ A, magma_int_t lda,
    const float * __restrict__ x, magma_int_t incx, float beta,
    float       * __restrict__ y, magma_int_t incy,
    magma_queue_t queue)
{
    // Launch geometry: block shape comes from the template parameters,
    // grid extent from the column count n.
    const dim3 block_shape( DIM_X, DIM_Y );
    const dim3 grid_shape( magma_ceildiv(n, TILE_SIZE), 1 );

    // Anything other than MagmaConjTrans is handled by the MagmaTrans kernel.
    if (trans != MagmaConjTrans) {
        hipLaunchKernelGGL(
            HIP_KERNEL_NAME(sgemvc_template_kernel_fermi< DIM_X, DIM_Y, TILE_SIZE, MagmaTrans >),
            grid_shape, block_shape, 0, queue->hip_stream(),
            m, n, alpha, A, lda, x, incx, beta, y, incy);
        return;
    }

    hipLaunchKernelGGL(
        HIP_KERNEL_NAME(sgemvc_template_kernel_fermi< DIM_X, DIM_Y, TILE_SIZE, MagmaConjTrans >),
        grid_shape, block_shape, 0, queue->hip_stream(),
        m, n, alpha, A, lda, x, incx, beta, y, incy);
}


/***************************************************************************//**
    Purpose
    -------
    SGEMV performs one of the matrix-vector operations

        y := alpha*A*x    + beta*y,   or
        y := alpha*A**T*x + beta*y,   or
        y := alpha*A**H*x + beta*y,

    where alpha and beta are scalars, x and y are vectors and A is an
    m by n matrix.

    Arguments
    ----------
    @param[in]
    trans   magma_trans_t
            On entry, TRANS specifies the operation to be performed as
            follows:
      -     = MagmaNoTrans:    y := alpha*A  *x + beta*y
      -     = MagmaTrans:      y := alpha*A^T*x + beta*y
      -     = MagmaConjTrans:  y := alpha*A^H*x + beta*y

    @param[in]
    m       INTEGER
            On entry, m specifies the number of rows of the matrix A.

    @param[in]
    n       INTEGER
            On entry, n specifies the number of columns of the matrix A.

    @param[in]
    alpha   REAL
            On entry, ALPHA specifies the scalar alpha.

    @param[in]
    dA      REAL array of dimension ( LDDA, n ) on the GPU.

    @param[in]
    ldda    INTEGER
            LDDA specifies the leading dimension of dA. ldda >= m.

    @param[in]
    dx      REAL array of dimension
            n if trans == MagmaNoTrans
            m if trans == MagmaTrans or MagmaConjTrans

    @param[in]
    incx    Specifies the increment for the elements of X.
            INCX must not be zero.

    @param[in]
    beta    REAL
            On entry, BETA specifies the scalar beta. When BETA is
            supplied as zero then Y need not be set on input.

    @param[in,out]
    dy      REAL array of dimension
            m if trans == MagmaNoTrans
            n if trans == MagmaTrans or MagmaConjTrans

    @param[in]
    incy    Specifies the increment for the elements of Y.
            INCY must not be zero.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_gemv
*******************************************************************************/
extern "C" void
magmablas_sgemv(
    magma_trans_t trans, magma_int_t m, magma_int_t n,
    float alpha,
    magmaFloat_const_ptr dA, magma_int_t ldda,
    magmaFloat_const_ptr dx, magma_int_t incx,
    float beta,
    magmaFloat_ptr dy, magma_int_t incy,
    magma_queue_t queue)
{
    // Argument validation. The checks run in argument order and the first
    // failure wins; info is the negated 1-based position of the offending
    // argument (trans=1, m=2, n=3, ldda=6, incx=8, incy=11).
    const bool trans_valid = ( trans == MagmaNoTrans
                            || trans == MagmaTrans
                            || trans == MagmaConjTrans );

    const magma_int_t info =
          ( ! trans_valid ) ?  -1
        : ( m < 0 )         ?  -2
        : ( n < 0 )         ?  -3
        : ( ldda < m )      ?  -6
        : ( incx == 0 )     ?  -8
        : ( incy == 0 )     ? -11
        :                       0;

    if ( info != 0 ) {
        // Report the bad argument and bail out; this routine returns void,
        // so the error code is not propagated to the caller.
        magma_xerbla( __func__, -(info) );
        return;
    }

    // Quick return: an empty matrix, or alpha == 0 with beta == 1, launches
    // nothing and leaves dy untouched.
    const bool empty_matrix = ( m == 0 ) || ( n == 0 );
    const bool y_unchanged  = ( alpha == MAGMA_S_ZERO ) && ( beta == MAGMA_S_ONE );
    if ( empty_matrix || y_unchanged ) {
        return;
    }

    // Dispatch to the Fermi-template drivers. version(T, 189) and
    // version(N, 106) name the template-argument sets used for the
    // transposed and non-transposed paths respectively.
    if ( trans != MagmaNoTrans ) {
        sgemvc_template_fermi<version(T, 189)>(
            trans, m, n, alpha, dA, ldda, dx, incx, beta, dy, incy, queue );
        return;
    }

    sgemvn_template_fermi<version(N, 106)>(
        m, n, alpha, dA, ldda, dx, incx, beta, dy, incy, queue );
}
