// file kernel/n/x86-64/toom.S: Toom multiplication of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Multiplication de Toom                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

#if defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr)
        
                         # +------------------------+
                         # |  Addition/subtraction  |
                         # +------------------------+

# inputs :
#   a = natural number of length 2p+q   rsi = &a,  rdx = p,  rcx = q
#   b = natural number of length 2p+2   rdi = &b
# constraints : 0 < q <= p
#
# outputs :
#   b[0..p]      <-  a[0..p-1] + a[p..2p-1] + a[2p..2p+q-1]
#   b[p+1..2p+1] <- |a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]|
#   CF           <- sign of a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]
#
# notation used below : a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1],
#                       b0 = b[0..p],   b1 = b[p+1..2p+1]
#
# clobbered registers :
#   rcx <- 0
#   rax,rbx,rdx,rsi,rdi,rbp,r8,r9,r10 <- undefined
#
# NOTE(review) : sn_fadd, sn_fasub and sn_fadd_1 are defined outside this
# section; the register state they leave behind (cursors advanced past
# the processed digits, rcx = 0, CF = carry or sign) is inferred from the
# way their results are consumed here - TODO confirm.

#undef L
#define L(x) .Lsn_fadd_sub3_##x
        ALIGN(32)
.Lsn_fadd_sub3:


        movq   %rdx,   %r9             # save p
        movq   %rdi,   %r10            # save &b0

        leaq   (%rsi,%rdx,8), %rbx     # rbx <- &a1
        leaq   (%rbx,%rdx,8), %rbx     # rbx <- &a2
        call   .Lsn_fadd               # b0 <- a0+a2
        setc   %cl                     # keep the carry out (rcx presumed 0)
        movq   %rcx,   (%rdi)          # store it as the top digit of b0
        
        leaq 8(%rdi),   %rdi           # rdi <- &b1
        movq   %rsi,    %rbx           # rbx <- &a1
        movq   %r10,    %rsi           # rsi <- &b0
        movq   %r9,     %rcx           # rcx <- p
        leaq 1(%r9),    %rdx           # rdx <- p+1
        movq   %rbx,    %r8            # save &a1
        call   .Lsn_fasub              # b1 <- |a0-a1+a2|

        movq   %r8,     %rbx           # rbx <- &a1  (mov keeps CF intact)
        movq   %r10,    %rsi           # rsi <- &b0
        movq   %r10,    %rdi           # rdi <- &b0
        movq   %r9,     %rcx           # rcx <- p
        setc   %r8b                    # save the sign of a0-a1+a2 in bit 0 of r8
        call   .Lsn_fadd_1             # b0 <- a0+a1+a2
        adcq   %rcx,   (%rsi)          # last digit : fold the carry into b[p]

        bt     $0,      %r8            # CF <- saved sign of a0-a1+a2
        ret
        
        
                        # +-----------------------+
                        # |  Addition with shift  |
                        # +-----------------------+

# inputs :
#   a = natural number of length 2p+q   rsi = &a,  rdx = p,  rcx = q
#   b = natural number of length p+3    rdi = &b
# constraints : 0 < q <= p, p > 2
#
# output :
#   b <-  a[0..p-1] + BASE*a[p..2p-1] + BASE^2*a[2p..2p+q-1]
#
# notation used below : a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1]
#
# clobbered registers :
#   rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10 <- undefined
#
# NOTE(review) : sn_fadd_1 and sn_finc are defined outside this section;
# the code below presumes that sn_fadd_1 returns with rbx and rdi advanced
# past the digits it processed, rcx = 0 and CF = carry - TODO confirm.

#undef L
#define L(x) .Lsn_fadd_base_##x
        ALIGN(32)
.Lsn_fadd_base:

        movq   %rdx,   %r8              # save p
        movq   %rcx,   %r9              # save q
        movq   %rdi,   %r10             # save &b
        
        # b <- a0 + BASE*a1
        movq  (%rsi),   %rax            # the lowest digit of a0 ...
        movq   %rax,   (%rdi)           # ... is copied unchanged into b[0]
        leaq  (%rsi,%rdx,8), %rbx       # rbx <- &a1
        leaq 8(%rsi),   %rsi            # rsi <- &a0[1]
        leaq 8(%rdi),   %rdi            # rdi <- &b[1]
        leaq -1(%rdx),  %rcx            # rcx <- p-1
        call  .Lsn_fadd_1               # b[1..p-1] <- a0[1..p-1] + a1[0..p-2]

        movq  (%rbx),   %rax            # rax <- next digit of a1 (presumably a1[p-1])
        adcq   %rcx,    %rax            # add the pending carry (rcx presumed 0)
        movq   %rax,   (%rdi)           # store the resulting digit
        movq   %rcx, 16(%rdi)           # clear the digit two ranks above
        setc   %cl                      # carry out of the adcq (mov keeps CF)
        movq   %rcx,  8(%rdi)           # store it one rank above

        # b <- b + BASE^2*a2
        leaq 8(%rbx),   %rbx            # rbx <- &a2
        leaq 16(%r10),  %rsi            # rsi <- &b[2]
        movq   %r9,     %rcx            # rcx <- q
        leaq 1(%r8),    %rdx            # rdx <- p+1
        jmp  .Lsn_finc                  # tail call : sn_finc returns to our caller

#endif /* defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr) */

                            # +------------------+
                            # |  Multiplication  |
                            # +------------------+
        
# inputs :
#   a = natural number of length la     rsi = &a, rdx = la
#   b = natural number of length lb     rbx = &b, rcx = lb
#   c = natural number of length la+lb  rdi = &c
# constraints : 0 < lb <= la
#
# output :
#   c <- a * b
#
# clobbered registers : all
#
# method, as implemented below : with p = ceil(la/3), a and b are cut into
# three slices a0,a1,a2 and b0,b1,b2 (slice lengths p,p,q and p,p,r) and
# the five coefficients c0..c4 of the product polynomial are recovered
# from its values at 1, -1, BASE, 0 and infinity. This part computes the
# five values; the code at .Lsn_toom_aux, which follows immediately,
# performs the interpolation.
        
#ifdef assembly_sn_toommul
        ALIGN(4)
#ifdef debug_toommul
.Lsn_ftoommul_buggy:
#else
.Lsn_ftoommul:
#endif

#undef L
#define L(x) .Lsn_ftoommul_##x

        # small multiplication => Karatsuba algorithm
        cmpq   $toommul_lim, %rcx
        jbe    .Lsn_fkaramul
        
        leaq   2(%rdx), %rax            # rax <- la+2
        xorq   %rdx,    %rdx            # rdx:rax <- la+2 (zero high half for divq)
        movq   $3,      %rbp
        divq   %rbp                     # rax <- p = ceil(la/3), rdx <- (la+2) mod 3
	movq   %rax,    %rbp            # rbp <- p
        shlq   $1,      %rax            # rax <- 2p
        subq   %rax,    %rcx            # rcx <- r = lb - 2p
        jbe    L(tranches)              # when lb <= 2p, cut a into slices instead

        # local variables : stack slots, valid once the ten pushes below
        # have been executed.
        #   a, b, c : addresses of the operands and of the result
        #   d, e, f : addresses of three scratch areas inside the
        #             block reserved on the stack
        #   p, q, r : slice lengths (q = la - 2p, r = lb - 2p)
        #   x       : counter of negative evaluations at -1
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _e_
        #undef  _f_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        
        #define _a_   72(%rsp)
        #define _b_   64(%rsp)
        #define _c_   56(%rsp)
        #define _d_   48(%rsp)
        #define _e_   40(%rsp)
        #define _f_   32(%rsp)
        #define _p_   24(%rsp)
        #define _q_   16(%rsp)
        #define _r_    8(%rsp)
        #define _x_     (%rsp)

	movq  %rax,     %r8             # r8 <- 2p
        leaq  10(%rax,%rax,2), %rax     # rax <- 6p+10
        leaq  (,%rax,8), %rax           # rax <- byte size of 6p+10 digits
        ALLOCA                          # reserve 6p+10 digits on the stack
        leaq   -2(%rdx,%rbp,1), %rdx    # rdx <- q = p + (la+2) mod 3 - 2
	movq   %rsp,     %rax           # rax <- &d
        pushq  %rsi                     # save &a
        pushq  %rbx                     # save &b
        pushq  %rdi                     # save &c
        pushq  %rax                     # save &d
        leaq   16(%rax,%r8,8), %rax     # rax <- &d[2p+2]
        pushq  %rax                     # save &e
        leaq   16(%rax,%r8,8), %rax     # rax <- &e[2p+2]
        pushq  %rax                     # save &f
        pushq  %rbp                     # save p
        pushq  %rdx                     # save q
        pushq  %rcx                     # save r
        pushq  $0                       # x <- 0

        # c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        movq   %rdx,    %rcx            # rcx <- q
        movq   %rbp,    %rdx            # rdx <- p
        call   .Lsn_fadd_sub3
        adcq   %rcx,    _x_             # x += CF, the sign of a0 - a1 + a2 (rcx = 0)
        
        # c[2p+2..3p+2] <- b0 + b1 + b2, c[3p+3..4p+3] <- |b0 - b1 + b2|
        movq   _b_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        movq   _r_,     %rcx
        leaq   16(%rdi,%rdx,8), %rdi    # rdi <- &c[p+2]
        leaq     (%rdi,%rdx,8), %rdi    # rdi <- &c[2p+2]
        call   .Lsn_fadd_sub3
        adcq   %rcx,    _x_             # x += CF, the sign of b0 - b1 + b2 (rcx = 0)

        # d <- (a0 + a1 + a2)(b0 + b1 + b2) = c0 + c1 + c2 + c3 + c4
        movq   _p_,     %rcx
        leaq   (,%rcx,2), %rdx          # rdx <- 2p
        movq   _c_,     %rbx            # rbx <- &c[0], the sum a0 + a1 + a2
        leaq  16(%rbx,%rdx,8), %rsi     # rsi <- &c[2p+2], the sum b0 + b1 + b2
        movq   _d_,     %rdi
        movq   $0,  (%rdi,%rdx,8)       # d[2p] <- 0
        testq  $-1, (%rsi,%rcx,8)       # top digit of b0 + b1 + b2 ?
        movq   %rcx,    %rdx            # rdx <- p  (mov keeps ZF intact)
        jz     1f
        incq   %rdx                     # nonzero top digit : length is p+1
1:
        testq  $-1, (%rbx,%rcx,8)       # top digit of a0 + a1 + a2 ?
        jz     2f
        incq   %rcx                     # nonzero top digit : length is p+1
        xchgq  %rcx,    %rdx            # swap the operands so that the one
        xchgq  %rbx,    %rsi            # ... in (rsi,rdx) is the longest
2:
        call   .Lsn_ftoommul
	
        # e <- |a0 - a1 + a2|*|b0 - b1 + b2| = |c0 - c1 + c2 - c3 + c4|
        movq   _p_,     %rcx
        movq   _c_,     %rbx
	leaq   (%rcx,%rcx,1), %rdx      # rdx <- 2p
        leaq  8(%rbx,%rcx,8), %rbx      # rbx <- &c[p+1],  |a0 - a1 + a2|
        leaq 16(%rbx,%rdx,8), %rsi      # rsi <- &c[3p+3], |b0 - b1 + b2|
        movq   _e_,     %rdi
        movq   $0,  (%rdi,%rdx,8)       # e[2p] <- 0
        testq  $-1, (%rsi,%rcx,8)       # top digit of |b0 - b1 + b2| ?
        movq   %rcx,    %rdx            # rdx <- p  (mov keeps ZF intact)
        jz     1f
        incq   %rdx                     # nonzero top digit : length is p+1
1:
        testq  $-1, (%rbx,%rcx,8)       # top digit of |a0 - a1 + a2| ?
        jz     2f
        incq   %rcx                     # nonzero top digit : length is p+1
        xchgq  %rcx,    %rdx            # swap the operands so that the one
        xchgq  %rbx,    %rsi            # ... in (rsi,rdx) is the longest
2:
        call   .Lsn_ftoommul
        
        # c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        movq   _q_,     %rcx
        call   .Lsn_fadd_base
        
        
        # c[p+3..2p+5] <- b0 + BASE*b1 + BASE^2*b2
        movq   _b_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        movq   _r_,     %rcx
        leaq 24(%rdi,%rdx,8), %rdi      # rdi <- &c[p+3]
        call   .Lsn_fadd_base

        # f <- (a0 + BASE*a1 + BASE^2*a2)*(b0 + BASE*b1 + BASE^2*b2)
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        movq   _p_,     %rcx
        movq   _f_,     %rdi
        xorq   %rax,    %rax            # rax <- 0, reference for the cmpq below
#if 0
        # this is unnecessary : the digits of rank 2p+2 to 2p+5
        # of f are going to be ignored (see below)
	leaq   (%rcx,%rcx,1), %rdx
        movq   %rax,   16(%rdi,%rdx,8)  # f[2p+2] <- 0
        movq   %rax,   24(%rdi,%rdx,8)  # f[2p+3] <- 0
        movq   %rax,   32(%rdi,%rdx,8)  # f[2p+4] <- 0
        movq   %rax,   40(%rdi,%rdx,8)  # f[2p+5] <- 0
#endif
        addq   $3,      %rcx            # rcx <- p+3
        movq   %rcx,    %rdx            # rdx <- p+3
        movq   _c_,     %rsi
        leaq    (%rsi,%rdx,8), %rbx     # rbx <- &c[p+3]
        # cmpq sets CF exactly when the tested digit is nonzero, so each
        # adcq $-1 drops the length by one only when the current top
        # digit is zero; at most two leading zeros are stripped.
        cmpq  -8(%rsi,%rdx,8), %rax     # rdx <- lg(a0+BASE*a1+BASE^2*a2)
        adcq   $-1,     %rdx
        cmpq  -8(%rsi,%rdx,8), %rax
        adcq   $-1,     %rdx
        cmpq  -8(%rbx,%rcx,8), %rax     # rcx <- lg(b0+BASE*b1+BASE^2*b2)
        adcq   $-1,     %rcx
        cmpq  -8(%rbx,%rcx,8), %rax
        adcq   $-1,     %rcx
        cmpq   %rdx,    %rcx            # order the arguments so that
        jbe    1f                       # ... lg(rsi,rdx) >= lg(rbx,rcx)
        xchgq  %rcx,    %rdx
        xchgq  %rbx,    %rsi
1:
        call   .Lsn_ftoommul

        # c[0..2p-1] <- a0*b0 = c0
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        movq   %rdx,    %rcx            # both operands have length p
        call   .Lsn_ftoommul

        # c[4*p..4p+q+r-1] <- a2*b2 = c4
        movq   _p_,     %rax
        leaq   (,%rax,4), %rax          # rax <- 4p
        movq   _a_,     %rsi
        movq   _b_,     %rbx
        movq   _c_,     %rdi
        leaq   (%rsi,%rax,4), %rsi      # rsi <- &a2  (16p bytes = 2p digits)
        leaq   (%rbx,%rax,4), %rbx      # rbx <- &b2
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c[4p]
        movq   _q_,     %rdx
        movq   _r_,     %rcx
        call   .Lsn_ftoommul
        
        # landing point for toom_sqr
        #
        # Interpolation step shared by the multiplication and the square.
        # On entry the ten stack slots a,b,c,d,e,f,p,q,r,x set up by the
        # caller are live, d, e and f hold the values of the product
        # polynomial at 1, -1 (absolute value) and BASE, c[0..2p-1] holds
        # c0 and c[4p..4p+q+r-1] holds c4.
        #
        # NOTE(review) : sn_fadd_1, sn_fsub_1, sn_finc, sn_fdec and sn_fhalf
        # are defined elsewhere; the code relies on them advancing their
        # cursors (rsi, rbx) past the processed digits, returning rcx = 0
        # and CF = carry or borrow, and leaving r9-r15 untouched - TODO
        # confirm against their definitions.
.Lsn_toom_aux:

	# rebind the local variables to registers
	movq   _c_,     %r9
	movq   _d_,     %r10
	movq   _f_,     %r11
	movq   _p_,     %r12
	mov    _q_,     %r13
	movq   _r_,     %r14
	addq   %r13,    %r14   # r14 <- q+r
	movq   _x_,     %r15
	
	# from here on c, d, f, p, q and x live in registers, the name r
	# designates q+r, and e is still read from its stack slot
	#undef  _c_
	#undef  _d_
	#undef  _f_
	#undef  _p_
	#undef  _q_
	#undef  _r_
	#undef  _x_
	#define _c_  %r9
	#define _d_  %r10
	#define _f_  %r11
	#define _p_  %r12
	#define _q_  %r13
	#define _r_  %r14
	#define _x_  %r15
	
        # c[2p..4p] <- (d+e)/2 = c0 + c2 + c4, d <- (d-e)/2 = c1 + c3
        # (e is meant with its sign here : the stored value is an absolute
        # value and bit 0 of x tells whether the true value is negative)
        movq   _c_,     %rdi
        movq   _d_,     %rsi
        movq   _e_,     %rbx
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        leaq   -8(%rdi,%rcx,8), %rdi    # rdi <- &c[2p]
	leaq   .Lsn_fsub_1(%rip), %rax  # rax <- address of the subtraction
	leaq   .Lsn_fadd_1(%rip), %r8   # r8  <- address of the addition
	bt     $0,      _x_             # CF <- parity of the sign counter
	cmovc  %rax,    %r8             # odd : the stored e must be subtracted
	movq   -8(%rdi,%rcx,8), _x_     # save c[4p] into x
	call   *%r8                     # c[2p..4p] <- d + e or d - e
        movq   _c_,     %rsi
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        movq   %rcx,    %rdx            # rdx <- 2p+1
        leaq   -8(%rsi,%rcx,8), %rsi    # rsi <- &c[2p]
        movq   %rsi,    %rbx            # rbx <- &c[2p]
        call   .Lsn_fhalf               # c[2p..4p] /= 2
        movq   %rdx,    %rcx            # rcx <- 2p+1
        movq   _d_,     %rsi
	movq   %rsi,    %rdi            # subtract in place : rdi = rsi = &d
        call   .Lsn_fsub_1              # d -= c[2p..4p]
        xchgq -8(%rbx),  _x_            # restore c[4p], x <- digit computed at rank 4p

        # c[2p..4p] <- c[2p..4p] - c0 - c4 = c2
        # (the digit of rank 4p is carried in x, see the xchgq above)
        leaq   (,_p_,2), %rcx           # rcx <- 2p
        movq   _c_,     %rbx
        leaq   (%rbx,%rcx,8), %rsi      # rsi <- &c[2p]
	movq   %rsi,    %rdi            # subtract in place
        call   .Lsn_fsub_1              # c[2p..4p-1] -= c0
        sbbq   %rcx,    _x_             # x -= borrow
        movq   %rbx,    %rsi            # rsi <- &c[2p]
        leaq   (,_p_,2), %rdx           # rdx <- 2p
        movq   _r_,     %rcx            # rcx <- q+r
        leaq   (%rsi,%rdx,8), %rbx      # rbx <- &c[4p]
        call   .Lsn_fdec                # c[2p..4p-1] -= c4
        sbbq   %rcx,    _x_             # x -= borrow

        # f <- f - c0 - BASE^2*c2 - BASE^4*c4 = BASE*c1 + BASE^3*c3
        #
        # note 1 : f has 2p+6 digits but it is only used to compute
        # -BASE*c3, which fits in p+q+2 digits, so the digits of rank
        # 2p+2 to 2p+5 can be ignored (they may not even have been
        # computed)
        #
        # note 2 : f and c0 have the same units digit, so the subtraction
        # can start at rank 1. There is no need to force the first digit
        # to zero since it is never used afterwards
        leaq   -1(,_p_,2), %rcx         # rcx <- 2p-1
        leaq   8(_c_),  %rbx            # rbx <- &c[1]
        leaq   8(_f_),  %rsi            # rsi <- &f[1]
	movq   %rsi,    %rdi            # subtract in place
        call   .Lsn_fsub_1              # f -= c0
        sbbq   %rcx,    (%rsi)          # propagate the borrow
        sbbq   %rcx,   8(%rsi)
        leaq   (,_p_,2), %rcx           # rcx <- 2p
        leaq  16(_f_),  %rsi            # rsi <- &f[2]
	movq   %rsi,    %rdi            # subtract in place
        call   .Lsn_fsub_1              # f -= c2*BASE^2
        # rdx = 2p-2 would be enough here since only the first 2p+2
        # digits of f are wanted, but sn_fdec would crash when
        # q+r > 2p-2.
        leaq   (,_p_,2), %rdx           # rdx <- 2p
        leaq   32(_f_), %rsi            # rsi <- &f[4]
        movq   _r_,     %rcx            # rcx <- q+r
        call   .Lsn_fdec                # f -= c4*BASE^4

        # f <- f - BASE*d = BASE*(BASE^2 - 1)*c3
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        leaq   8(_f_), %rsi             # rsi <- &f[1]
        movq   _d_,     %rbx            # rbx <- &d
	movq   %rsi,    %rdi            # subtract in place
        call   .Lsn_fsub_1              # f -= BASE*d

        # f <- -f/(BASE^2 - 1) = -BASE*c3 mod BASE^(2p+2)
        leaq   -1(,_p_,2), %rcx         # rcx <- 2p - 1
        leaq   8(_f_), %rbx             # rbx <- &f[1]
        leaq  16(%rbx), %rsi            # rsi <- &f[3]
	movq   %rsi,    %rdi            # add in place
        call   .Lsn_fadd_1              # divide by 1 - BASE^2
        testq  $-1,  -8(%rsi)
        jz     L(c3_nul)                # ZF = 1 if and only if c3 = 0

        # c[3p..4p+q+r-1] += c3
	leaq   (_p_,_p_,2),  %rdx       # rdx <- 3p
	leaq   (_c_,%rdx,8), %rsi       # rsi <- &c[3p]
	leaq   1(_p_,_q_,1), %rcx       # rcx <- p+q+1
        leaq   8(_f_),  %rbx            # rbx <- &f[1]
	movq   %rsi,    %rdi            # subtract in place
        call   .Lsn_fsub_1              # subtract BASE^(p+q+1) - c3
        jb     2f                       # when there is no borrow,
1:                                      # BASE^(p+q+1) must be added back
        incq   (%rsi)
        leaq   8(%rsi), %rsi            # next digit (lea keeps ZF intact)
        jz     1b                       # the digit wrapped to zero : carry on
2:
        
        # d <- d + f/BASE = c1
        leaq   1(,_p_,2), %rcx          # rcx <- 2p+1
        movq   _d_,     %rsi            # rsi <- &d
        leaq   8(_f_),  %rbx            # rbx <- &f[1]
	movq   %rsi,    %rdi            # add in place
        call   .Lsn_fadd_1              # d -= c3 (adds f/BASE = -c3 modulo the length)

        # c[p..4p+q+r-1] += c1
L(c3_nul):
	leaq  (_c_,_p_,8), %rsi         # rsi <- &c[p]
        leaq   1(,_p_,2),  %rcx         # rcx <- 2p+1
	leaq  -1(_p_,_r_,1), %rdx       # rdx <- p+q+r-1
	addq  %rcx,     %rdx            # rdx <- 3p+q+r
        movq   _d_,     %rbx
        call   .Lsn_finc                # add c1

        # c[4p] += x
	leaq   (,_p_,4), %rcx           # rcx <- 4p
	leaq   (_c_,%rcx,8), %rsi       # rsi <- &c[4p]
	addq   _x_,     (%rsi)          # c[4p] += x
        jnc    2f
1:
        leaq   8(%rsi), %rsi            # propagate the carry
        incq   (%rsi)
        jz     1b
2:

        # done
        leaq  (_p_,_p_,2),   %rax       # rax <- 3p
	leaq  20(,%rax,2),   %rax       # rax <- 6p+20
        leaq  (%rsp,%rax,8), %rsp       # release the 6p+10 digits and the ten slots
        ret

        # here lb <= 2*ceil(la/3) : a is cut into slices of lb digits and
        # each slice is multiplied by b, the partial products being
        # accumulated into c. On entry rax = 2p, rbp = p, rcx = lb - 2p and
        # rdx = (la+2) mod 3, as left by the entry code of sn_ftoommul.
        ALIGN(4)
L(tranches):
        
        addq   %rax,    %rcx            # rcx <- lb
        leaq   -2(%rdx,%rbp,1), %rdx    # rdx <- q
        leaq   (%rdx,%rbp,2),   %rdx    # rdx <- la

        # The code below is duplicated verbatim in karamul, with the two
        # calls to sn_ftoommul replaced by calls to sn_fkaramul.
        # Remember to propagate any update there.

        # local variables : stack slots, valid once the five pushes below
        # have been executed.
        #   d      : first digit of the lb-digit buffer reserved on the stack
        #   la, lb : remaining length of a, length of b
        #   a, b   : address of the next slice of a, address of b
        #   c      : address where the next partial product is written
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
        #define _d_  40(%rsp)
        #define _la_ 32(%rsp)
        #define _lb_ 24(%rsp)
        #define _a_  16(%rsp)
        #define _b_   8(%rsp)
        #define _c_    (%rsp)
        
        leaq   (,%rcx,8), %rax          # rax <- byte size of lb digits
        ALLOCA                          # reserve lb digits on the stack
        pushq  %rdx                     # save la
        pushq  %rcx                     # save lb

        # first multiplication : c <- a[0..(la % lb)-1]*b
        movq   %rdx,    %rax
        movq   $0,      %rdx            # rdx:rax <- la
        divq   %rcx                     # rdx <- la % lb
        testq  %rdx,    %rdx            # when la is a multiple of lb ...
        jnz    1f
        movq   %rcx,    %rdx            # ... the first slice has lb digits
1:
        xchgq  %rbx,    %rsi            # swap the arguments ...
        xchgq  %rcx,    %rdx            # so that rdx >= rcx
        leaq   (%rbx,%rcx,8), %rax      # rax <- &a[first slice length]
        pushq  %rax                     # a += la % lb (or lb)
        pushq  %rsi                     # save &b
        leaq   (%rdi,%rcx,8), %rax      # rax <- &c[first slice length]
        pushq  %rax                     # c += la % lb (or lb)
        subq   %rcx,    _la_            # la -= la % lb (or lb)
        call   .Lsn_ftoommul

        # subsequent multiplications
        ALIGN(4)
L(loop):
        movq   _c_,     %rsi
        leaq   _d_,     %rdi
        movq   _lb_,    %rcx
        cld;   rep movsq                # d <- c[0..lb-1]
        
        movq   _c_,     %rdi
        movq   _b_,     %rsi
        movq   _a_,     %rbx
        movq   _lb_,    %rdx
        movq   %rdx,    %rcx            # rcx <- lb
        call   .Lsn_ftoommul            # c[0..2lb-1] <- a[0..lb-1]*b

        movq   _c_,     %rsi
        leaq   _d_,     %rbx
        movq   _lb_,    %rcx
        leaq   (,%rcx,2), %rdx          # rdx <- 2*lb
        call   .Lsn_finc                # c <- c + d

        movq   _lb_,    %rax
        leaq   (,%rax,8), %rcx          # rcx <- byte size of lb digits
        addq   %rcx,    _c_             # c+=lb
        addq   %rcx,    _a_             # a+=lb
        subq   %rax,    _la_            # la -= lb
        jne    L(loop)                  # loop until a is exhausted

        # done
        leaq   40(%rsp,%rax,8), %rsp    # release the five slots and the lb digits
        ret


                              # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(toommul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
#
#  inputs :
#  a = natural number of length la
#  b = natural number of length lb
#  c = natural number of length la+lb, distinct from a and b
#  constraints : 0 < lb <= la
#
#  output :
#  c <- a*b
#
#  The wrapper only moves the C arguments into the registers expected by
#  sn_ftoommul (rsi = &a, rdx = la, rbx = &b, rcx = lb, rdi = &c).
#  NOTE(review) : the moves below match the System V AMD64 argument order
#  (rdi, rsi, rdx, rcx, r8); the entry and return macros are defined
#  elsewhere and presumably save and restore the callee-saved registers
#  and the stack pointer - TODO confirm.

#ifdef debug_toommul
ENTER(sn_toommul_buggy)
#else
ENTER(sn_toommul)
#endif

	movq   %rdx,   %rbx            # rbx <- &b
	movq   %rsi,   %rdx            # rdx <- la
	movq   %rdi,   %rsi            # rsi <- &a
	movq   %r8,    %rdi            # rdi <- &c  (rcx already holds lb)
#ifdef debug_toommul
        call   .Lsn_ftoommul_buggy      # perform the multiplication
#else
        call   .Lsn_ftoommul
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_toommul */

        # case where the assembly version is disabled or being debugged :
        # sn_ftoommul forwards to the C version. The moves below are the
        # exact inverse of the register shuffle done by the C interface of
        # the assembly version, and rcx (lb) is passed through unchanged.

#if !defined(assembly_sn_toommul) || defined(debug_toommul)
        ALIGN(32)
.Lsn_ftoommul:

	movq   %rdi,   %r8             # r8  <- &c
	movq   %rsi,   %rdi            # rdi <- &a
	movq   %rdx,   %rsi            # rsi <- la
	movq   %rbx,   %rdx            # rdx <- &b
        jmp    SUBR(sn_toommul)         # tail call : the C routine returns to our caller
        
#endif /* !defined(assembly_sn_toommul) || defined(debug_toommul) */



                                 # +----------+
                                 # |  Square  |
                                 # +----------+

# inputs :
#   a = natural number of length la     rsi = &a, rdx = la
#   c = natural number of length 2*la   rdi = &c
# constraints : 0 < la
#
# output :
#   c <- a^2
#
# clobbered registers : all
#
# Same scheme as the Toom multiplication with b = a : a is cut into three
# slices a0,a1,a2 of lengths p,p,q with p = ceil(la/3), five squares are
# computed, and the interpolation is delegated to .Lsn_toom_aux.
# NOTE(review) : .Lsn_toom_aux is defined in the multiplication code, under
# the assembly_sn_toommul conditional - presumably both assembly versions
# are always enabled together, TODO confirm.


#ifdef assembly_sn_toomsqr
        ALIGN(32)
#ifdef debug_toommul
.Lsn_ftoomsqr_buggy:
#else
.Lsn_ftoomsqr:
#endif

#undef L
#define L(x) .Lsn_ftoomsqr_##x

        # small square => Karatsuba algorithm
        cmpq   $toomsqr_lim, %rdx
        jbe    .Lsn_fkarasqr

        movq   %rdx,    %rcx            # rcx <- la
        leaq   2(%rdx), %rax            # rax <- la+2
        xorq   %rdx,    %rdx            # rdx:rax <- la+2 (zero high half for divq)
        movq   $3,      %rbp
        divq   %rbp                     # rax <- p = ceil(la/3)
	movq   %rax,    %rbp            # rbp <- p
        shlq   $1,      %rax            # rax <- 2p
        subq   %rax,    %rcx            # rcx <- q = la - 2p

        # local variables : same stack layout as in the multiplication, so
        # that the shared interpolation code finds every slot in place
        # (the b slot duplicates a and the r slot duplicates q)
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _e_
        #undef  _f_
        #undef  _p_
        #undef  _q_
        #undef  _r_
        #undef  _x_
        
        #define _a_   72(%rsp)
        #define _b_   64(%rsp)
        #define _c_   56(%rsp)
        #define _d_   48(%rsp)
        #define _e_   40(%rsp)
        #define _f_   32(%rsp)
        #define _p_   24(%rsp)
        #define _q_   16(%rsp)
        #define _r_    8(%rsp)
        #define _x_     (%rsp)
        
	movq  %rax,     %r8             # r8 <- 2p
        leaq  10(%rax,%rax,2), %rax     # rax <- 6p+10
        leaq  (,%rax,8), %rax           # rax <- byte size of 6p+10 digits
        ALLOCA                          # reserve 6p+10 digits on the stack
	movq   %rsp,     %rax           # rax <- &d
        pushq  %rsi                     # save &a
        pushq  %rsi                     # save &b (= &a)
        pushq  %rdi                     # save &c
        pushq  %rax                     # save &d
        leaq   16(%rax,%r8,8), %rax     # rax <- &d[2p+2]
        pushq  %rax                     # save &e
        leaq   16(%rax,%r8,8), %rax     # rax <- &e[2p+2]
        pushq  %rax                     # save &f
        pushq  %rbp                     # save p
        pushq  %rcx                     # save q
        pushq  %rcx                     # save r (= q)
        pushq  $0                       # x <- 0

        # c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        # (the sign returned in CF is not recorded : the value at -1 is
        # squared, and x stays at 0)
        movq   %rbp,    %rdx            # rdx <- p  (rcx still holds q)
        call   .Lsn_fadd_sub3

        # d <- (a0 + a1 + a2)^2 = c0 + c1 + c2 + c3 + c4
        movq   _p_,     %rdx
        leaq   (,%rdx,2), %rcx          # rcx <- 2p
        movq   _c_,     %rsi            # rsi <- &c[0], the sum a0 + a1 + a2
        movq   _d_,     %rdi
        movq   $0,  (%rdi,%rcx,8)       # d[2p] <- 0
        testq  $-1, (%rsi,%rdx,8)       # top digit of a0 + a1 + a2 ?
        jz     1f
        incq   %rdx                     # nonzero top digit : length is p+1
1:
        call   .Lsn_ftoomsqr

        # e <- (a0 - a1 + a2)^2 = c0 - c1 + c2 - c3 + c4
        movq   _p_,     %rdx
        leaq   (,%rdx,2), %rcx          # rcx <- 2p
        movq   _c_,     %rsi
        leaq  8(%rsi,%rdx,8), %rsi      # rsi <- &c[p+1], |a0 - a1 + a2|
        movq   _e_,     %rdi
        movq   $0,  (%rdi,%rcx,8)       # e[2p] <- 0
        testq  $-1, (%rsi,%rdx,8)       # top digit of |a0 - a1 + a2| ?
        jz     1f
        incq   %rdx                     # nonzero top digit : length is p+1
1:
        call   .Lsn_ftoomsqr
        
        # c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        movq   _q_,     %rcx
        call   .Lsn_fadd_base
        
        # f <- (a0 + BASE*a1 + BASE^2*a2)^2
        #    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        movq   _p_,     %rdx
        movq   _f_,     %rdi
        xorq   %rax,    %rax            # rax <- 0, reference for the cmpq below
#if 0
        # this is unnecessary : the digits of rank 2p+2 to 2p+5
        # of f are going to be ignored (see above)
	leaq   (%rdx,%rdx,1), %rcx
        movq   %rax,   16(%rdi,%rcx,8)  # f[2p+2] <- 0
        movq   %rax,   24(%rdi,%rcx,8)  # f[2p+3] <- 0
        movq   %rax,   32(%rdi,%rcx,8)  # f[2p+4] <- 0
        movq   %rax,   40(%rdi,%rcx,8)  # f[2p+5] <- 0
#endif
        addq   $3,      %rdx            # rdx <- p+3
        movq   _c_,     %rsi
        # cmpq sets CF exactly when the tested digit is nonzero, so each
        # adcq $-1 drops the length by one only when the current top
        # digit is zero; at most two leading zeros are stripped.
        cmpq  -8(%rsi,%rdx,8), %rax     # rdx <- lg(a0+BASE*a1+BASE^2*a2)
        adcq   $-1,     %rdx
        cmpq  -8(%rsi,%rdx,8), %rax
        adcq   $-1,     %rdx
        call   .Lsn_ftoomsqr

        # c[0..2p-1] <- a0^2 = c0
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        movq   _p_,     %rdx
        call   .Lsn_ftoomsqr

        # c[4*p..4p+q+r-1] <- a2^2 = c4
        movq   _p_,     %rax
        leaq   (,%rax,4), %rax          # rax <- 4p
        movq   _a_,     %rsi
        movq   _c_,     %rdi
        leaq   (%rsi,%rax,4), %rsi      # rsi <- &a2  (16p bytes = 2p digits)
        leaq   (%rdi,%rax,8), %rdi      # rdi <- &c[4p]
        movq   _q_,     %rdx
        call   .Lsn_ftoomsqr
        
        jmp    .Lsn_toom_aux            # continue with the toommul interpolation code

        
	                      # +---------------+
                              # |  C interface  |
                              # +---------------+

#  void xn(toomsqr)(chiffre *a, long la, chiffre *b)
#
#  inputs :
#  a = natural number of length la
#  b = natural number of length 2*la, distinct from a
#  constraints : 0 < la
#
#  output :
#  b <- a^2
#
#  The wrapper only moves the C arguments into the registers expected by
#  sn_ftoomsqr (rsi = &a, rdx = la, rdi = address of the result).
#  NOTE(review) : the moves below match the System V AMD64 argument order
#  (rdi, rsi, rdx); the entry and return macros are defined elsewhere and
#  presumably save and restore the callee-saved registers and the stack
#  pointer - TODO confirm.

#ifdef debug_toommul
ENTER(sn_toomsqr_buggy)
#else
ENTER(sn_toomsqr)
#endif

	movq   %rdx,   %rax            # rax <- &b (result)
	movq   %rsi,   %rdx            # rdx <- la
	movq   %rdi,   %rsi            # rsi <- &a
	movq   %rax,   %rdi            # rdi <- &b (result)
#ifdef debug_toommul
        call   .Lsn_ftoomsqr_buggy      # compute the square
#else
        call   .Lsn_ftoomsqr
#endif
        RETURN_WITH_SP
#endif /* assembly_sn_toomsqr */

        # case where the assembly version is disabled or being debugged :
        # sn_ftoomsqr forwards to the C version. The moves below are the
        # exact inverse of the register shuffle done by the C interface of
        # the assembly version.
        
#if !defined(assembly_sn_toomsqr) || defined(debug_toommul)
        ALIGN(32)
.Lsn_ftoomsqr:

	movq   %rdi,   %rax            # rax <- address of the result
	movq   %rsi,   %rdi            # rdi <- &a
	movq   %rdx,   %rsi            # rsi <- la
	movq   %rax,   %rdx            # rdx <- address of the result
        jmp    SUBR(sn_toomsqr)         # tail call : the C routine returns to our caller
        
#endif /* !defined(assembly_sn_toomsqr) || defined(debug_toommul) */


