Support for Signal calls.

Merge in RedPhone

// FREEBIE
Moxie Marlinspike
2015-09-09 13:54:29 -07:00
parent 3d4ae60d81
commit d83a3d71bc
2585 changed files with 803492 additions and 45 deletions

View File

@@ -0,0 +1,27 @@
<OBSOLETE>
All assembler in this directory is just a version of the file
crypto/bn/bn_asm.c.
Quite a few of these files are just the assembler output from gcc, since on
quite a few machines that output is two times faster than what the system
compiler produces.
For the x86, I have hand-written the assembler because of the bad job all
compilers seem to do on it. This normally gives a two-times speedup in the
RSA routines.
For the DEC Alpha, I also hand-wrote the assembler (except the division,
which is just the output from the C compiler pasted onto the end of the file).
On the two Alpha C compilers I had access to, it was not possible to do
64b x 64b -> 128b calculations (both the long and the long long data types
were 64 bits). So the hand assembler gives access to the 128-bit result and
a two-times speedup :-).
There are three versions of assembler for the HP PA-RISC.
pa-risc.s is the original one, which works fine and was generated using gcc :-)
pa-risc2W.s and pa-risc2.s are 64-bit and 32-bit PA-RISC 2.0 implementations
by Chris Ruemmler from HP (with some help from the HP C compiler).
</OBSOLETE>
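The 64b x 64b -> 128b point above is easiest to see in C. Below is a minimal sketch, assuming a modern 64-bit gcc/clang with the unsigned __int128 extension (exactly what the Alpha compilers of the era lacked); the hand-written assembler obtains the same high half with Alpha's umulh instruction. The helper name mul64_128 is ours, for illustration only.

#include <stdint.h>

typedef uint64_t BN_ULONG;

/* Full 64x64->128 multiply: return the low word, store the high word
 * in *hi. This is the primitive the hand-written bn_asm.c replacements
 * provide (mulq + umulh on Alpha). */
static BN_ULONG mul64_128(BN_ULONG a, BN_ULONG b, BN_ULONG *hi)
{
    unsigned __int128 t = (unsigned __int128)a * b;
    *hi = (BN_ULONG)(t >> 64);
    return (BN_ULONG)t;
}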

View File

@@ -0,0 +1,321 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is measured against code
# produced by the vendor compiler instructed to '-tune host', with
# in-line assembler. Other benchmarks improve by 15-20%. To anchor it
# to something else, the code provides approximately the same
# performance per GHz as AMD64. I.e. if you compare a 1GHz 21264 and a
# 2GHz Opteron, you'll observe a ~2x difference.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl $num,$num
mov 0,v0
cmplt $num,4,AT
bne AT,.Lexit
ldq $hi0,0($ap) # ap[0]
s8addq $num,16,AT
ldq $aj,8($ap)
subq sp,AT,sp
ldq $bi,0($bp) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq $n0,0($n0)
and sp,AT,sp
mulq $hi0,$bi,$lo0
ldq $hi1,0($np) # np[0]
umulh $hi0,$bi,$hi0
ldq $nj,8($np)
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov 2,$j
umulh $aj,$bi,$ahi
mov sp,$tp
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
s8addq $j,$np,$nj
.align 4
.L1st:
.set noreorder
ldq $aj,0($aj)
addl $j,1,$j
ldq $nj,0($nj)
lda $tp,8($tp)
addq $alo,$hi0,$lo0
mulq $aj,$bi,$alo
cmpult $lo0,$hi0,AT
addq $nlo,$hi1,$lo1
mulq $nj,$m1,$nlo
addq $ahi,AT,$hi0
cmpult $lo1,$hi1,v0
cmplt $j,$num,$tj
umulh $aj,$bi,$ahi
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
s8addq $j,$np,$nj
stq $lo1,-8($tp)
nop
unop
bne $tj,.L1st
.set reorder
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
stq $lo1,0($tp)
addq $hi1,$hi0,$hi1
cmpult $hi1,$hi0,AT
stq $hi1,8($tp)
stq AT,16($tp)
mov 1,$i
.align 4
.Louter:
s8addq $i,$bp,$bi
ldq $hi0,0($ap)
ldq $aj,8($ap)
ldq $bi,0($bi)
ldq $hi1,0($np)
ldq $nj,8($np)
ldq $tj,0(sp)
mulq $hi0,$bi,$lo0
umulh $hi0,$bi,$hi0
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
mov 2,$j
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov sp,$tp
umulh $aj,$bi,$ahi
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
.align 4
.Linner:
.set noreorder
ldq $tj,8($tp) #L0
nop #U1
ldq $aj,0($aj) #L1
s8addq $j,$np,$nj #U0
ldq $nj,0($nj) #L0
nop #U1
addq $alo,$hi0,$lo0 #L1
lda $tp,8($tp)
mulq $aj,$bi,$alo #U1
cmpult $lo0,$hi0,AT #L0
addq $nlo,$hi1,$lo1 #L1
addl $j,1,$j
mulq $nj,$m1,$nlo #U1
addq $ahi,AT,$hi0 #L0
addq $lo0,$tj,$lo0 #L1
cmpult $lo1,$hi1,v0 #U0
umulh $aj,$bi,$ahi #U1
cmpult $lo0,$tj,AT #L0
addq $lo1,$lo0,$lo1 #L1
addq $nhi,v0,$hi1 #U0
umulh $nj,$m1,$nhi #U1
s8addq $j,$ap,$aj #L0
cmpult $lo1,$lo0,v0 #L1
cmplt $j,$num,$tj #U0 # borrow $tj
addq $hi0,AT,$hi0 #L0
addq $hi1,v0,$hi1 #U1
stq $lo1,-8($tp) #L1
bne $tj,.Linner #U0
.set reorder
ldq $tj,8($tp)
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
ldq $tj,16($tp)
addq $lo1,$lo0,$j
cmpult $j,$lo0,v0
addq $hi1,v0,$hi1
addq $hi1,$hi0,$lo1
stq $j,0($tp)
cmpult $lo1,$hi0,$hi1
addq $lo1,$tj,$lo1
cmpult $lo1,$tj,AT
addl $i,1,$i
addq $hi1,AT,$hi1
stq $lo1,8($tp)
cmplt $i,$num,$tj # borrow $tj
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,0($tp)
ldq $lo1,0($np)
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
or $hi0,AT,$hi0
stq $lo0,0($rp)
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
lda $tp,8($tp)
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $code;
close STDOUT;
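For readers of the scheduled assembly above, here is the word-by-word Montgomery multiplication it implements, as a compact C sketch: each outer iteration accumulates ap[]*bp[i], adds m*np[] with m chosen so the low word becomes zero, then shifts the accumulator down one word (the schedule that .L1st/.Louter/.Linner pipeline). This is illustrative only; the names bn_mul_mont_ref and u128 are our assumptions, unsigned __int128 and the C99 VLA require a 64-bit gcc/clang, and bn_asm.c carries OpenSSL's own C fallback.

#include <stdint.h>
#include <string.h>

typedef uint64_t BN_ULONG;
typedef unsigned __int128 u128;

/* rp = ap * bp * 2^(-64*num) mod np, with n0 = -np[0]^(-1) mod 2^64. */
static void bn_mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
                            const BN_ULONG *bp, const BN_ULONG *np,
                            BN_ULONG n0, int num)
{
    BN_ULONG tp[num + 1];              /* C99 VLA; tp[num] holds the carry */
    memset(tp, 0, sizeof(tp));

    for (int i = 0; i < num; i++) {
        u128 c = 0;
        for (int j = 0; j < num; j++) {        /* tp += ap[]*bp[i] */
            c += tp[j] + (u128)ap[j] * bp[i];
            tp[j] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        BN_ULONG top = (BN_ULONG)(c >> 64);
        tp[num] = (BN_ULONG)c;

        BN_ULONG m = tp[0] * n0;               /* Montgomery factor */
        c = 0;
        for (int j = 0; j < num; j++) {        /* tp += m*np[]; tp[0] -> 0 */
            c += tp[j] + (u128)m * np[j];
            tp[j] = (BN_ULONG)c;
            c >>= 64;
        }
        c += tp[num];
        tp[num] = (BN_ULONG)c;
        top += (BN_ULONG)(c >> 64);

        memmove(tp, tp + 1, num * sizeof(BN_ULONG));   /* tp >>= 64 bits */
        tp[num] = top;
    }

    /* Final reduction: subtract np once, then keep tp or the difference
     * depending on the borrow, exactly as .Lsub/.Lcopy do above. */
    BN_ULONG borrow = 0;
    for (int i = 0; i < num; i++) {
        u128 d = (u128)tp[i] - np[i] - borrow;
        rp[i] = (BN_ULONG)d;
        borrow = (BN_ULONG)(d >> 64) & 1;
    }
    BN_ULONG keep_tp = 0 - ((tp[num] - borrow) >> 63); /* all-ones if tp < np */
    for (int i = 0; i < num; i++)
        rp[i] = (tp[i] & keep_tp) | (rp[i] & ~keep_tp);
}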

View File

@@ -0,0 +1,201 @@
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.fpu neon
#endif
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov r4,#0
bic r5,r1,#3<<30 @ a1=a&0x3fffffff
str r4,[sp,#0] @ tab[0]=0
add r6,r5,r5 @ a2=a1<<1
str r5,[sp,#4] @ tab[1]=a1
eor r7,r5,r6 @ a1^a2
str r6,[sp,#8] @ tab[2]=a2
mov r8,r5,lsl#2 @ a4=a1<<2
str r7,[sp,#12] @ tab[3]=a1^a2
eor r9,r5,r8 @ a1^a4
str r8,[sp,#16] @ tab[4]=a4
eor r4,r6,r8 @ a2^a4
str r9,[sp,#20] @ tab[5]=a1^a4
eor r7,r7,r8 @ a1^a2^a4
str r4,[sp,#24] @ tab[6]=a2^a4
and r8,r12,r0,lsl#2
str r7,[sp,#28] @ tab[7]=a1^a2^a4
and r9,r12,r0,lsr#1
ldr r5,[sp,r8] @ tab[b & 0x7]
and r8,r12,r0,lsr#4
ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
and r9,r12,r0,lsr#7
ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
eor r5,r5,r7,lsl#3 @ stall
mov r4,r7,lsr#29
ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
and r8,r12,r0,lsr#10
eor r5,r5,r6,lsl#6
eor r4,r4,r6,lsr#26
ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
and r9,r12,r0,lsr#13
eor r5,r5,r7,lsl#9
eor r4,r4,r7,lsr#23
ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
and r8,r12,r0,lsr#16
eor r5,r5,r6,lsl#12
eor r4,r4,r6,lsr#20
ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
and r9,r12,r0,lsr#19
eor r5,r5,r7,lsl#15
eor r4,r4,r7,lsr#17
ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
and r8,r12,r0,lsr#22
eor r5,r5,r6,lsl#18
eor r4,r4,r6,lsr#14
ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
and r9,r12,r0,lsr#25
eor r5,r5,r7,lsl#21
eor r4,r4,r7,lsr#11
ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
tst r1,#1<<30
and r8,r12,r0,lsr#28
eor r5,r5,r6,lsl#24
eor r4,r4,r6,lsr#8
ldr r6,[sp,r8] @ tab[b >> 30 ]
eorne r5,r5,r0,lsl#30
eorne r4,r4,r0,lsr#2
tst r1,#1<<31
eor r5,r5,r7,lsl#27
eor r4,r4,r7,lsr#5
eorne r5,r5,r0,lsl#31
eorne r4,r4,r0,lsr#1
eor r5,r5,r6,lsl#30
eor r4,r4,r6,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
ldr r12, [sp] @ 5th argument
vmov.32 d26, r2, r1
vmov.32 d27, r12, r3
vmov.i64 d28, #0x0000ffffffffffff
vmov.i64 d29, #0x00000000ffffffff
vmov.i64 d30, #0x000000000000ffff
vext.8 d2, d26, d26, #1 @ A1
vmull.p8 q1, d2, d27 @ F = A1*B
vext.8 d0, d27, d27, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d4, d26, d26, #2 @ A2
vmull.p8 q2, d4, d27 @ H = A2*B
vext.8 d16, d27, d27, #2 @ B2
vmull.p8 q8, d26, d16 @ G = A*B2
vext.8 d6, d26, d26, #3 @ A3
veor q1, q1, q0 @ L = E + F
vmull.p8 q3, d6, d27 @ J = A3*B
vext.8 d0, d27, d27, #3 @ B3
veor q2, q2, q8 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3
veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
vand d3, d3, d28
vext.8 d16, d27, d27, #4 @ B4
veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
vand d5, d5, d29
vmull.p8 q8, d26, d16 @ K = A*B4
veor q3, q3, q0 @ N = I + J
veor d2, d2, d3
veor d4, d4, d5
veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
vand d7, d7, d30
vext.8 q1, q1, q1, #15
veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d17, #0
vext.8 q2, q2, q2, #14
veor d6, d6, d7
vmull.p8 q0, d26, d27 @ D = A*B
vext.8 q8, q8, q8, #12
vext.8 q3, q3, q3, #13
veor q1, q1, q2
veor q3, q3, q8
veor q0, q0, q1
veor q0, q0, q3
vst1.32 {q0}, [r0]
bx lr @ bx lr
.align 4
.Lialu:
#endif
stmdb sp!,{r4-r10,lr}
mov r10,r0 @ reassign 1st argument
mov r0,r3 @ r0=b1
ldr r3,[sp,#32] @ load b0
mov r12,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str r5,[r10,#8]
str r4,[r10,#12]
eor r0,r0,r3 @ flip b0 and b1
eor r1,r1,r2 @ flip a0 and a1
eor r3,r3,r0
eor r2,r2,r1
eor r0,r0,r3
eor r1,r1,r2
bl mul_1x1_ialu @ a0·b0
str r5,[r10]
str r4,[r10,#4]
eor r1,r1,r2
eor r0,r0,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
ldmia r10,{r6-r9}
eor r5,r5,r4
eor r4,r4,r7
eor r5,r5,r6
eor r4,r4,r8
eor r5,r5,r9
eor r4,r4,r9
str r4,[r10,#8]
eor r5,r5,r4
add sp,sp,#32 @ destroy tab[8]
str r5,[r10,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 5
.comm OPENSSL_armcap_P,4,4

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's a kind of low-hanging, mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine
# runs in ~45 cycles on a dual-issue core such as Cortex A8, which is
# ~50% faster than compiler-generated code. For ECDH and ECDSA verify
# (but not for ECDSA sign) it means a 25%-45% improvement depending on
# key length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even fewer cycles, ~30, the improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get the NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using the algorithm from the
# paper referenced below, which improves ECDH and ECDSA verify
# benchmarks by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.fpu neon
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
.align 4
.Lialu:
#endif
___
}
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
.comm OPENSSL_armcap_P,4,4
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush
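The windowed table trick in mul_1x1_ialu above reads more clearly in C. Here is a sketch of the same 32x32->64 carry-less (GF(2)) multiply, under hypothetical names, using a 64-bit table for simplicity where the assembler keeps 32-bit entries and recovers the high halves with explicit shifts. tab[i] is the GF(2) product of a's low 30 bits with the 3-bit index i; bits 30 and 31 of a are folded in separately so the table entries cannot overflow.

#include <stdint.h>

static uint64_t gf2_mul_1x1(uint32_t a, uint32_t b)
{
    uint32_t a1 = a & 0x3fffffff;         /* low 30 bits of a */
    uint64_t tab[8], r = 0;

    tab[0] = 0;                  tab[1] = a1;
    tab[2] = (uint64_t)a1 << 1;  tab[3] = tab[2] ^ a1;
    tab[4] = (uint64_t)a1 << 2;  tab[5] = tab[4] ^ a1;
    tab[6] = tab[4] ^ tab[2];    tab[7] = tab[6] ^ a1;

    for (int k = 0; k < 32; k += 3)       /* 3-bit windows of b */
        r ^= tab[(b >> k) & 7] << k;

    if (a & 0x40000000) r ^= (uint64_t)b << 30;  /* fold bit 30 of a */
    if (a & 0x80000000) r ^= (uint64_t)b << 31;  /* fold bit 31 of a */
    return r;
}

bn_GF2m_mul_2x2 then needs only three such 1x1 products -- a1·b1, a0·b0 and (a1^a0)·(b1^b0) -- combined Karatsuba-style (addition in GF(2) is XOR), which is exactly what the three bl mul_1x1_ialu calls and the eor cascade after them compute.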

View File

@@ -0,0 +1,579 @@
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov r0,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov r0,r0,lsl#2 @ rescale r0 for byte count
sub sp,sp,r0 @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub r0,r0,#4 @ "num=num-1"
add r4,r2,r0 @ &bp[num-1]
add r0,sp,r0 @ r0 to point at &tp[num-1]
ldr r8,[r0,#14*4] @ &n0
ldr r2,[r2] @ bp[0]
ldr r5,[r1],#4 @ ap[0],ap++
ldr r6,[r3],#4 @ np[0],np++
ldr r8,[r8] @ *n0
str r4,[r0,#15*4] @ save &bp[num]
umull r10,r11,r5,r2 @ ap[0]*bp[0]
str r8,[r0,#14*4] @ save n0 value
mul r8,r10,r8 @ "tp[0]"*n0
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
mov r4,sp
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne .L1st
adds r12,r12,r11
ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
ldr r2,[r4,#4]! @ *(++bp)
sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
ldr r10,[sp] @ tp[0]
ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
str r4,[r0,#13*4] @ save bp
mul r8,r10,r8
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
mov r4,sp
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adc r11,r11,#0
ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne .Linner
adds r12,r12,r11
mov r14,#0
ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
ldr r7,[r0,#15*4] @ restore &bp[num]
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
bne .Louter
ldr r2,[r0,#12*4] @ pull rp
add r0,r0,#4 @ r0 to point at &tp[num]
sub r5,r0,sp @ "original" num value
mov r4,sp @ "rewind" r4
mov r1,r4 @ "borrow" r1
sub r3,r3,r5 @ "rewind" r3 to &np[0]
subs r7,r7,r7 @ "clear" carry flag
.Lsub: ldr r7,[r4],#4
ldr r6,[r3],#4
sbcs r7,r7,r6 @ tp[j]-np[j]
str r7,[r2],#4 @ rp[j]=
teq r4,r0 @ preserve carry
bne .Lsub
sbcs r14,r14,#0 @ upmost carry
mov r4,sp @ "rewind" r4
sub r2,r2,r5 @ "rewind" r2
and r1,r4,r14
bic r3,r2,r14
orr r1,r1,r3 @ ap=borrow?tp:rp
.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh
str sp,[r4],#4 @ zap tp
str r7,[r2],#4
cmp r4,r0
bne .Lcopy
add sp,r0,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
bx lr @ .word 0xe12fff1e
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
#if __ARM_ARCH__>=7
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub r7,sp,#16
vld1.32 {d28[0]}, [r2,:32]!
sub r7,r7,r5,lsl#4
vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-(
and r7,r7,#-64
vld1.32 {d30[0]}, [r4,:32]
mov sp,r7 @ alloca
veor d8,d8,d8
subs r8,r5,#8
vzip.16 d28,d8
vmull.u32 q6,d28,d0[0]
vmull.u32 q7,d28,d0[1]
vmull.u32 q8,d28,d1[0]
vshl.i64 d10,d13,#16
vmull.u32 q9,d28,d1[1]
vadd.u64 d10,d10,d12
veor d8,d8,d8
vmul.u32 d29,d10,d30
vmull.u32 q10,d28,d2[0]
vld1.32 {d4-d7}, [r3]!
vmull.u32 q11,d28,d2[1]
vmull.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmull.u32 q13,d28,d3[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 q6,d29,d4[0]
sub r9,r5,#1
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {d28[0]}, [r2,:32]!
veor d8,d8,d8
vzip.16 d28,d8
vadd.u64 d12,d12,d10
vmlal.u32 q6,d28,d0[0]
vmlal.u32 q7,d28,d0[1]
vmlal.u32 q8,d28,d1[0]
vshl.i64 d10,d13,#16
vmlal.u32 q9,d28,d1[1]
vadd.u64 d10,d10,d12
veor d8,d8,d8
subs r9,r9,#1
vmul.u32 d29,d10,d30
vmlal.u32 q10,d28,d2[0]
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
vmlal.u32 q6,d29,d4[0]
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
bne .LNEON_outer8
vadd.u64 d12,d12,d10
mov r7,sp
vshr.u64 d10,d12,#16
mov r8,r5
vadd.u64 d13,d13,d10
add r6,sp,#16
vshr.u64 d10,d13,#16
vzip.16 d12,d13
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 q6,d29,d4[0]
vld1.32 {d0-d3}, [r1]!
vmlal.u32 q7,d29,d4[1]
subs r8,r8,#8
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vld1.32 {d4-d5}, [r3]!
vmlal.u32 q11,d29,d6[1]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vmlal.u32 q13,d29,d7[1]
vst1.64 {q8-q9}, [r7,:256]!
vmull.u32 q6,d28,d0[0]
vld1.32 {d6-d7}, [r3]!
vmull.u32 q7,d28,d0[1]
vst1.64 {q10-q11}, [r7,:256]!
vmull.u32 q8,d28,d1[0]
vmull.u32 q9,d28,d1[1]
vst1.64 {q12-q13}, [r7,:256]!
vmull.u32 q10,d28,d2[0]
vmull.u32 q11,d28,d2[1]
vmull.u32 q12,d28,d3[0]
vmull.u32 q13,d28,d3[1]
bne .LNEON_1st
vmlal.u32 q6,d29,d4[0]
add r6,sp,#16
vmlal.u32 q7,d29,d4[1]
sub r1,r1,r5,lsl#2 @ rewind r1
vmlal.u32 q8,d29,d5[0]
vld1.64 {q5}, [sp,:128]
vmlal.u32 q9,d29,d5[1]
sub r9,r5,#1
vmlal.u32 q10,d29,d6[0]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q11,d29,d6[1]
vshr.u64 d10,d10,#16
vld1.64 {q6}, [r6, :128]!
vmlal.u32 q12,d29,d7[0]
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vadd.u64 d10,d10,d11
veor q4,q4,q4
vst1.64 {q12-q13}, [r7,:256]!
vld1.64 {q7-q8}, [r6, :256]!
vst1.64 {q4}, [r7,:128]
vshr.u64 d10,d10,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {d28[0]}, [r2,:32]!
sub r3,r3,r5,lsl#2 @ rewind r3
vld1.32 {d0-d3}, [r1]!
veor d8,d8,d8
mov r7,sp
vzip.16 d28,d8
sub r8,r5,#8
vadd.u64 d12,d12,d10
vmlal.u32 q6,d28,d0[0]
vld1.64 {q9-q10},[r6,:256]!
vmlal.u32 q7,d28,d0[1]
vmlal.u32 q8,d28,d1[0]
vld1.64 {q11-q12},[r6,:256]!
vmlal.u32 q9,d28,d1[1]
vshl.i64 d10,d13,#16
veor d8,d8,d8
vadd.u64 d10,d10,d12
vld1.64 {q13},[r6,:128]!
vmul.u32 d29,d10,d30
vmlal.u32 q10,d28,d2[0]
vld1.32 {d4-d7}, [r3]!
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
.LNEON_inner:
vmlal.u32 q6,d29,d4[0]
vld1.32 {d0-d3}, [r1]!
vmlal.u32 q7,d29,d4[1]
subs r8,r8,#8
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q10,d29,d6[0]
vld1.64 {q6}, [r6, :128]!
vmlal.u32 q11,d29,d6[1]
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vld1.64 {q7-q8}, [r6, :256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vmlal.u32 q6,d28,d0[0]
vld1.64 {q9-q10}, [r6, :256]!
vmlal.u32 q7,d28,d0[1]
vst1.64 {q12-q13}, [r7,:256]!
vmlal.u32 q8,d28,d1[0]
vld1.64 {q11-q12}, [r6, :256]!
vmlal.u32 q9,d28,d1[1]
vld1.32 {d4-d7}, [r3]!
vmlal.u32 q10,d28,d2[0]
vld1.64 {q13}, [r6, :128]!
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vmlal.u32 q13,d28,d3[1]
bne .LNEON_inner
vmlal.u32 q6,d29,d4[0]
add r6,sp,#16
vmlal.u32 q7,d29,d4[1]
sub r1,r1,r5,lsl#2 @ rewind r1
vmlal.u32 q8,d29,d5[0]
vld1.64 {q5}, [sp,:128]
vmlal.u32 q9,d29,d5[1]
subs r9,r9,#1
vmlal.u32 q10,d29,d6[0]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q11,d29,d6[1]
vld1.64 {q6}, [r6, :128]!
vshr.u64 d10,d10,#16
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vld1.64 {q7-q8}, [r6, :256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vadd.u64 d10,d10,d11
vst1.64 {q12-q13}, [r7,:256]!
vshr.u64 d10,d10,#16
bne .LNEON_outer
mov r7,sp
mov r8,r5
.LNEON_tail:
vadd.u64 d12,d12,d10
vld1.64 {q9-q10}, [r6, :256]!
vshr.u64 d10,d12,#16
vadd.u64 d13,d13,d10
vld1.64 {q11-q12}, [r6, :256]!
vshr.u64 d10,d13,#16
vld1.64 {q13}, [r6, :128]!
vzip.16 d12,d13
.LNEON_tail2:
vadd.u64 d14,d14,d10
vst1.32 {d12[0]}, [r7, :32]!
vshr.u64 d10,d14,#16
vadd.u64 d15,d15,d10
vshr.u64 d10,d15,#16
vzip.16 d14,d15
vadd.u64 d16,d16,d10
vst1.32 {d14[0]}, [r7, :32]!
vshr.u64 d10,d16,#16
vadd.u64 d17,d17,d10
vshr.u64 d10,d17,#16
vzip.16 d16,d17
vadd.u64 d18,d18,d10
vst1.32 {d16[0]}, [r7, :32]!
vshr.u64 d10,d18,#16
vadd.u64 d19,d19,d10
vshr.u64 d10,d19,#16
vzip.16 d18,d19
vadd.u64 d20,d20,d10
vst1.32 {d18[0]}, [r7, :32]!
vshr.u64 d10,d20,#16
vadd.u64 d21,d21,d10
vshr.u64 d10,d21,#16
vzip.16 d20,d21
vadd.u64 d22,d22,d10
vst1.32 {d20[0]}, [r7, :32]!
vshr.u64 d10,d22,#16
vadd.u64 d23,d23,d10
vshr.u64 d10,d23,#16
vzip.16 d22,d23
vadd.u64 d24,d24,d10
vst1.32 {d22[0]}, [r7, :32]!
vshr.u64 d10,d24,#16
vadd.u64 d25,d25,d10
vld1.64 {q6}, [r6, :128]!
vshr.u64 d10,d25,#16
vzip.16 d24,d25
vadd.u64 d26,d26,d10
vst1.32 {d24[0]}, [r7, :32]!
vshr.u64 d10,d26,#16
vadd.u64 d27,d27,d10
vld1.64 {q7-q8}, [r6, :256]!
vshr.u64 d10,d27,#16
vzip.16 d26,d27
subs r8,r8,#8
vst1.32 {d26[0]}, [r7, :32]!
bne .LNEON_tail
vst1.32 {d10[0]}, [r7, :32] @ top-most bit
sub r3,r3,r5,lsl#2 @ rewind r3
subs r1,sp,#0 @ clear carry flag
add r2,sp,r5,lsl#2
.LNEON_sub:
ldmia r1!, {r4-r7}
ldmia r3!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq r1,r2 @ preserves carry
stmia r0!, {r8-r11}
bne .LNEON_sub
ldr r10, [r1] @ load top-most bit
veor q0,q0,q0
sub r11,r2,sp @ this is num*4
veor q1,q1,q1
mov r1,sp
sub r0,r0,r11 @ rewind r0
mov r3,r2 @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia r1!, {r4-r7}
ldmia r0, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r11,r7
ldmia r1, {r4-r7}
stmia r0!, {r8-r11}
sub r1,r1,#16
ldmia r0, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [r1,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r11,r7
teq r1,r2 @ preserves carry
stmia r0!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
bx lr @ .word 0xe12fff1e
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#if __ARM_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
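The entry sequence above dispatches between the two paths at run time: the NEON routine is taken only when num is a multiple of 8 (tst ip,#7) and bit 0 of OPENSSL_armcap_P advertises NEON. A sketch of that dispatch in C, with hypothetical names for the two internal routines (the real code reaches .Lialu by a branch rather than a call):

#include <stdint.h>

extern unsigned int OPENSSL_armcap_P;    /* runtime capability bits */

/* Hypothetical stand-ins for the integer path (.Lialu) and the NEON
 * path (bn_mul8x_mont_neon) in the assembler above. */
int bn_mul_mont_ialu(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                     const uint32_t *np, const uint32_t *n0, int num);
int bn_mul8x_mont_neon(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                       const uint32_t *np, const uint32_t *n0, int num);

int bn_mul_mont(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                const uint32_t *np, const uint32_t *n0, int num)
{
    if ((num & 7) == 0 && (OPENSSL_armcap_P & 1))  /* num % 8 == 0, NEON */
        return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
    if (num < 2)                                   /* cmp ip,#2; blt .Labrt */
        return 0;
    return bn_mul_mont_ialu(rp, ap, bp, np, n0, num);
}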

View File

@@ -0,0 +1,675 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have the
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as porting is
# exclusively about decorations; the ABI and instruction syntax are
# identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON code path
# is nevertheless chosen for execution on all NEON-capable processors,
# because the gain on the others outweighs the marginal loss on
# Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
bne .Louter
ldr $rp,[$_rp] @ pull rp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmull.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0
vmull.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmlal.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
add $tinptr,sp,#16
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.32 {$N0-$N1}, [$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero
sub $inner,$num,#8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
vld1.64 {$A7xB},[$tinptr,:128]!
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
.LNEON_inner:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.64 {$A7xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vmlal.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_inner
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
subs $outer,$outer,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,$temp,#16
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vshr.u64 $temp,$temp,#16
bne .LNEON_outer
mov $toutptr,sp
mov $inner,$num
.LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vld1.64 {$A7xB}, [$tinptr, :128]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
.LNEON_tail2:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
veor q0,q0,q0
sub r11,$bptr,sp @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
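One detail worth calling out in both integer paths above is the branch-free result selection after .Lsub: rather than branching on the borrow, the code turns it into an all-ones/all-zero mask and picks the copy source with and/bic/orr (ap = borrow ? tp : rp). A minimal C sketch of that pointer select, assuming 32-bit words as on ARM and a borrow already normalized to 0 or 1 (in the assembler, sbcs r14,r14,#0 leaves 0 or 0xffffffff in r14, so the mask comes for free):

#include <stdint.h>

/* Select the source to copy from without a data-dependent branch:
 * tp (the unreduced value) if the subtraction borrowed, otherwise rp
 * (the reduced value already written by the .Lsub loop). */
static const uint32_t *select_src(const uint32_t *tp, const uint32_t *rp,
                                  uint32_t borrow /* 0 or 1 */)
{
    uintptr_t mask = (uintptr_t)0 - borrow;        /* all-ones iff borrow */
    return (const uint32_t *)(((uintptr_t)tp & mask) |   /* and        */
                              ((uintptr_t)rp & ~mask));  /* bic + orr  */
}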

File diff suppressed because it is too large

View File

@@ -0,0 +1,774 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
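# Both the SSE2 and the integer loop above implement the same word-level
# semantics, which are easier to audit against a C reference. Below is a
# sketch (the _ref suffix is ours): r[i] += a[i]*w with carry propagation,
# where a 64-bit accumulator stands in for the edx:eax pair of the mul
# path and the mm1 carry of the SSE2 path.
#
# #include <stdint.h>
#
# uint32_t bn_mul_add_words_ref(uint32_t *r, const uint32_t *a,
#                               int num, uint32_t w)
# {
#     uint64_t carry = 0;
#     for (int i = 0; i < num; i++) {
#         carry += (uint64_t)a[i] * w + r[i];  /* cannot overflow 64 bits */
#         r[i] = (uint32_t)carry;              /* low half back to r[i]   */
#         carry >>= 32;                        /* high half rides forward */
#     }
#     return (uint32_t)carry;                  /* final carry word        */
# }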
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_part_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP(0,$a,"",0)); # *a
&mov($tmp2,&DWP(0,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP(0,$r,"",0),$tmp1); # *r
&add($a, 4);
&add($b, 4);
&add($r, 4);
&dec($num) if ($i != 6);
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
&cmp(&wparam(4),0);
&je(&label("pw_end"));
&mov($num,&wparam(4)); # get dl
&cmp($num,0);
&je(&label("pw_end"));
&jge(&label("pw_pos"));
&comment("pw_neg");
&mov($tmp2,0);
&sub($tmp2,$num);
&mov($num,$tmp2);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_neg_finish"));
&set_label("pw_neg_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl<0 Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("pw_end")) if ($i != 6);
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&set_label("pw_pos_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl>0 Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_tail_nc".$i));
&dec($num) if ($i != 6);
&jz(&label("pw_end")) if ($i != 6);
}
&mov($c,1);
&jmp(&label("pw_end"));
&set_label("pw_nc_loop",0);
for ($i=0; $i<8; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_tail_nc".$i,0);
&dec($num) if ($i != 6);
&jz(&label("pw_nc_end")) if ($i != 6);
}
&set_label("pw_nc_end",0);
&mov($c,0);
&set_label("pw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
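bn_sub_part_words handles operands of different lengths: the common num words are subtracted first, then the dl surplus words of either b (the pw_neg path) or a (the pw_pos path) are folded through the borrow. A hedged C sketch of that control flow, inferred from the assembler above (illustrative name; the pw_nc fast paths are just the borrow-free specialization of the dl>0 loop):

#include <stdint.h>

static uint32_t bn_sub_part_words_ref(uint32_t *r, const uint32_t *a,
                                      const uint32_t *b, int num, int dl)
{
    uint32_t borrow = 0;
    int i;
    for (i = 0; i < num; i++) {              /* common part: a - b */
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r[i] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    if (dl < 0)                              /* b has -dl extra words */
        for (; dl < 0; dl++, i++) {
            uint64_t d = (uint64_t)0 - b[i] - borrow;
            r[i] = (uint32_t)d;
            borrow = (uint32_t)((d >> 32) & 1);
        }
    else                                     /* a has dl extra words */
        for (; dl > 0; dl--, i++) {
            uint64_t d = (uint64_t)a[i] - borrow;
            r[i] = (uint32_t)d;
            borrow = (uint32_t)((d >> 32) & 1);
        }
    return borrow;
}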

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,287 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
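All three helpers accumulate a 64-bit product into the rotating three-word accumulator (c0,c1,c2); sqr_add_c2 additionally doubles the product, sending the bit shifted out of the top straight into c2 before accumulating, exactly as its add/adc/adc then add/adc/adc sequence does. A C sketch of that doubling step (illustrative name):

#include <stdint.h>

/* (c0,c1,c2) += 2 * ai * aj, for the off-diagonal squaring terms */
static void sqr_add_c2_ref(uint32_t ai, uint32_t aj,
                           uint32_t *c0, uint32_t *c1, uint32_t *c2)
{
    uint64_t t = (uint64_t)ai * aj;
    *c2 += (uint32_t)(t >> 63);         /* bit lost by the doubling */
    t <<= 1;
    uint64_t s = (uint64_t)*c0 + (uint32_t)t;
    *c0 = (uint32_t)s;
    s = (uint64_t)*c1 + (uint32_t)(t >> 32) + (s >> 32);
    *c1 = (uint32_t)s;
    *c2 += (uint32_t)(s >> 32);
}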
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2); # all assigned below; do not consume @_ again
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
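The $as/$ae/$bs/$be bookkeeping in both generators walks the products in output-word order: word k collects every a[i]*b[j] with i+j == k, after which the accumulator rotates. A self-contained C sketch of that ordering for the 4-word case (illustrative name, not the generated routine's ABI):

#include <stdint.h>

/* r[0..7] = a[0..3] * b[0..3], one output word per outer iteration */
static void comba4_ref(uint32_t r[8], const uint32_t a[4],
                       const uint32_t b[4])
{
    uint32_t c0 = 0, c1 = 0, c2 = 0;
    for (int k = 0; k < 7; k++) {
        for (int i = (k < 4 ? k : 3); i >= 0 && k - i < 4; i--) {
            uint64_t t = (uint64_t)a[i] * b[k - i];
            uint64_t s = (uint64_t)c0 + (uint32_t)t;
            c0 = (uint32_t)s;
            s = (uint64_t)c1 + (uint32_t)(t >> 32) + (s >> 32);
            c1 = (uint32_t)s;
            c2 += (uint32_t)(s >> 32);
        }
        r[k] = c0;                  /* emit word k ... */
        c0 = c1; c1 = c2; c2 = 0;   /* ... and rotate, as the
                                       ($c0,$c1,$c2) swap does */
    }
    r[7] = c0;
}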


@@ -0,0 +1,851 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As can be seen, RSA sign performance improves by 130-30%, less for
# longer keys, while verify improves by 74-13%. DSA performance
# improves by 115-30%.
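The zero-padding of short vectors described above can be pictured in a few lines of C; this is only an illustration of the idea, not code from the module (the name is hypothetical):

#include <stdint.h>

/* widen a num-limb vector to the fixed 8-limb (512-bit) width that
   the bn_mul_mont_8 core loop always processes */
static void pad_to_8(uint64_t dst[8], const uint64_t *src, int num)
{
    int i;
    for (i = 0; i < num && i < 8; i++) dst[i] = src[i];
    for (; i < 8; i++)                 dst[i] = 0;
}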
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

File diff suppressed because it is too large


@@ -0,0 +1,284 @@
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
lw $8,16($29)
lw $9,20($29)
slt $1,$9,4
bnez $1,1f
li $2,0
slt $1,$9,17 # on in-order CPU
bnez $1,bn_mul_mont_internal
nop
1: jr $31
li $4,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $30,14*4,$31
.mask 0x40000000|16711680,-4
sub $29,14*4
sw $30,(14-1)*4($29)
sw $23,(14-2)*4($29)
sw $22,(14-3)*4($29)
sw $21,(14-4)*4($29)
sw $20,(14-5)*4($29)
sw $19,(14-6)*4($29)
sw $18,(14-7)*4($29)
sw $17,(14-8)*4($29)
sw $16,(14-9)*4($29)
move $30,$29
.set reorder
lw $8,0($8)
lw $13,0($6) # bp[0]
lw $12,0($5) # ap[0]
lw $14,0($7) # np[0]
sub $29,2*4 # place for two extra words
sll $9,2
li $1,-4096
sub $29,$9
and $29,$1
multu $12,$13
lw $16,4($5)
lw $18,4($7)
mflo $10
mfhi $11
multu $10,$8
mflo $23
multu $16,$13
mflo $16
mfhi $17
multu $14,$23
mflo $24
mfhi $25
multu $18,$23
addu $24,$10
sltu $1,$24,$10
addu $25,$1
mflo $18
mfhi $19
move $15,$29
li $22,2*4
.align 4
.L1st:
.set noreorder
add $12,$5,$22
add $14,$7,$22
lw $12,($12)
lw $14,($14)
multu $12,$13
addu $10,$16,$11
addu $24,$18,$25
sltu $1,$10,$11
sltu $2,$24,$25
addu $11,$17,$1
addu $25,$19,$2
mflo $16
mfhi $17
addu $24,$10
sltu $1,$24,$10
multu $14,$23
addu $25,$1
addu $22,4
sw $24,($15)
sltu $2,$22,$9
mflo $18
mfhi $19
bnez $2,.L1st
add $15,4
.set reorder
addu $10,$16,$11
sltu $1,$10,$11
addu $11,$17,$1
addu $24,$18,$25
sltu $2,$24,$25
addu $25,$19,$2
addu $24,$10
sltu $1,$24,$10
addu $25,$1
sw $24,($15)
addu $25,$11
sltu $1,$25,$11
sw $25,4($15)
sw $1,2*4($15)
li $21,4
.align 4
.Louter:
add $13,$6,$21
lw $13,($13)
lw $12,($5)
lw $16,4($5)
lw $20,($29)
multu $12,$13
lw $14,($7)
lw $18,4($7)
mflo $10
mfhi $11
addu $10,$20
multu $10,$8
sltu $1,$10,$20
addu $11,$1
mflo $23
multu $16,$13
mflo $16
mfhi $17
multu $14,$23
mflo $24
mfhi $25
multu $18,$23
addu $24,$10
sltu $1,$24,$10
addu $25,$1
mflo $18
mfhi $19
move $15,$29
li $22,2*4
lw $20,4($15)
.align 4
.Linner:
.set noreorder
add $12,$5,$22
add $14,$7,$22
lw $12,($12)
lw $14,($14)
multu $12,$13
addu $10,$16,$11
addu $24,$18,$25
sltu $1,$10,$11
sltu $2,$24,$25
addu $11,$17,$1
addu $25,$19,$2
mflo $16
mfhi $17
addu $10,$20
addu $22,4
multu $14,$23
sltu $1,$10,$20
addu $24,$10
addu $11,$1
sltu $2,$24,$10
lw $20,2*4($15)
addu $25,$2
sltu $1,$22,$9
mflo $18
mfhi $19
sw $24,($15)
bnez $1,.Linner
add $15,4
.set reorder
addu $10,$16,$11
sltu $1,$10,$11
addu $11,$17,$1
addu $10,$20
sltu $2,$10,$20
addu $11,$2
lw $20,2*4($15)
addu $24,$18,$25
sltu $1,$24,$25
addu $25,$19,$1
addu $24,$10
sltu $2,$24,$10
addu $25,$2
sw $24,($15)
addu $24,$25,$11
sltu $25,$24,$11
addu $24,$20
sltu $1,$24,$20
addu $25,$1
sw $24,4($15)
sw $25,2*4($15)
addu $21,4
sltu $2,$21,$9
bnez $2,.Louter
.set noreorder
add $20,$29,$9 # &tp[num]
move $15,$29
move $5,$29
li $11,0 # clear borrow bit
.align 4
.Lsub: lw $10,($15)
lw $24,($7)
add $15,4
add $7,4
subu $24,$10,$24 # tp[i]-np[i]
sgtu $1,$24,$10
subu $10,$24,$11
sgtu $11,$10,$24
sw $10,($4)
or $11,$1
sltu $1,$15,$20
bnez $1,.Lsub
add $4,4
subu $11,$25,$11 # handle upmost overflow bit
move $15,$29
sub $4,$9 # restore rp
not $25,$11
and $5,$11,$29
and $6,$25,$4
or $5,$5,$6 # ap=borrow?tp:rp
.align 4
.Lcopy: lw $12,($5)
add $5,4
sw $0,($15)
add $15,4
sltu $1,$15,$20
sw $12,($4)
bnez $1,.Lcopy
add $4,4
li $4,1
li $2,1
.set noreorder
move $29,$30
lw $30,(14-1)*4($29)
lw $23,(14-2)*4($29)
lw $22,(14-3)*4($29)
lw $21,(14-4)*4($29)
lw $20,(14-5)*4($29)
lw $19,(14-6)*4($29)
lw $18,(14-7)*4($29)
lw $17,(14-8)*4($29)
lw $16,(14-9)*4($29)
jr $31
add $29,14*4
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro@openssl.org>"
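The .Lsub/.Lcopy epilogue above is the standard Montgomery final step: compute tp - np with a running borrow, fold the top overflow word into the borrow, then select between tp and the difference without branching (the and/not/or trio behind "ap=borrow?tp:rp"). A hedged C sketch (illustrative name, 32-bit words as in this flavour):

#include <stdint.h>

static void mont_final_sub(uint32_t *rp, uint32_t *tp,
                           const uint32_t *np, int num, uint32_t top)
{
    uint32_t borrow = 0;
    for (int i = 0; i < num; i++) {
        uint64_t d = (uint64_t)tp[i] - np[i] - borrow;
        rp[i] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    /* for valid Montgomery inputs top - borrow is 0 (keep rp, i.e. the
       difference) or all-ones (keep tp) */
    uint32_t mask = top - borrow;
    for (int i = 0; i < num; i++) {
        rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
        tp[i] = 0;                 /* scratch is wiped, as .Lcopy does */
    }
}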


@@ -0,0 +1,426 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
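Before the generated code, it may help to see the whole contract in portable form. The following is a minimal C99 sketch of the interleaved word-by-word Montgomery multiplication that .L1st/.Linner/.Louter implement; it is not OpenSSL's source, the name is illustrative, and 32-bit limbs are assumed:

#include <stdint.h>

static int bn_mul_mont_ref(uint32_t *rp, const uint32_t *ap,
                           const uint32_t *bp, const uint32_t *np,
                           const uint32_t *n0p, int num)
{
    uint32_t tp[num + 1];              /* scratch, tp[num] = top carry */
    for (int i = 0; i <= num; i++) tp[i] = 0;

    for (int i = 0; i < num; i++) {
        uint32_t bi = bp[i];
        uint64_t t = (uint64_t)ap[0] * bi + tp[0];
        uint32_t m1 = (uint32_t)t * n0p[0];               /* mod 2^32 */
        uint64_t u = (uint64_t)np[0] * m1 + (uint32_t)t;  /* low half = 0 */
        uint64_t ca = t >> 32, cn = u >> 32;   /* the two carry chains */
        for (int j = 1; j < num; j++) {
            t = (uint64_t)ap[j] * bi + tp[j] + ca;
            ca = t >> 32;
            u = (uint64_t)np[j] * m1 + (uint32_t)t + cn;
            cn = u >> 32;
            tp[j - 1] = (uint32_t)u;
        }
        u = (uint64_t)tp[num] + ca + cn;
        tp[num - 1] = (uint32_t)u;
        tp[num] = (uint32_t)(u >> 32);
    }

    uint32_t borrow = 0;          /* conditional final subtraction */
    for (int j = 0; j < num; j++) {
        uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
        rp[j] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }
    uint32_t mask = tp[num] - borrow;       /* 0 or all-ones */
    for (int j = 0; j < num; j++)
        rp[j] = (tp[j] & mask) | (rp[j] & ~mask);
    return 1;                               /* signal "handled" */
}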
$code=<<___;
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU $aj,$bi
$LD $alo,$BNSZ($ap)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$MULTU $lo0,$n0
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU $nj,$m1
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo $nlo
mfhi $nhi
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $alo,$BNSZ($ap)
$LD $tj,($sp)
$MULTU $aj,$bi
$LD $nj,($np)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$ADDU $lo0,$tj
$MULTU $lo0,$n0
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU $nj,$m1
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo $nlo
mfhi $nhi
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,$sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: $LD $aj,($ap)
$PTR_ADD $ap,$BNSZ
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large


@@ -0,0 +1,327 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys. While 512-bit
# RSA private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, while longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over compiler
# generated code.
#
# The module targets N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. is likely to require adaptation for other OSes.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";
$FRAME=8*(2+8);
$code=<<___;
#include <asm.h>
#include <regdef.h>
.text
.set noat
.set reorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
.set noreorder
PTR_SUB sp,64
move $fp,sp
.frame $fp,64,ra
slt AT,$num,4
li v0,0
beqzl AT,.Lproceed
nop
jr ra
PTR_ADD sp,$fp,64
.set reorder
.align 5
.Lproceed:
ld $n0,0($n0)
ld $bi,0($bp) # bp[0]
ld $aj,0($ap) # ap[0]
ld $nj,0($np) # np[0]
PTR_SUB sp,16 # place for two extra words
sll $num,3
li AT,-4096
PTR_SUB sp,$num
and sp,AT
sd s0,0($fp)
sd s1,8($fp)
sd s2,16($fp)
sd s3,24($fp)
sd s4,32($fp)
sd s5,40($fp)
sd s6,48($fp)
sd s7,56($fp)
dmultu $aj,$bi
ld $alo,8($ap)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
dmultu $lo0,$n0
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
.align 4
.L1st:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
dmultu $nj,$m1
daddu $hi1,AT
addu $j,8
sd $lo1,($tp)
sltu s7,$j,$num
mflo $nlo
mfhi $nhi
bnez s7,.L1st
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo1,$nlo,$hi1
sltu s7,$lo1,$hi1
daddu $hi1,$nhi,s7
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
sd $lo1,($tp)
daddu $hi1,$hi0
sltu AT,$hi1,$hi0
sd $hi1,8($tp)
sd AT,16($tp)
li $i,8
.align 4
.Louter:
PTR_ADD $bi,$bp,$i
ld $bi,($bi)
ld $aj,($ap)
ld $alo,8($ap)
ld $tj,(sp)
dmultu $aj,$bi
ld $nj,($np)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
daddu $lo0,$tj
dmultu $lo0,$n0
sltu AT,$lo0,$tj
daddu $hi0,AT
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
ld $tj,8($tp)
.align 4
.Linner:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo0,$tj
addu $j,8
dmultu $nj,$m1
sltu AT,$lo0,$tj
daddu $lo1,$lo0
daddu $hi0,AT
sltu s7,$lo1,$lo0
ld $tj,16($tp)
daddu $hi1,s7
sltu AT,$j,$num
mflo $nlo
mfhi $nhi
sd $lo1,($tp)
bnez AT,.Linner
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo0,$tj
sltu s7,$lo0,$tj
daddu $hi0,s7
ld $tj,16($tp)
daddu $lo1,$nlo,$hi1
sltu AT,$lo1,$hi1
daddu $hi1,$nhi,AT
daddu $lo1,$lo0
sltu s7,$lo1,$lo0
daddu $hi1,s7
sd $lo1,($tp)
daddu $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
daddu $lo1,$tj
sltu AT,$lo1,$tj
daddu $hi1,AT
sd $lo1,8($tp)
sd $hi1,16($tp)
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
ld s3,24($fp)
ld s4,32($fp)
ld s5,40($fp)
ld s6,48($fp)
ld s7,56($fp)
li v0,1
jr ra
PTR_ADD sp,$fp,64
.set reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,995 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal with respect to instruction set capabilities. Fair comparison
# with the vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives the compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. The latter features carry-free
# 64-bit additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In a 64-bit build naturally only the 2.0 code
# path is assembled. In a 32-bit application context both code paths are
# assembled, a PA-RISC 2.0 CPU is detected at run-time and the proper
# path is taken automatically. Also, in a 32-bit build the module
# imposes a couple of limitations: vector lengths have to be even and
# vector addresses have to be 64-bit aligned. Normally neither is a
# problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
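The "halves in reverse order" trick described above boils down to a big-endian layout observation: in a 64-bit BN_ULONG stored big-endian, the low 32 bits live at the higher address. A hedged C illustration (hypothetical helper names; assumes a big-endian host, as PA-RISC is):

#include <stdint.h>

/* pick the 32-bit halves of the 64-bit limb v[i] on a big-endian host */
static inline uint32_t limb_lo32(const uint64_t *v, int i)
{
    return ((const uint32_t *)v)[2 * i + 1];   /* low half at offset +4 */
}
static inline uint32_t limb_hi32(const uint64_t *v, int i)
{
    return ((const uint32_t *)v)[2 * i];
}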
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$BN_SZ =$SIZE_T;
} else {
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$BN_SZ =$SIZE_T;
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
$BN_SZ=8;
$LEVEL="2.0";
last;
}
}
close CONF;
}
}
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32; # local variables
$tp="%r31";
$ti1="%r29";
$ti0="%r28";
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22"; # passed through stack in 32-bit
$num="%r21"; # passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";
$fp="%r3";
$hi1="%r2";
$hi0="%r1";
$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
$code.=<<___;
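	; branch-free select: $hi1 serves as mask, np = borrow ? tp : rp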
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $sub = sub {
my ($mod,$args) = @_;
my $orig = "sub$mod\t$args";
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
$opcode|=(1<<10); # e1
$opcode|=(1<<8); # e2
$opcode|=(1<<5); # d
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# flip word order in 64-bit mode...
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
# assemble 2.0 instructions in 32-bit mode...
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
s/\bbv\b/bve/gm if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;


@@ -0,0 +1,334 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter.
# It won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
$flavour = shift;
if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
$LDX= "lwzx"; # load indexed
$ST= "stw"; # store
$STU= "stwu"; # store and update
$STX= "stwx"; # store indexed
$STUX= "stwux"; # store indexed and update
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
$LDU= "ldu"; # load and update
$LDX= "ldx"; # load indexed
$ST= "std"; # store
$STU= "stdu"; # store and update
$STX= "stdx"; # store indexed
$STUX= "stdux"; # store indexed and update
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
$rp="r3"; $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9"; # $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
$code=<<___;
.machine "any"
.text
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
addze $hi1,$hi1
$UMULL $nlo,$nj,$m1 ; np[1]*m1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
L1st:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LDX $nj,$np,$j ; np[j]
addze $hi0,$ahi
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
addc $lo1,$nlo,$hi1
$UMULH $ahi,$aj,$m0
addze $hi1,$nhi
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
$UMULH $nhi,$nj,$m1
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $j,$j,$BNSZ ; j++
addi $tp,$tp,$BNSZ ; tp++
bdnz- L1st
;L1st
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
li $ovf,0
addc $hi1,$hi1,$hi0
addze $ovf,$ovf ; upmost overflow bit
$ST $hi1,$BNSZ($tp)
li $i,$BNSZ
.align 4
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi0,$hi0
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
$UMULL $nlo,$nj,$m1 ; np[1]*m1
addze $hi1,$hi1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
Linner:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LD $tj,$BNSZ($tp) ; tp[j]
addze $hi0,$ahi
$LDX $nj,$np,$j ; np[j]
addc $lo1,$nlo,$hi1
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi1,$nhi
$UMULH $ahi,$aj,$m0
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addze $hi0,$hi0
$UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addi $j,$j,$BNSZ ; j++
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $tp,$tp,$BNSZ ; tp++
bdnz- Linner
;Linner
$LD $tj,$BNSZ($tp) ; tp[j]
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
addze $hi0,$hi0
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
li $ovf,0
adde $hi1,$hi1,$hi0
addze $ovf,$ovf
$ST $hi1,$BNSZ($tp)
;
slwi $tj,$num,`log($BNSZ)/log(2)`
$UCMP $i,$tj
addi $i,$i,$BNSZ
ble- Louter
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz- Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.byte 0,12,4,0,0x80,12,6,0
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,221 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, hence the
# effort. And indeed, the module delivers 55%-90%(*) improvement on
# the heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for a 64-bit build. In the 32-bit "highgprs" case the
# improvement is even higher, for example on z990 it was measured at
# 80%-150%. ECDSA sign is a modest 9%-12% faster. Keep in mind that
# these coefficients are not the ones for bn_GF2m_mul_2x2 itself, as
# not all CPU time is burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
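#
# For reference, what _mul_1x1 computes is a 64x64->128-bit carry-less
# (i.e. GF(2)[x]) product. A bit-serial C model (just a sketch; the
# code below uses a 4-bit window table instead):
#
#	uint64_t mul_1x1_ref(uint64_t *hi, uint64_t a, uint64_t b)
#	{
#		uint64_t h = 0, l = 0;
#		int i;
#		for (i = 0; i < 64; i++)
#			if ((b >> i) & 1) {
#				l ^= a << i;
#				if (i) h ^= a >> (64 - i);
#			}
#		*hi = h;
#		return l;
#	}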
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
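# Karatsuba over GF(2), a sketch: with a = a1·2^64 ^ a0 and
# b = b1·2^64 ^ b0,
#	a·b = a1·b1·2^128 ^ a0·b0
#	    ^ (a1·b1 ^ a0·b0 ^ (a1^a0)·(b1^b0))·2^64
# hence the three _mul_1x1 calls below and the xgr combination step.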
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;


@@ -0,0 +1,277 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers]; at least a hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on the CPU's ability to pipe-line instructions and have
# several of them "in-flight" at the same time. I mean, while other
# methods, for example Karatsuba, aim to minimize the amount of
# multiplications at the cost of an increase in other operations,
# bn_mul_mont aims to neatly "overlap" multiplications and the other
# operations [and on most platforms even to minimize the amount of the
# other operations, in particular references to memory]. But it's
# possible to improve this module's performance by implementing a
# dedicated squaring code-path and possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to
# use 64-bit instructions and achieve "64-bit" performance even in a
# 31-bit legacy application context. The feature is not specific to
# any particular processor, as long as it's a "z-CPU". The latter
# implies that the code remains z/Architecture specific. Compatibility
# with 32-bit BN_ULONG is achieved by swapping words after 64-bit
# loads; follow the _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
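#
# The _dswap placeholder used below is a no-op in 64-bit build; in the
# "highgprs" build it expands to "rllg reg,reg,32", i.e. (a C sketch)
#	x = (x << 32) | (x >> 32);	/* swap 32-bit halves */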
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
# int bn_mul_mont(
$rp="%r2"; # BN_ULONG *rp,
$ap="%r3"; # const BN_ULONG *ap,
$bp="%r4"; # const BN_ULONG *bp,
$np="%r5"; # const BN_ULONG *np,
$n0="%r6"; # const BN_ULONG *n0,
#$num="160(%r15)" # int num);
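#
# For orientation, a plain C model of the word-serial Montgomery
# multiplication implemented below (a sketch only: it mirrors the
# algorithm rather than the sources, assumes a 128-bit type, and uses
# a non-constant-time final subtraction):
#
#	#include <stdint.h>
#	typedef uint64_t BN_ULONG;
#	typedef unsigned __int128 u128;
#
#	int mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
#	             const BN_ULONG *bp, const BN_ULONG *np,
#	             BN_ULONG n0, int num)
#	{
#		BN_ULONG t[num + 2], m;	/* t[num] plus overflow word */
#		u128 c;
#		int i, j;
#
#		for (j = 0; j < num + 2; j++) t[j] = 0;
#		for (i = 0; i < num; i++) {
#			c = 0;				/* t += ap[]*bp[i] */
#			for (j = 0; j < num; j++) {
#				c += (u128)ap[j] * bp[i] + t[j];
#				t[j] = (BN_ULONG)c;
#				c >>= 64;
#			}
#			c += t[num];
#			t[num] = (BN_ULONG)c;
#			t[num + 1] = (BN_ULONG)(c >> 64);
#
#			m = t[0] * n0;			/* mod 2^64 */
#			c = ((u128)m * np[0] + t[0]) >> 64;
#			for (j = 1; j < num; j++) {	/* (t+m*np)/2^64 */
#				c += (u128)m * np[j] + t[j];
#				t[j - 1] = (BN_ULONG)c;
#				c >>= 64;
#			}
#			c += t[num];
#			t[num - 1] = (BN_ULONG)c;
#			t[num] = t[num + 1] + (BN_ULONG)(c >> 64);
#		}
#		c = 0;		/* borrow of the final subtraction */
#		for (j = 0; j < num; j++) {
#			c = (u128)t[j] - np[j] - c;
#			rp[j] = (BN_ULONG)c;
#			c = (c >> 64) & 1;
#		}
#		if (!t[num] && c)	/* t < np: keep t */
#			for (j = 0; j < num; j++) rp[j] = t[j];
#		return 1;
#	}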
$bi="%r2"; # zaps rp
$j="%r7";
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$count="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

jni/openssl/crypto/bn/asm/s390x.S (executable file)

@@ -0,0 +1,678 @@
.ident "s390x.S, version 1.1"
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
.text
#define zero %r0
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
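// In C terms the contract is (a sketch, not the portable source):
//	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
//	                          int num, BN_ULONG w)
//	{
//		unsigned __int128 t;
//		BN_ULONG c = 0;
//		while (num-- > 0) {
//			t = (unsigned __int128)(*ap++) * w + *rp + c;
//			*rp++ = (BN_ULONG)t;
//			c = (BN_ULONG)(t >> 64);
//		}
//		return c;	// final carry
//	}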
.globl bn_mul_add_words
.type bn_mul_add_words,@function
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_madd:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
alcgr %r6,zero
alg %r7,0(%r2,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r2,%r1)
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
alcgr %r6,zero
alg %r7,16(%r2,%r1)
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,24(%r2,%r1)
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_madd
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_madd // without touching condition code:-)
.Lend_madd:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_madd:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
alcgr %r6,zero
alg %r7,0(%r2,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_words
.type bn_mul_words,@function
.align 4
bn_mul_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_mul // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_mul
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_mul // without touching condition code:-)
.Lend_mul:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_mul
j .Lend_mul
.size bn_mul_words,.-bn_mul_words
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
.globl bn_sqr_words
.type bn_sqr_words,@function
.align 4
bn_sqr_words:
ltgfr %r4,%r4
bler %r14
stmg %r6,%r7,48(%r15)
srag %r1,%r4,2 // cnt=len/4
jz .Loop1_sqr
.Loop4_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
lg %r7,8(%r3)
mlgr %r6,%r7
stg %r7,16(%r2)
stg %r6,24(%r2)
lg %r7,16(%r3)
mlgr %r6,%r7
stg %r7,32(%r2)
stg %r6,40(%r2)
lg %r7,24(%r3)
mlgr %r6,%r7
stg %r7,48(%r2)
stg %r6,56(%r2)
la %r3,32(%r3)
la %r2,64(%r2)
brct %r1,.Loop4_sqr
lghi %r1,3
nr %r4,%r1 // cnt=len%4
jz .Lend_sqr
.Loop1_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
la %r3,8(%r3)
la %r2,16(%r2)
brct %r4,.Loop1_sqr
.Lend_sqr:
lmg %r6,%r7,48(%r15)
br %r14
.size bn_sqr_words,.-bn_sqr_words
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
.globl bn_div_words
.type bn_div_words,@function
.align 4
bn_div_words:
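	// 128-bit dividend arrives in %r2:%r3 (h:l), divisor in %r4;
	// dlgr leaves the remainder in %r2 and the quotient in %r3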
dlgr %r2,%r4
lgr %r2,%r3
br %r14
.size bn_div_words,.-bn_div_words
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_add_words
.type bn_add_words,@function
.align 4
bn_add_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jz .Loop1_add // carry is incidentally cleared if branch taken
algr %r2,%r2 // clear carry
.Loop4_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
alcg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
alcg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
alcg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_add
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_add // without touching condition code:-)
.Lexit_add:
lghi %r2,0
alcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.Loop1_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_add
j .Lexit_add
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_sub_words
.type bn_sub_words,@function
.align 4
bn_sub_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
slgr %r2,%r2 // clear borrow
.Loop1_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_sub
j .Lexit_sub
.Loop4_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
slbg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
slbg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
slbg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_sub
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_sub // without touching condition code:-)
.Lexit_sub:
lghi %r2,0
slbgr %r2,%r2
lcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.size bn_sub_words,.-bn_sub_words
#define c1 %r1
#define c2 %r5
#define c3 %r8
#define mul_add_c(ai,bi,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,bi*8(%r4); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
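// i.e. a column accumulator: (c3:c2:c1) += ap[ai]*bp[bi], with the
// 128-bit product from mlg and the carries rippled up by the alcgr-s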
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba8
.type bn_mul_comba8,@function
.align 4
bn_mul_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(4,0,c2,c3,c1);
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
mul_add_c(0,4,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(0,5,c3,c1,c2);
mul_add_c(1,4,c3,c1,c2);
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
mul_add_c(4,1,c3,c1,c2);
mul_add_c(5,0,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(6,0,c1,c2,c3);
mul_add_c(5,1,c1,c2,c3);
mul_add_c(4,2,c1,c2,c3);
mul_add_c(3,3,c1,c2,c3);
mul_add_c(2,4,c1,c2,c3);
mul_add_c(1,5,c1,c2,c3);
mul_add_c(0,6,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
mul_add_c(0,7,c2,c3,c1);
mul_add_c(1,6,c2,c3,c1);
mul_add_c(2,5,c2,c3,c1);
mul_add_c(3,4,c2,c3,c1);
mul_add_c(4,3,c2,c3,c1);
mul_add_c(5,2,c2,c3,c1);
mul_add_c(6,1,c2,c3,c1);
mul_add_c(7,0,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
mul_add_c(7,1,c3,c1,c2);
mul_add_c(6,2,c3,c1,c2);
mul_add_c(5,3,c3,c1,c2);
mul_add_c(4,4,c3,c1,c2);
mul_add_c(3,5,c3,c1,c2);
mul_add_c(2,6,c3,c1,c2);
mul_add_c(1,7,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
mul_add_c(2,7,c1,c2,c3);
mul_add_c(3,6,c1,c2,c3);
mul_add_c(4,5,c1,c2,c3);
mul_add_c(5,4,c1,c2,c3);
mul_add_c(6,3,c1,c2,c3);
mul_add_c(7,2,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
mul_add_c(7,3,c2,c3,c1);
mul_add_c(6,4,c2,c3,c1);
mul_add_c(5,5,c2,c3,c1);
mul_add_c(4,6,c2,c3,c1);
mul_add_c(3,7,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
mul_add_c(4,7,c3,c1,c2);
mul_add_c(5,6,c3,c1,c2);
mul_add_c(6,5,c3,c1,c2);
mul_add_c(7,4,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
mul_add_c(7,5,c1,c2,c3);
mul_add_c(6,6,c1,c2,c3);
mul_add_c(5,7,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
mul_add_c(6,7,c2,c3,c1);
mul_add_c(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
mul_add_c(7,7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba8,.-bn_mul_comba8
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba4
.type bn_mul_comba4,@function
.align 4
bn_mul_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
	stg	c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(3,3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
	lmg	%r6,%r8,48(%r15)
br %r14
.size bn_mul_comba4,.-bn_mul_comba4
#define sqr_add_c(ai,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlgr %r6,%r7; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
#define sqr_add_c2(ai,aj,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,aj*8(%r3); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
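// sqr_add_c adds ai^2 once; sqr_add_c2 adds the cross product ai*aj
// twice, accounting for the 2*ai*aj term of the square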
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba8
.type bn_sqr_comba8,@function
.align 4
bn_sqr_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
sqr_add_c2(4,0,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(5,0,c3,c1,c2);
sqr_add_c2(4,1,c3,c1,c2);
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
sqr_add_c2(4,2,c1,c2,c3);
sqr_add_c2(5,1,c1,c2,c3);
sqr_add_c2(6,0,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
sqr_add_c2(7,0,c2,c3,c1);
sqr_add_c2(6,1,c2,c3,c1);
sqr_add_c2(5,2,c2,c3,c1);
sqr_add_c2(4,3,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
sqr_add_c(4,c3,c1,c2);
sqr_add_c2(5,3,c3,c1,c2);
sqr_add_c2(6,2,c3,c1,c2);
sqr_add_c2(7,1,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
sqr_add_c2(7,2,c1,c2,c3);
sqr_add_c2(6,3,c1,c2,c3);
sqr_add_c2(5,4,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
sqr_add_c(5,c2,c3,c1);
sqr_add_c2(6,4,c2,c3,c1);
sqr_add_c2(7,3,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
sqr_add_c2(7,4,c3,c1,c2);
sqr_add_c2(6,5,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
sqr_add_c(6,c1,c2,c3);
sqr_add_c2(7,5,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
sqr_add_c2(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
sqr_add_c(7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba8,.-bn_sqr_comba8
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba4
.type bn_sqr_comba4,@function
.align 4
bn_sqr_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba4,.-bn_sqr_comba4

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff


@@ -0,0 +1,606 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for the undertaken effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe and other VIS-free implementations
# deserve optimized code as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, will simply sink it. Yes,
# T1 is equipped with several integrated RSA/DSA accelerator circuits
# accessible through a kernel driver [only(*)], but having a decent
# user-land software implementation is important too. Finally, reasons
# like the desire to experiment with a dedicated squaring procedure.
# Yes, this module implements one, because it was easiest to draft it
# in SPARCv9 instructions...
# (*) An engine accessing the driver in question is on my TODO list.
# For reference, the accelerator is estimated to give a 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted that
# the 6-10x improvement coefficient does not actually mean something
# extraordinary in terms of absolute [single-threaded] performance,
# as the SPARCv9 instruction set is by all means the least suitable
# for high-performance crypto among 64-bit platforms. The 6-10x
# factor simply places T1 in the same performance domain as, say,
# AMD64 and IA-64. The improvement of RSA verify doesn't appear
# impressive at all, but it's the sign operation which is far more
# critical/interesting.
# You might notice that the inner loops are modulo-scheduled:-) This
# has essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the
# sparcv9a module still has hidden potential [see the TODO list
# there], which is estimated to be larger than 20%...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=128; }
$car0="%o0";
$car1="%o1";
$car2="%o2"; # 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1"; # 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
subc $car2,0,$car2 ! handle upmost overflow bit
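! $car2 is now zero if the difference stored at rp[] is the result,
! or all ones if the unreduced tp[] must be kept; the and/andn/or
! triplet below selects the copy source without a branch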
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.Lcopy:
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
mov 1,%i0
ret
restore
___
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without the dedicated squaring procedure that follows.
########
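# A dedicated pass pays off because only num*(num+1)/2 of the num^2 word
# products in a squaring are distinct: every cross product ap[i]*ap[j],
# i!=j, occurs twice, so it can be computed once and doubled, which is
# what the $sbit juggling below does one bit at a time. A minimal model
# in plain Perl (16-bit words for headroom; illustration only, not used
# by this module):
sub sqr_words_ref {
	my @a = @_;				# little-endian 16-bit words
	my @r = (0) x (2*@a);
	for my $i (0 .. $#a) {
		for my $j ($i .. $#a) {
			my $p = $a[$i]*$a[$j];
			$p += $p if ($j > $i);	# double each cross product
			my $k = $i+$j;		# add at word i+j and
			while ($p) {		# propagate the carry
				$p += $r[$k];
				$r[$k++] = $p & 0xffff;
				$p >>= 16;
			}
		}
	}
	return @r;				# 2*num words of ap[]^2
}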
$sbit="%i2"; # re-use $bp!
$code.=<<___;
.align 32
.Lbn_sqr_mont:
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,882 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.
# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies the following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, it simply
# doesn't give any performance gain.
# TODO:
# - modulo-schedule inner loop for better performance (on an in-order
#   execution core such as UltraSPARC this shall result in a further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];
######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow interleaving of floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in a *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% on
# USIII&IV.
$fname="bn_mul_mont_fpu";
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) {
$bias=2047;
$frame=192;
} else {
$bias=0;
$frame=128; # 96 rounded up to largest known cache-line
}
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bits in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$tp="%l0"; # t[num]
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2"; # to these four vectors as double-precision FP values.
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7"; # 16-bit mask, 0xffff
$n0="%g4"; # reassigned(!) to "64-bit" register
$carry="%i4"; # %i4 reused(!) for a carry bit
# FP register naming chart
#
# ..HILO
# dcba
# --------
# LOa
# LOb
# LOc
# LOd
# HIa
# HIb
# HIc
# HId
# ..a
# ..b
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
$dota="%f24"; $dotb="%f26";
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
save %sp,-$frame-$locals,%sp
cmp $num,4
bl,a,pn %icc,.Lret
clr %i0
andcc $num,1,%g0 ! $num has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl $num,1,$num
sethi %hi(0xffff),$mask
ld [%i4+0],$n0 ! $n0 reassigned, remember?
or $mask,%lo(0xffff),$mask
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
sll $num,3,$num ! num*=8
add %sp,$bias,%o0 ! real top of stack
sll $num,2,%o1
add %o1,$num,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,$bias,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,$bias+$frame+$locals,$tp
add $tp,$num,$ap_l
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
add $ap_l,$num,$ap_h
add $ap_h,$num,$np_l
add $np_l,$num,$np_h
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
add $rp,$num,$rp ! readjust input pointers to point
add $ap,$num,$ap ! at the ends too...
add $bp,$num,$bp
add $np,$num,$np
stx %o7,[%sp+$bias+$frame+48] ! save %asi
sub %g0,$num,$i ! i=-num
sub %g0,$num,$j ! j=-num
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add $np,$j,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+$bias+$frame+0]
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
fxtod $alo,$alo
ldda [%o4+0]%asi,$bb
fxtod $ahi,$ahi
ldda [%o4+6]%asi,$bc
fxtod $nlo,$nlo
ldda [%o4+4]%asi,$bd
fxtod $nhi,$nhi
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fxtod $na,$na
std $ahi,[$ap_h+$j]
fxtod $nb,$nb
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fxtod $nc,$nc
std $nhi,[$np_h+$j]
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
add $j,8,$j
std $nlob,[%sp+$bias+$frame+8]
add $ap,$j,%o4
std $nloc,[%sp+$bias+$frame+16]
add $np,$j,%o5
std $nlod,[%sp+$bias+$frame+24]
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
!and %o0,$mask,%o0
!and %o1,$mask,%o1
!and %o2,$mask,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.L1stskip
std $nlod,[%sp+$bias+$frame+24]
.align 32 ! incidentally already aligned !
.L1st:
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
and %o1,$mask,%o1
and %o2,$mask,%o2
fmuld $ahi,$bb,$ahib
sllx %o1,16,%o1
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
sllx %o2,32,%o2
fmuld $ahi,$bc,$ahic
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
or %o2,%o0,%o0
fmuld $ahi,$bd,$ahid
or %o7,%o0,%o0 ! 64-bit result
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
addcc %g1,%o0,%o0
faddd $dota,$nloa,$nloa
srlx %o3,16,%g1 ! 34-bit carry
faddd $dotb,$nlob,$nlob
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+$bias+$frame+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,$carry
stx %o4,[$tp] ! tp[num-1]=
ba .Louter
add $i,8,$i
.align 32
.Louter:
sub %g0,$num,$j ! j=-num
add %sp,$bias+$frame+$locals,$tp
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[i]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [$tp],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+$bias+$frame+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
ldda [%o4+0]%asi,$bb
ldda [%o4+6]%asi,$bc
ldda [%o4+4]%asi,$bd
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
ldd [$ap_l+$j],$alo ! load a[j] in double format
fxtod $na,$na
ldd [$ap_h+$j],$ahi
fxtod $nb,$nb
ldd [$np_l+$j],$nlo ! load n[j] in double format
fxtod $nc,$nc
ldd [$np_h+$j],$nhi
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
add $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why?
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
ldx [$tp],%o7
faddd $nloc,$nhia,$nloc
addcc %o7,%o0,%o0
! end-of-why?
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.Linnerskip
std $nlod,[%sp+$bias+$frame+24]
ba .Linner
nop
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
faddd $nloc,$nhia,$nloc
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
addcc %o7,%o0,%o0
fdtox $nloc,$nloc
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
addcc $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+$bias+$frame+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+40],%o5
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc $carry,%o4,%o4
stx %o4,[$tp] ! tp[num-1]
mov %g1,$carry
bcs,a %xcc,.+8
add $carry,1,$carry
addcc $i,8,$i
bnz %icc,.Louter
nop
add $tp,8,$tp ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
ldx [$tp+%o7],%o0
add $np,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add $rp,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc $carry,0,%g4
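! %g4 = all ones keeps the unreduced tp[], %g4 = 0 keeps the difference
! already stored at rp[]; .Lcopy below merges the two per word, branch-free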
sub %g0,$num,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
ldx [$tp+%o7],%o0
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.Lzap:
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
stx %g0,[$np_h+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+$bias+$frame+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is detected at run-time now
# and this routine is not called on a CPU not capable of executing it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded into the %asi
# register, which the assembler can't recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
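# For instance, "fzeros %f2" is emitted as ".word 0x85b00c20", i.e.
# 0x81b00c20|(2<<25).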
print $code;
# flush
close STDOUT;

View File

@@ -0,0 +1,242 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Wrapper around 'rep montmul', the VIA-specific instruction accessing
# the PadLock Montgomery Multiplier. The wrapper is designed as a drop-in
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
#
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
# different software configurations on a 1.5GHz VIA Esther processor.
# Lines marked with "software integer" denote the performance of the hand-
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
# refers to the hand-coded SSE2 Montgomery multiplication procedure found
# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm routine
# from Padlock SDK 2.0.1, available for download from VIA, which naturally
# utilizes the magic 'repz montmul' instruction. And finally "hardware
# this" refers to *this* implementation, which also uses 'repz montmul'.
#
# sign verify sign/s verify/s
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
#
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
#
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
#
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
#
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
#
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
#
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
#
# To give you another reference point, here is the output for a 2.4GHz P4
# running the hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
# SSE2" in the above terms.
#
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
# key length;
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
$func="bn_mul_mont_padlock";
$pad=16*1; # amount of reserved bytes on top of every vector
# stack layout
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
$A=&DWP(4,"esp");
$B=&DWP(8,"esp");
$T=&DWP(12,"esp");
$M=&DWP(16,"esp");
$scratch=&DWP(20,"esp");
$rp=&DWP(24,"esp"); # these are mine
$sp=&DWP(28,"esp");
# &DWP(32,"esp") # 32 byte scratch area
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that the SDK suggests unconditionally allocating 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example, 1024-bit private RSA key operations suffer a >30%
# penalty. I allocate only as much as is actually required...
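# To put that in numbers: for a 1024-bit modulus num=32, so a padded
# vector takes 4*32+$pad = 144 bytes and the frame allocated below is
# 64+4*144 = 640 bytes, versus 4*2048 = 8192 bytes the SDK way.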
&function_begin($func);
&xor ("eax","eax");
&mov ("ecx",&wparam(5)); # num
# meet VIA's limitations for num [note that the specification
# expresses them in bits, while we work with the number of 32-bit words]
&test ("ecx",3);
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&cld ();
&mov ("edi",&wparam(0)); # rp
&mov ("eax",&wparam(1)); # ap
&mov ("ebx",&wparam(2)); # bp
&mov ("edx",&wparam(3)); # np
&mov ("esi",&wparam(4)); # n0
&mov ("esi",&DWP(0,"esi")); # *n0
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
&neg ("ebp");
&add ("ebp","esp");
&and ("ebp",-64); # align to cache-line
&xchg ("ebp","esp"); # alloca
&mov ($rp,"edi"); # save rp
&mov ($sp,"ebp"); # save esp
&mov ($mZeroPrime,"esi");
&lea ("esi",&DWP(64,"esp")); # tp
&mov ($T,"esi");
&lea ("edi",&DWP(32,"esp")); # scratch area
&mov ($scratch,"edi");
&mov ("esi","eax");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
&mov ("ecx","ebp");
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
&mov ("ecx","ebp");
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&mov ("ebp",$sp);
&xor ("eax","eax");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&inc ("eax"); # signal "done"
&popf ();
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,347 @@
.file "crypto/bn/asm/x86-gf2m.s"
.text
.type _mul_1x1_mmx,@function
.align 16
_mul_1x1_mmx:
subl $36,%esp
movl %eax,%ecx
leal (%eax,%eax,1),%edx
andl $1073741823,%ecx
leal (%edx,%edx,1),%ebp
movl $0,(%esp)
andl $2147483647,%edx
movd %eax,%mm2
movd %ebx,%mm3
movl %ecx,4(%esp)
xorl %edx,%ecx
pxor %mm5,%mm5
pxor %mm4,%mm4
movl %edx,8(%esp)
xorl %ebp,%edx
movl %ecx,12(%esp)
pcmpgtd %mm2,%mm5
paddd %mm2,%mm2
xorl %edx,%ecx
movl %ebp,16(%esp)
xorl %edx,%ebp
pand %mm3,%mm5
pcmpgtd %mm2,%mm4
movl %ecx,20(%esp)
xorl %ecx,%ebp
psllq $31,%mm5
pand %mm3,%mm4
movl %edx,24(%esp)
movl $7,%esi
movl %ebp,28(%esp)
movl %esi,%ebp
andl %ebx,%esi
shrl $3,%ebx
movl %ebp,%edi
psllq $30,%mm4
andl %ebx,%edi
shrl $3,%ebx
movd (%esp,%esi,4),%mm0
movl %ebp,%esi
andl %ebx,%esi
shrl $3,%ebx
movd (%esp,%edi,4),%mm2
movl %ebp,%edi
psllq $3,%mm2
andl %ebx,%edi
shrl $3,%ebx
pxor %mm2,%mm0
movd (%esp,%esi,4),%mm1
movl %ebp,%esi
psllq $6,%mm1
andl %ebx,%esi
shrl $3,%ebx
pxor %mm1,%mm0
movd (%esp,%edi,4),%mm2
movl %ebp,%edi
psllq $9,%mm2
andl %ebx,%edi
shrl $3,%ebx
pxor %mm2,%mm0
movd (%esp,%esi,4),%mm1
movl %ebp,%esi
psllq $12,%mm1
andl %ebx,%esi
shrl $3,%ebx
pxor %mm1,%mm0
movd (%esp,%edi,4),%mm2
movl %ebp,%edi
psllq $15,%mm2
andl %ebx,%edi
shrl $3,%ebx
pxor %mm2,%mm0
movd (%esp,%esi,4),%mm1
movl %ebp,%esi
psllq $18,%mm1
andl %ebx,%esi
shrl $3,%ebx
pxor %mm1,%mm0
movd (%esp,%edi,4),%mm2
movl %ebp,%edi
psllq $21,%mm2
andl %ebx,%edi
shrl $3,%ebx
pxor %mm2,%mm0
movd (%esp,%esi,4),%mm1
movl %ebp,%esi
psllq $24,%mm1
andl %ebx,%esi
shrl $3,%ebx
pxor %mm1,%mm0
movd (%esp,%edi,4),%mm2
pxor %mm4,%mm0
psllq $27,%mm2
pxor %mm2,%mm0
movd (%esp,%esi,4),%mm1
pxor %mm5,%mm0
psllq $30,%mm1
addl $36,%esp
pxor %mm1,%mm0
ret
.size _mul_1x1_mmx,.-_mul_1x1_mmx
.type _mul_1x1_ialu,@function
.align 16
_mul_1x1_ialu:
subl $36,%esp
movl %eax,%ecx
leal (%eax,%eax,1),%edx
leal (,%eax,4),%ebp
andl $1073741823,%ecx
leal (%eax,%eax,1),%edi
sarl $31,%eax
movl $0,(%esp)
andl $2147483647,%edx
movl %ecx,4(%esp)
xorl %edx,%ecx
movl %edx,8(%esp)
xorl %ebp,%edx
movl %ecx,12(%esp)
xorl %edx,%ecx
movl %ebp,16(%esp)
xorl %edx,%ebp
movl %ecx,20(%esp)
xorl %ecx,%ebp
sarl $31,%edi
andl %ebx,%eax
movl %edx,24(%esp)
andl %ebx,%edi
movl %ebp,28(%esp)
movl %eax,%edx
shll $31,%eax
movl %edi,%ecx
shrl $1,%edx
movl $7,%esi
shll $30,%edi
andl %ebx,%esi
shrl $2,%ecx
xorl %edi,%eax
shrl $3,%ebx
movl $7,%edi
andl %ebx,%edi
shrl $3,%ebx
xorl %ecx,%edx
xorl (%esp,%esi,4),%eax
movl $7,%esi
andl %ebx,%esi
shrl $3,%ebx
movl (%esp,%edi,4),%ebp
movl $7,%edi
movl %ebp,%ecx
shll $3,%ebp
andl %ebx,%edi
shrl $29,%ecx
xorl %ebp,%eax
shrl $3,%ebx
xorl %ecx,%edx
movl (%esp,%esi,4),%ecx
movl $7,%esi
movl %ecx,%ebp
shll $6,%ecx
andl %ebx,%esi
shrl $26,%ebp
xorl %ecx,%eax
shrl $3,%ebx
xorl %ebp,%edx
movl (%esp,%edi,4),%ebp
movl $7,%edi
movl %ebp,%ecx
shll $9,%ebp
andl %ebx,%edi
shrl $23,%ecx
xorl %ebp,%eax
shrl $3,%ebx
xorl %ecx,%edx
movl (%esp,%esi,4),%ecx
movl $7,%esi
movl %ecx,%ebp
shll $12,%ecx
andl %ebx,%esi
shrl $20,%ebp
xorl %ecx,%eax
shrl $3,%ebx
xorl %ebp,%edx
movl (%esp,%edi,4),%ebp
movl $7,%edi
movl %ebp,%ecx
shll $15,%ebp
andl %ebx,%edi
shrl $17,%ecx
xorl %ebp,%eax
shrl $3,%ebx
xorl %ecx,%edx
movl (%esp,%esi,4),%ecx
movl $7,%esi
movl %ecx,%ebp
shll $18,%ecx
andl %ebx,%esi
shrl $14,%ebp
xorl %ecx,%eax
shrl $3,%ebx
xorl %ebp,%edx
movl (%esp,%edi,4),%ebp
movl $7,%edi
movl %ebp,%ecx
shll $21,%ebp
andl %ebx,%edi
shrl $11,%ecx
xorl %ebp,%eax
shrl $3,%ebx
xorl %ecx,%edx
movl (%esp,%esi,4),%ecx
movl $7,%esi
movl %ecx,%ebp
shll $24,%ecx
andl %ebx,%esi
shrl $8,%ebp
xorl %ecx,%eax
shrl $3,%ebx
xorl %ebp,%edx
movl (%esp,%edi,4),%ebp
movl %ebp,%ecx
shll $27,%ebp
movl (%esp,%esi,4),%edi
shrl $5,%ecx
movl %edi,%esi
xorl %ebp,%eax
shll $30,%edi
xorl %ecx,%edx
shrl $2,%esi
xorl %edi,%eax
xorl %esi,%edx
addl $36,%esp
ret
.size _mul_1x1_ialu,.-_mul_1x1_ialu
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,@function
.align 16
bn_GF2m_mul_2x2:
.L_bn_GF2m_mul_2x2_begin:
call .L000PIC_me_up
.L000PIC_me_up:
popl %edx
leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%edx),%edx
movl OPENSSL_ia32cap_P@GOT(%edx),%edx
movl (%edx),%eax
movl 4(%edx),%edx
testl $8388608,%eax
jz .L001ialu
testl $16777216,%eax
jz .L002mmx
testl $2,%edx
jz .L002mmx
movups 8(%esp),%xmm0
shufps $177,%xmm0,%xmm0
.byte 102,15,58,68,192,1
movl 4(%esp),%eax
movups %xmm0,(%eax)
ret
.align 16
.L002mmx:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 24(%esp),%eax
movl 32(%esp),%ebx
call _mul_1x1_mmx
movq %mm0,%mm7
movl 28(%esp),%eax
movl 36(%esp),%ebx
call _mul_1x1_mmx
movq %mm0,%mm6
movl 24(%esp),%eax
movl 32(%esp),%ebx
xorl 28(%esp),%eax
xorl 36(%esp),%ebx
call _mul_1x1_mmx
pxor %mm7,%mm0
movl 20(%esp),%eax
pxor %mm6,%mm0
movq %mm0,%mm2
psllq $32,%mm0
popl %edi
psrlq $32,%mm2
popl %esi
pxor %mm6,%mm0
popl %ebx
pxor %mm7,%mm2
movq %mm0,(%eax)
popl %ebp
movq %mm2,8(%eax)
emms
ret
.align 16
.L001ialu:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
subl $20,%esp
movl 44(%esp),%eax
movl 52(%esp),%ebx
call _mul_1x1_ialu
movl %eax,8(%esp)
movl %edx,12(%esp)
movl 48(%esp),%eax
movl 56(%esp),%ebx
call _mul_1x1_ialu
movl %eax,(%esp)
movl %edx,4(%esp)
movl 44(%esp),%eax
movl 52(%esp),%ebx
xorl 48(%esp),%eax
xorl 56(%esp),%ebx
call _mul_1x1_ialu
movl 40(%esp),%ebp
movl (%esp),%ebx
movl 4(%esp),%ecx
movl 8(%esp),%edi
movl 12(%esp),%esi
xorl %edx,%eax
xorl %ecx,%edx
xorl %ebx,%eax
movl %ebx,(%ebp)
xorl %edi,%edx
movl %esi,12(%ebp)
xorl %esi,%eax
addl $20,%esp
xorl %esi,%edx
popl %edi
xorl %edx,%eax
popl %esi
movl %edx,8(%ebp)
popl %ebx
movl %eax,4(%ebp)
popl %ebp
ret
.size bn_GF2m_mul_2x2,.-.L_bn_GF2m_mul_2x2_begin
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105
.byte 99,97,116,105,111,110,32,102,111,114,32,120,56,54,44,32
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.comm OPENSSL_ia32cap_P,8,4

View File

@@ -0,0 +1,313 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that the above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example, the 120% ECDH improvement is the
# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
# benchmark is more and more dominated by other subroutines, most notably
# by BN_GF2m_mod[_mul]_arr...
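#
# For reference, here is what the three paths below compute, modelled in
# plain Perl (assumes a 64-bit perl so 63-bit intermediates survive;
# illustration only, not used by this script). The 1x1 step is a
# carry-less, i.e. GF(2)[x], product of two 32-bit words, and the 2x2
# entry point combines three such products Karatsuba-style, exactly as
# the a1·b1, a0·b0 and (a0+a1)·(b0+b1) calls further down:
sub clmul32_ref {			# GF(2)[x] product, up to 63 bits
	my ($a,$b) = @_;
	my $r = 0;
	for my $i (0 .. 31) {
		$r ^= $a << $i if (($b >> $i) & 1);
	}
	return $r;
}
sub gf2m_mul_2x2_ref {		# r[0..3] = (a1*x^32+a0)*(b1*x^32+b0)
	my ($a1,$a0,$b1,$b0) = @_;
	my $hi  = clmul32_ref($a1,$b1);
	my $lo  = clmul32_ref($a0,$b0);
	my $mid = clmul32_ref($a1^$a0,$b1^$b0) ^ $hi ^ $lo; # middle term
	return ( $lo & 0xffffffff,
		(($lo >> 32) ^ $mid) & 0xffffffff,
		(($mid >> 32) ^ $hi) & 0xffffffff,
		 ($hi >> 32) & 0xffffffff );
}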
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&sar (@i[1],31); # broardcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,460 @@
.file "crypto/bn/asm/x86-mont.s"
.text
.globl bn_mul_mont
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
.L_bn_mul_mont_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
xorl %eax,%eax
movl 40(%esp),%edi
cmpl $4,%edi
jl .L000just_leave
leal 20(%esp),%esi
leal 24(%esp),%edx
movl %esp,%ebp
addl $2,%edi
negl %edi
leal -32(%esp,%edi,4),%esp
negl %edi
movl %esp,%eax
subl %edx,%eax
andl $2047,%eax
subl %eax,%esp
xorl %esp,%edx
andl $2048,%edx
xorl $2048,%edx
subl %edx,%esp
andl $-64,%esp
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edx
movl 16(%esi),%esi
movl (%esi),%esi
movl %eax,4(%esp)
movl %ebx,8(%esp)
movl %ecx,12(%esp)
movl %edx,16(%esp)
movl %esi,20(%esp)
leal -3(%edi),%ebx
movl %ebp,24(%esp)
call .L001PIC_me_up
.L001PIC_me_up:
popl %eax
leal _GLOBAL_OFFSET_TABLE_+[.-.L001PIC_me_up](%eax),%eax
movl OPENSSL_ia32cap_P@GOT(%eax),%eax
btl $26,(%eax)
jnc .L002non_sse2
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
movl 12(%esp),%edi
movl 16(%esp),%ebp
xorl %edx,%edx
xorl %ecx,%ecx
movd (%edi),%mm4
movd (%esi),%mm5
movd (%ebp),%mm3
pmuludq %mm4,%mm5
movq %mm5,%mm2
movq %mm5,%mm0
pand %mm7,%mm0
pmuludq 20(%esp),%mm5
pmuludq %mm5,%mm3
paddq %mm0,%mm3
movd 4(%ebp),%mm1
movd 4(%esi),%mm0
psrlq $32,%mm2
psrlq $32,%mm3
incl %ecx
.align 16
.L0031st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
movd 4(%ebp,%ecx,4),%mm1
paddq %mm0,%mm3
movd 4(%esi,%ecx,4),%mm0
psrlq $32,%mm2
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
jl .L0031st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
paddq %mm0,%mm3
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm2
psrlq $32,%mm3
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
.L004outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
movd 32(%esp),%mm6
movd (%ebp),%mm3
pmuludq %mm4,%mm5
paddq %mm6,%mm5
movq %mm5,%mm0
movq %mm5,%mm2
pand %mm7,%mm0
pmuludq 20(%esp),%mm5
pmuludq %mm5,%mm3
paddq %mm0,%mm3
movd 36(%esp),%mm6
movd 4(%ebp),%mm1
movd 4(%esi),%mm0
psrlq $32,%mm2
psrlq $32,%mm3
paddq %mm6,%mm2
incl %ecx
decl %ebx
.L005inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
movd 36(%esp,%ecx,4),%mm6
pand %mm7,%mm0
movd 4(%ebp,%ecx,4),%mm1
paddq %mm0,%mm3
movd 4(%esi,%ecx,4),%mm0
psrlq $32,%mm2
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm3
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
jnz .L005inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
paddq %mm1,%mm3
movq %mm2,%mm0
pand %mm7,%mm0
paddq %mm0,%mm3
movd %mm3,28(%esp,%ecx,4)
psrlq $32,%mm2
psrlq $32,%mm3
movd 36(%esp,%ebx,4),%mm6
paddq %mm2,%mm3
paddq %mm6,%mm3
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
jle .L004outer
emms
jmp .L006common_tail
.align 16
.L002non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
xorl %ecx,%ecx
movl %esi,%edx
andl $1,%ebp
subl %edi,%edx
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
jz .L007bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 16
.L008mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
jl .L008mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
addl %ebp,%eax
movl 16(%esp),%esi
adcl $0,%edx
imull 32(%esp),%edi
movl %eax,32(%esp,%ebx,4)
xorl %ecx,%ecx
movl %edx,36(%esp,%ebx,4)
movl %ecx,40(%esp,%ebx,4)
movl (%esi),%eax
mull %edi
addl 32(%esp),%eax
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
jmp .L0092ndmadd
.align 16
.L0101stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
jl .L0101stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
movl 20(%esp),%edi
adcl $0,%edx
movl 16(%esp),%esi
addl %eax,%ebp
adcl $0,%edx
imull 32(%esp),%edi
xorl %ecx,%ecx
addl 36(%esp,%ebx,4),%edx
movl %ebp,32(%esp,%ebx,4)
adcl $0,%ecx
movl (%esi),%eax
movl %edx,36(%esp,%ebx,4)
movl %ecx,40(%esp,%ebx,4)
mull %edi
addl 32(%esp),%eax
movl 4(%esi),%eax
adcl $0,%edx
movl $1,%ecx
.align 16
.L0092ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
jl .L0092ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
adcl $0,%edx
movl %ebp,28(%esp,%ebx,4)
xorl %eax,%eax
movl 12(%esp),%ecx
addl 36(%esp,%ebx,4),%edx
adcl 40(%esp,%ebx,4),%eax
leal 4(%ecx),%ecx
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
je .L006common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
jmp .L0101stmadd
.align 16
.L007bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
mull %edi
movl %eax,32(%esp)
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
incl %ecx
.align 16
.L011sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
leal 1(%ecx),%ecx
adcl $0,%edx
leal (%ebx,%eax,2),%ebp
shrl $31,%eax
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
jl .L011sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
movl 20(%esp),%edi
adcl $0,%edx
movl 16(%esp),%esi
leal (%ebx,%eax,2),%ebp
imull 32(%esp),%edi
shrl $31,%eax
movl %ebp,32(%esp,%ecx,4)
leal (%eax,%edx,2),%ebp
movl (%esi),%eax
shrl $31,%edx
movl %ebp,36(%esp,%ecx,4)
movl %edx,40(%esp,%ecx,4)
mull %edi
addl 32(%esp),%eax
movl %ecx,%ebx
adcl $0,%edx
movl 4(%esi),%eax
movl $1,%ecx
.align 16
.L0123rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
movl 4(%esi,%ecx,4),%eax
adcl $0,%edx
movl %ebp,28(%esp,%ecx,4)
movl %edx,%ebp
mull %edi
addl 36(%esp,%ecx,4),%ebp
leal 2(%ecx),%ecx
adcl $0,%edx
addl %eax,%ebp
movl (%esi,%ecx,4),%eax
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
jl .L0123rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
adcl $0,%edx
addl %eax,%ebp
adcl $0,%edx
movl %ebp,28(%esp,%ebx,4)
movl 12(%esp),%ecx
xorl %eax,%eax
movl 8(%esp),%esi
addl 36(%esp,%ebx,4),%edx
adcl 40(%esp,%ebx,4),%eax
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
je .L006common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
movl %ecx,12(%esp)
mull %edi
addl 32(%esp,%ecx,4),%eax
adcl $0,%edx
movl %eax,32(%esp,%ecx,4)
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
je .L013sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
.L014sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
addl %ebp,%eax
leal (%eax,%eax,1),%ebp
adcl $0,%edx
shrl $31,%eax
addl 32(%esp,%ecx,4),%ebp
leal 1(%ecx),%ecx
adcl $0,%eax
addl %ebx,%ebp
adcl $0,%eax
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
jle .L014sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
.L013sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
addl 32(%esp,%ecx,4),%edx
movl (%esi),%eax
adcl $0,%ebp
movl %edx,32(%esp,%ecx,4)
movl %ebp,36(%esp,%ecx,4)
mull %edi
addl 32(%esp),%eax
leal -1(%ecx),%ebx
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
jmp .L0123rdmadd
.align 16
.L006common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
movl (%esi),%eax
movl %ebx,%ecx
xorl %edx,%edx
.align 16
.L015sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
jge .L015sub
sbbl $0,%eax
andl %eax,%esi
notl %eax
movl %edi,%ebp
andl %eax,%ebp
orl %ebp,%esi
.align 16
.L016copy:
movl (%esi,%ebx,4),%eax
movl %eax,(%edi,%ebx,4)
movl %ecx,32(%esp,%ebx,4)
decl %ebx
jge .L016copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size bn_mul_mont,.-.L_bn_mul_mont_begin
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
.byte 111,114,103,62,0
.comm OPENSSL_ia32cap_P,8,4

View File

@@ -0,0 +1,593 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is "teaser" code, as it can be improved in several ways...
# First of all, a non-SSE2 path should be implemented (for now the
# module performs Montgomery multiplication/convolution only on
# SSE2-capable CPUs such as P4; others fall back to the original
# code). Then the inner loop can be unrolled and modulo-scheduled to
# improve ILP and possibly moved to the 128-bit XMM register bank
# (though that would require input rearrangement and/or increase bus
# bandwidth utilization). A dedicated squaring procedure should give
# a further performance improvement...
# Yet, even as a draft, the code improves the rsa512 *sign* benchmark
# by 110%(!), rsa1024 by 70% and rsa4096 by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
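For orientation, below is a minimal C sketch of the word-serial Montgomery multiplication rp = ap*bp*R^-1 mod np (with R = 2^(32*num)) that both the SSE2 and integer paths generated by this module compute, including the final conditional subtraction done in common_tail. It is illustration only: mont_mul_ref, the uint32_t BN_ULONG and the caller-supplied tp[] scratch area are assumptions, not this file's interface.

#include <stdint.h>

typedef uint32_t BN_ULONG;

/* rp = ap * bp * R^-1 mod np, R = 2^(32*num), n0 = -np[0]^-1 mod 2^32.
 * tp[] is scratch for num+2 words; num >= 1; ap, bp < np < R.        */
static void mont_mul_ref(BN_ULONG *rp, const BN_ULONG *ap,
                         const BN_ULONG *bp, const BN_ULONG *np,
                         BN_ULONG n0, int num, BN_ULONG *tp)
{
    int i, j;
    for (j = 0; j < num + 2; j++) tp[j] = 0;

    for (i = 0; i < num; i++) {
        /* tp += ap * bp[i] */
        uint64_t c = 0;
        for (j = 0; j < num; j++) {
            c += (uint64_t)ap[j] * bp[i] + tp[j];
            tp[j] = (BN_ULONG)c; c >>= 32;
        }
        c += tp[num];
        tp[num] = (BN_ULONG)c;
        tp[num + 1] = (BN_ULONG)(c >> 32);

        /* adding np * m1 zeroes tp[0]; then drop one word of tp */
        BN_ULONG m1 = (BN_ULONG)(tp[0] * n0);
        c = ((uint64_t)np[0] * m1 + tp[0]) >> 32;
        for (j = 1; j < num; j++) {
            c += (uint64_t)np[j] * m1 + tp[j];
            tp[j - 1] = (BN_ULONG)c; c >>= 32;
        }
        c += tp[num];
        tp[num - 1] = (BN_ULONG)c;
        tp[num] = tp[num + 1] + (BN_ULONG)(c >> 32);
        tp[num + 1] = 0;
    }

    /* conditional final subtraction: rp = tp >= np ? tp - np : tp */
    uint64_t b = 0;
    for (j = 0; j < num; j++) {
        b = (uint64_t)tp[j] - np[j] - b;
        rp[j] = (BN_ULONG)b;
        b = (b >> 32) & 1;
    }
    if (b > tp[num])        /* borrow not covered by the carry word */
        for (j = 0; j < num; j++) rp[j] = tp[j];
}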
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging a 2K window between the
# stack pointer and the ap argument [np is also a position-sensitive
# vector, but it's assumed to be near ap, as it's allocated at ~same
# time].
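# To spell the trick out: the first sub below makes esp congruent
# to the argument address modulo 2048; the xor/and/xor sequence
# then yields 2048 exactly when the two addresses still agree in
# bit 11, so the final sub leaves them 2K apart modulo 4096 and
# they never collide in a 4K-aliased cache set.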
&mov ("eax","esp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("esp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","esp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("esp","edx"); # this splits them apart modulo 4096
&and ("esp",-64); # align to cache line
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"edx");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"ebp"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/perl
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
require("x86/mul_add.pl");
require("x86/mul.pl");
require("x86/sqr.pl");
require("x86/div.pl");
require("x86/add.pl");
require("x86/sub.pl");
require("x86/comba.pl");
&asm_init($ARGV[0],$0);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,277 @@
#!/usr/local/bin/perl
# x86 assembler
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# if pos > 1, this is the last iteration
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, this is the last iteration
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
1;
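Both generators above emit fully unrolled "comba" code: instead of row-by-row multiply-add passes, the product is accumulated one result column at a time in a three-word carry chain, which is what mul_add_c/sqr_add_c implement with the rotating (c0,c1,c2) registers. A loop-based C sketch of the same column order for the 4x4 case follows (mul_comba4_ref and the uint32_t BN_ULONG are illustrative assumptions, not part of the file):

#include <stdint.h>

typedef uint32_t BN_ULONG;

/* Column-wise (comba) 4x4 multiply: r[0..7] = a[0..3] * b[0..3].
 * Column k collects every a[i]*b[k-i] into the three-word
 * accumulator (c0,c1,c2) before a single store of r[k].          */
static void mul_comba4_ref(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b)
{
    BN_ULONG c0 = 0, c1 = 0, c2 = 0;
    int i, k;
    for (k = 0; k < 7; k++) {
        int lo = k < 4 ? 0 : k - 3;     /* clip i so 0 <= k-i <= 3 */
        int hi = k < 4 ? k : 3;
        for (i = lo; i <= hi; i++) {
            uint64_t t = (uint64_t)a[i] * b[k - i];
            uint64_t s = (uint64_t)c0 + (BN_ULONG)t;
            c0 = (BN_ULONG)s;
            s = (uint64_t)c1 + (t >> 32) + (s >> 32);
            c1 = (BN_ULONG)s;
            c2 += (BN_ULONG)(s >> 32);
        }
        r[k] = c0;                      /* column done: emit one word */
        c0 = c1; c1 = c2; c2 = 0;       /* the perlasm rotates instead */
    }
    r[7] = c0;
}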

View File

@@ -0,0 +1,15 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_div_words
{
local($name)=@_;
&function_begin($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ebx",&wparam(2)); #
&div("ebx");
&function_end($name);
}
1;
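The generated routine is a thin wrapper around the x86 div instruction, which divides the 64-bit value in edx:eax by a 32-bit divisor and leaves the quotient in eax. In portable C it amounts to this sketch (bn_div_words_ref and the uint32_t BN_ULONG are illustrative; as with the hardware instruction, the caller must keep h < d or the quotient overflows):

#include <stdint.h>

typedef uint32_t BN_ULONG;

/* Quotient of the double word (h:l) by d, i.e. what "div ebx"
 * computes from edx:eax. Requires h < d; hardware faults otherwise. */
static BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return (BN_ULONG)((((uint64_t)h << 32) | l) / d);
}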

View File

@@ -0,0 +1,3 @@
#!/usr/local/bin/perl
# x86 assembler

View File

@@ -0,0 +1,77 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
1;

View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",0);
&mov(&swtmp(0),"ecx"); #
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= *r
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&mov("ecx",&swtmp(0)); #
&add($a,32);
&add($r,32);
&sub("ecx",8);
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c);
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
1;

View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sqr_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
1;

View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *a
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

View File

@@ -0,0 +1,606 @@
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
*
* Q. Version 0.1? It doesn't sound like Andy; he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see, it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features its own ABI, which I'm not familiar with. That is
* why I decided to let the compiler take care of the subroutine
* prologue/epilogue as well as register allocation. For reference,
* Win64 implements a different ABI for AMD64 than Linux does.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For reference: the IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
#ifdef _WIN64
#define BN_ULONG unsigned long long
#else
#define BN_ULONG unsigned long
#endif
#undef mul
#undef mul_add
#undef sqr
/*
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
#define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
#define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
#define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
: "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
if (num <= 0) return(c1);
while (num&~3)
{
mul_add(rp[0],ap[0],w,c1);
mul_add(rp[1],ap[1],w,c1);
mul_add(rp[2],ap[2],w,c1);
mul_add(rp[3],ap[3],w,c1);
ap+=4; rp+=4; num-=4;
}
if (num)
{
mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
mul_add(rp[2],ap[2],w,c1); return c1;
}
return(c1);
}
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
if (num <= 0) return(c1);
while (num&~3)
{
mul(rp[0],ap[0],w,c1);
mul(rp[1],ap[1],w,c1);
mul(rp[2],ap[2],w,c1);
mul(rp[3],ap[3],w,c1);
ap+=4; rp+=4; num-=4;
}
if (num)
{
mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
mul(rp[2],ap[2],w,c1);
}
return(c1);
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0) return;
while (n&~3)
{
sqr(r[0],r[1],a[0]);
sqr(r[2],r[3],a[1]);
sqr(r[4],r[5],a[2]);
sqr(r[6],r[7],a[3]);
a+=4; r+=8; n-=4;
}
if (n)
{
sqr(r[0],r[1],a[0]); if (--n == 0) return;
sqr(r[2],r[3],a[1]); if (--n == 0) return;
sqr(r[4],r[5],a[2]);
}
}
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{ BN_ULONG ret,waste;
asm ("divq %4"
: "=a"(ret),"=d"(waste)
: "a"(l),"d"(h),"g"(d)
: "cc");
return ret;
}
BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
: "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
return ret&1;
}
#ifndef SIMICS
BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
asm (
" subq %2,%2 \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" leaq 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n"
: "=&a"(ret),"+c"(n),"=&r"(i)
: "r"(rp),"r"(ap),"r"(bp)
: "cc"
);
return ret&1;
}
#else
/* Simics 1.4<7 has buggy sbbq:-( */
#define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1,t2;
int c=0;
if (n <= 0) return((BN_ULONG)0);
for (;;)
{
t1=a[0]; t2=b[0];
r[0]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[1]; t2=b[1];
r[1]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[2]; t2=b[2];
r[2]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
t1=a[3]; t2=b[3];
r[3]=(t1-t2-c)&BN_MASK2;
if (t1 != t2) c=(t1 < t2);
if (--n <= 0) break;
a+=4;
b+=4;
r+=4;
}
return(c);
}
#endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b); \
t1 = ta * tb; \
t2 = BN_UMULT_HIGH(ta,tb); \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#define mul_add_c2(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b),t0; \
t1 = BN_UMULT_HIGH(ta,tb); \
t0 = ta * tb; \
t2 = t1+t1; c2 += (t2<t1)?1:0; \
t1 = t0+t0; t2 += (t1<t0)?1:0; \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#else
#define mul_add_c(a,b,c0,c1,c2) do { \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
#define sqr_add_c(a,i,c0,c1,c2) do { \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
#define mul_add_c2(a,b,c0,c1,c2) do { \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "+d"(t2),"+r"(c2) \
: "g"(0) \
: "cc"); \
asm ("addq %0,%0; adcq %2,%1" \
: "+a"(t1),"+d"(t2) \
: "g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c0),"+d"(t2) \
: "a"(t1),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(c1),"+r"(c2) \
: "d"(t2),"g"(0) \
: "cc"); \
} while (0)
#endif
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
mul_add_c(a[0],b[0],c1,c2,c3);
r[0]=c1;
c1=0;
mul_add_c(a[0],b[1],c2,c3,c1);
mul_add_c(a[1],b[0],c2,c3,c1);
r[1]=c2;
c2=0;
mul_add_c(a[2],b[0],c3,c1,c2);
mul_add_c(a[1],b[1],c3,c1,c2);
mul_add_c(a[0],b[2],c3,c1,c2);
r[2]=c3;
c3=0;
mul_add_c(a[0],b[3],c1,c2,c3);
mul_add_c(a[1],b[2],c1,c2,c3);
mul_add_c(a[2],b[1],c1,c2,c3);
mul_add_c(a[3],b[0],c1,c2,c3);
r[3]=c1;
c1=0;
mul_add_c(a[4],b[0],c2,c3,c1);
mul_add_c(a[3],b[1],c2,c3,c1);
mul_add_c(a[2],b[2],c2,c3,c1);
mul_add_c(a[1],b[3],c2,c3,c1);
mul_add_c(a[0],b[4],c2,c3,c1);
r[4]=c2;
c2=0;
mul_add_c(a[0],b[5],c3,c1,c2);
mul_add_c(a[1],b[4],c3,c1,c2);
mul_add_c(a[2],b[3],c3,c1,c2);
mul_add_c(a[3],b[2],c3,c1,c2);
mul_add_c(a[4],b[1],c3,c1,c2);
mul_add_c(a[5],b[0],c3,c1,c2);
r[5]=c3;
c3=0;
mul_add_c(a[6],b[0],c1,c2,c3);
mul_add_c(a[5],b[1],c1,c2,c3);
mul_add_c(a[4],b[2],c1,c2,c3);
mul_add_c(a[3],b[3],c1,c2,c3);
mul_add_c(a[2],b[4],c1,c2,c3);
mul_add_c(a[1],b[5],c1,c2,c3);
mul_add_c(a[0],b[6],c1,c2,c3);
r[6]=c1;
c1=0;
mul_add_c(a[0],b[7],c2,c3,c1);
mul_add_c(a[1],b[6],c2,c3,c1);
mul_add_c(a[2],b[5],c2,c3,c1);
mul_add_c(a[3],b[4],c2,c3,c1);
mul_add_c(a[4],b[3],c2,c3,c1);
mul_add_c(a[5],b[2],c2,c3,c1);
mul_add_c(a[6],b[1],c2,c3,c1);
mul_add_c(a[7],b[0],c2,c3,c1);
r[7]=c2;
c2=0;
mul_add_c(a[7],b[1],c3,c1,c2);
mul_add_c(a[6],b[2],c3,c1,c2);
mul_add_c(a[5],b[3],c3,c1,c2);
mul_add_c(a[4],b[4],c3,c1,c2);
mul_add_c(a[3],b[5],c3,c1,c2);
mul_add_c(a[2],b[6],c3,c1,c2);
mul_add_c(a[1],b[7],c3,c1,c2);
r[8]=c3;
c3=0;
mul_add_c(a[2],b[7],c1,c2,c3);
mul_add_c(a[3],b[6],c1,c2,c3);
mul_add_c(a[4],b[5],c1,c2,c3);
mul_add_c(a[5],b[4],c1,c2,c3);
mul_add_c(a[6],b[3],c1,c2,c3);
mul_add_c(a[7],b[2],c1,c2,c3);
r[9]=c1;
c1=0;
mul_add_c(a[7],b[3],c2,c3,c1);
mul_add_c(a[6],b[4],c2,c3,c1);
mul_add_c(a[5],b[5],c2,c3,c1);
mul_add_c(a[4],b[6],c2,c3,c1);
mul_add_c(a[3],b[7],c2,c3,c1);
r[10]=c2;
c2=0;
mul_add_c(a[4],b[7],c3,c1,c2);
mul_add_c(a[5],b[6],c3,c1,c2);
mul_add_c(a[6],b[5],c3,c1,c2);
mul_add_c(a[7],b[4],c3,c1,c2);
r[11]=c3;
c3=0;
mul_add_c(a[7],b[5],c1,c2,c3);
mul_add_c(a[6],b[6],c1,c2,c3);
mul_add_c(a[5],b[7],c1,c2,c3);
r[12]=c1;
c1=0;
mul_add_c(a[6],b[7],c2,c3,c1);
mul_add_c(a[7],b[6],c2,c3,c1);
r[13]=c2;
c2=0;
mul_add_c(a[7],b[7],c3,c1,c2);
r[14]=c3;
r[15]=c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
mul_add_c(a[0],b[0],c1,c2,c3);
r[0]=c1;
c1=0;
mul_add_c(a[0],b[1],c2,c3,c1);
mul_add_c(a[1],b[0],c2,c3,c1);
r[1]=c2;
c2=0;
mul_add_c(a[2],b[0],c3,c1,c2);
mul_add_c(a[1],b[1],c3,c1,c2);
mul_add_c(a[0],b[2],c3,c1,c2);
r[2]=c3;
c3=0;
mul_add_c(a[0],b[3],c1,c2,c3);
mul_add_c(a[1],b[2],c1,c2,c3);
mul_add_c(a[2],b[1],c1,c2,c3);
mul_add_c(a[3],b[0],c1,c2,c3);
r[3]=c1;
c1=0;
mul_add_c(a[3],b[1],c2,c3,c1);
mul_add_c(a[2],b[2],c2,c3,c1);
mul_add_c(a[1],b[3],c2,c3,c1);
r[4]=c2;
c2=0;
mul_add_c(a[2],b[3],c3,c1,c2);
mul_add_c(a[3],b[2],c3,c1,c2);
r[5]=c3;
c3=0;
mul_add_c(a[3],b[3],c1,c2,c3);
r[6]=c1;
r[7]=c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
sqr_add_c(a,0,c1,c2,c3);
r[0]=c1;
c1=0;
sqr_add_c2(a,1,0,c2,c3,c1);
r[1]=c2;
c2=0;
sqr_add_c(a,1,c3,c1,c2);
sqr_add_c2(a,2,0,c3,c1,c2);
r[2]=c3;
c3=0;
sqr_add_c2(a,3,0,c1,c2,c3);
sqr_add_c2(a,2,1,c1,c2,c3);
r[3]=c1;
c1=0;
sqr_add_c(a,2,c2,c3,c1);
sqr_add_c2(a,3,1,c2,c3,c1);
sqr_add_c2(a,4,0,c2,c3,c1);
r[4]=c2;
c2=0;
sqr_add_c2(a,5,0,c3,c1,c2);
sqr_add_c2(a,4,1,c3,c1,c2);
sqr_add_c2(a,3,2,c3,c1,c2);
r[5]=c3;
c3=0;
sqr_add_c(a,3,c1,c2,c3);
sqr_add_c2(a,4,2,c1,c2,c3);
sqr_add_c2(a,5,1,c1,c2,c3);
sqr_add_c2(a,6,0,c1,c2,c3);
r[6]=c1;
c1=0;
sqr_add_c2(a,7,0,c2,c3,c1);
sqr_add_c2(a,6,1,c2,c3,c1);
sqr_add_c2(a,5,2,c2,c3,c1);
sqr_add_c2(a,4,3,c2,c3,c1);
r[7]=c2;
c2=0;
sqr_add_c(a,4,c3,c1,c2);
sqr_add_c2(a,5,3,c3,c1,c2);
sqr_add_c2(a,6,2,c3,c1,c2);
sqr_add_c2(a,7,1,c3,c1,c2);
r[8]=c3;
c3=0;
sqr_add_c2(a,7,2,c1,c2,c3);
sqr_add_c2(a,6,3,c1,c2,c3);
sqr_add_c2(a,5,4,c1,c2,c3);
r[9]=c1;
c1=0;
sqr_add_c(a,5,c2,c3,c1);
sqr_add_c2(a,6,4,c2,c3,c1);
sqr_add_c2(a,7,3,c2,c3,c1);
r[10]=c2;
c2=0;
sqr_add_c2(a,7,4,c3,c1,c2);
sqr_add_c2(a,6,5,c3,c1,c2);
r[11]=c3;
c3=0;
sqr_add_c(a,6,c1,c2,c3);
sqr_add_c2(a,7,5,c1,c2,c3);
r[12]=c1;
c1=0;
sqr_add_c2(a,7,6,c2,c3,c1);
r[13]=c2;
c2=0;
sqr_add_c(a,7,c3,c1,c2);
r[14]=c3;
r[15]=c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
c1=0;
c2=0;
c3=0;
sqr_add_c(a,0,c1,c2,c3);
r[0]=c1;
c1=0;
sqr_add_c2(a,1,0,c2,c3,c1);
r[1]=c2;
c2=0;
sqr_add_c(a,1,c3,c1,c2);
sqr_add_c2(a,2,0,c3,c1,c2);
r[2]=c3;
c3=0;
sqr_add_c2(a,3,0,c1,c2,c3);
sqr_add_c2(a,2,1,c1,c2,c3);
r[3]=c1;
c1=0;
sqr_add_c(a,2,c2,c3,c1);
sqr_add_c2(a,3,1,c2,c3,c1);
r[4]=c2;
c2=0;
sqr_add_c2(a,3,2,c3,c1,c2);
r[5]=c3;
c3=0;
sqr_add_c(a,3,c1,c2,c3);
r[6]=c1;
r[7]=c2;
}
#endif

View File

@@ -0,0 +1,291 @@
.text
.type _mul_1x1,@function
.align 16
_mul_1x1:
subq $128+8,%rsp
movq $-1,%r9
leaq (%rax,%rax,1),%rsi
shrq $3,%r9
leaq (,%rax,4),%rdi
andq %rax,%r9
leaq (,%rax,8),%r12
sarq $63,%rax
leaq (%r9,%r9,1),%r10
sarq $63,%rsi
leaq (,%r9,4),%r11
andq %rbp,%rax
sarq $63,%rdi
movq %rax,%rdx
shlq $63,%rax
andq %rbp,%rsi
shrq $1,%rdx
movq %rsi,%rcx
shlq $62,%rsi
andq %rbp,%rdi
shrq $2,%rcx
xorq %rsi,%rax
movq %rdi,%rbx
shlq $61,%rdi
xorq %rcx,%rdx
shrq $3,%rbx
xorq %rdi,%rax
xorq %rbx,%rdx
movq %r9,%r13
movq $0,0(%rsp)
xorq %r10,%r13
movq %r9,8(%rsp)
movq %r11,%r14
movq %r10,16(%rsp)
xorq %r12,%r14
movq %r13,24(%rsp)
xorq %r11,%r9
movq %r11,32(%rsp)
xorq %r11,%r10
movq %r9,40(%rsp)
xorq %r11,%r13
movq %r10,48(%rsp)
xorq %r14,%r9
movq %r13,56(%rsp)
xorq %r14,%r10
movq %r12,64(%rsp)
xorq %r14,%r13
movq %r9,72(%rsp)
xorq %r11,%r9
movq %r10,80(%rsp)
xorq %r11,%r10
movq %r13,88(%rsp)
xorq %r11,%r13
movq %r14,96(%rsp)
movq %r8,%rsi
movq %r9,104(%rsp)
andq %rbp,%rsi
movq %r10,112(%rsp)
shrq $4,%rbp
movq %r13,120(%rsp)
movq %r8,%rdi
andq %rbp,%rdi
shrq $4,%rbp
movq (%rsp,%rsi,8),%xmm0
movq %r8,%rsi
andq %rbp,%rsi
shrq $4,%rbp
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $4,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $60,%rbx
xorq %rcx,%rax
pslldq $1,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $12,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $52,%rbx
xorq %rcx,%rax
pslldq $2,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $20,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $44,%rbx
xorq %rcx,%rax
pslldq $3,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $28,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $36,%rbx
xorq %rcx,%rax
pslldq $4,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $36,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $28,%rbx
xorq %rcx,%rax
pslldq $5,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $44,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $20,%rbx
xorq %rcx,%rax
pslldq $6,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $52,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $12,%rbx
xorq %rcx,%rax
pslldq $7,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %rcx,%rbx
shlq $60,%rcx
.byte 102,72,15,126,198
shrq $4,%rbx
xorq %rcx,%rax
psrldq $8,%xmm0
xorq %rbx,%rdx
.byte 102,72,15,126,199
xorq %rsi,%rax
xorq %rdi,%rdx
addq $128+8,%rsp
.byte 0xf3,0xc3
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,@function
.align 16
bn_GF2m_mul_2x2:
movq OPENSSL_ia32cap_P(%rip),%rax
btq $33,%rax
jnc .Lvanilla_mul_2x2
.byte 102,72,15,110,198
.byte 102,72,15,110,201
.byte 102,72,15,110,210
.byte 102,73,15,110,216
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
.byte 102,15,58,68,193,0
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
.byte 102,15,58,68,211,0
.byte 102,15,58,68,229,0
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4
movdqa %xmm4,%xmm5
pslldq $8,%xmm4
psrldq $8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0(%rdi)
movdqu %xmm0,16(%rdi)
.byte 0xf3,0xc3
.align 16
.Lvanilla_mul_2x2:
leaq -136(%rsp),%rsp
movq %r14,80(%rsp)
movq %r13,88(%rsp)
movq %r12,96(%rsp)
movq %rbp,104(%rsp)
movq %rbx,112(%rsp)
.Lbody_mul_2x2:
movq %rdi,32(%rsp)
movq %rsi,40(%rsp)
movq %rdx,48(%rsp)
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
movq $15,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
movq %rax,16(%rsp)
movq %rdx,24(%rsp)
movq 48(%rsp),%rax
movq 64(%rsp),%rbp
call _mul_1x1
movq %rax,0(%rsp)
movq %rdx,8(%rsp)
movq 40(%rsp),%rax
movq 56(%rsp),%rbp
xorq 48(%rsp),%rax
xorq 64(%rsp),%rbp
call _mul_1x1
movq 0(%rsp),%rbx
movq 8(%rsp),%rcx
movq 16(%rsp),%rdi
movq 24(%rsp),%rsi
movq 32(%rsp),%rbp
xorq %rdx,%rax
xorq %rcx,%rdx
xorq %rbx,%rax
movq %rbx,0(%rbp)
xorq %rdi,%rdx
movq %rsi,24(%rbp)
xorq %rsi,%rax
xorq %rsi,%rdx
xorq %rdx,%rax
movq %rdx,16(%rbp)
movq %rax,8(%rbp)
movq 80(%rsp),%r14
movq 88(%rsp),%r13
movq 96(%rsp),%r12
movq 104(%rsp),%rbp
movq 112(%rsp),%rbx
leaq 136(%rsp),%rsp
.byte 0xf3,0xc3
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16

View File

@@ -0,0 +1,390 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. For the time being it's a fairly mechanical,
# low-hanging port from C... except that it has two code paths: one
# suitable for any x86_64 CPU and a PCLMULQDQ one suitable for
# Westmere and later. The improvement varies from one benchmark and
# µ-arch to another. The vanilla code path is at most 20% faster than
# compiler-generated code [not very impressive], while the PCLMULQDQ
# path is a whole 85%-160% better on 163- and 571-bit ECDH benchmarks
# on Intel CPUs. Keep in mind that these figures are not for
# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
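For reference, bn_GF2m_mul_2x2 computes a carry-less product: operand bits are coefficients of polynomials over GF(2), so partial products combine with XOR and no carries propagate. A bit-serial C sketch of the 64x64->128-bit building block follows (clmul_1x1_ref is an illustrative name; the module's _mul_1x1 below reaches the same result faster via a 16-entry nibble table plus repair of a's top bits, or a single PCLMULQDQ on the fast path):

#include <stdint.h>

/* Carry-less 64x64 -> 128-bit multiply over GF(2)[x]:
 * (*hi,*lo) = a(x)*b(x), with XOR in place of addition. */
static void clmul_1x1_ref(uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t b)
{
    uint64_t h = 0, l = 0;
    int i;
    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;                  /* low half of a * x^i    */
            if (i) h ^= a >> (64 - i);    /* bits shifted past 2^64 */
        }
    }
    *hi = h; *lo = l;
}

The 2x2 routine then needs only three such 1x1 products, Karatsuba-style: a1·b1, a0·b0 and (a0^a1)·(b0^b1), which are exactly the three _mul_1x1 calls in the vanilla path below.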
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
sub \$128+8,%rsp
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # broadcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a1^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
ret
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
mov OPENSSL_ia32cap_P(%rip),%rax
bt \$33,%rax
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
mov %r13,8*11(%rsp)
mov %r12,8*12(%rsp)
mov %rbp,8*13(%rsp)
mov %rbx,8*14(%rsp)
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
mov 8*11(%rsp),%r13
mov 8*12(%rsp),%r12
mov 8*13(%rsp),%rbp
mov 8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
ret
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
.Lin_prologue:
lea 8*17(%rax),%rax
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,784 @@
.text
.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,@function
.align 64
bn_mul_mont_gather5:
testl $3,%r9d
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
movl %r9d,%r9d
movl 8(%rsp),%r10d
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %rsp,%rax
leaq 2(%r9),%r11
negq %r11
leaq (%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
movq %rdx,%r12
movq %r10,%r11
shrq $3,%r10
andq $7,%r11
notq %r10
leaq .Lmagic_masks(%rip),%rax
andq $3,%r10
leaq 96(%r12,%r11,8),%r12
movq 0(%rax,%r10,8),%xmm4
movq 8(%rax,%r10,8),%xmm5
movq 16(%rax,%r10,8),%xmm6
movq 24(%rax,%r10,8),%xmm7
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
.byte 102,72,15,126,195
movq (%r8),%r8
movq (%rsi),%rax
xorq %r14,%r14
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%r13
leaq 1(%r15),%r15
jmp .L1st_enter
.align 16
.L1st:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r13
movq %r10,%r11
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
.L1st_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
leaq 1(%r15),%r15
movq %rdx,%r10
mulq %rbp
cmpq %r9,%r15
jne .L1st
.byte 102,72,15,126,195
addq %rax,%r13
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
movq %r10,%r11
xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)
leaq 1(%r14),%r14
jmp .Louter
.align 16
.Louter:
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq 8(%rsp),%r10
movq %rdx,%r13
leaq 1(%r15),%r15
jmp .Linner_enter
.align 16
.Linner:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
.Linner_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11
leaq 1(%r15),%r15
mulq %rbp
cmpq %r9,%r15
jne .Linner
.byte 102,72,15,126,195
addq %rax,%r13
movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)
leaq 1(%r14),%r14
cmpq %r9,%r14
jl .Louter
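# tp[] now holds the unreduced product. Subtract the modulus into rp,
# then select branchlessly: after .Lsub, %rax carries the final borrow,
# so the and/not/or sequence leaves %rsi pointing at tp (if tp < n) or
# at rp (which already holds tp-n); .Lcopy writes the winner to rp and
# zaps the temporary vector.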
xorq %r14,%r14
movq (%rsp),%rax
leaq (%rsp),%rsi
movq %r9,%r15
jmp .Lsub
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)
movq 8(%rsi,%r14,8),%rax
leaq 1(%r14),%r14
decq %r15
jnz .Lsub
sbbq $0,%rax
xorq %r14,%r14
andq %rax,%rsi
notq %rax
movq %rdi,%rcx
andq %rax,%rcx
movq %r9,%r15
orq %rcx,%rsi
.align 16
.Lcopy:
movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
movq (%rsi),%r15
movq 8(%rsi),%r14
movq 16(%rsi),%r13
movq 24(%rsi),%r12
movq 32(%rsi),%rbp
movq 40(%rsi),%rbx
leaq 48(%rsi),%rsp
.Lmul_epilogue:
.byte 0xf3,0xc3 # rep ret
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
.type bn_mul4x_mont_gather5,@function
.align 16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
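# 4x-unrolled variant of the same algorithm: four limbs per iteration,
# reached only when num is a multiple of 4 and at least 8.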
movl %r9d,%r9d
movl 8(%rsp),%r10d
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %rsp,%rax
leaq 4(%r9),%r11
negq %r11
leaq (%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
movq %r10,%r11
shrq $3,%r10
andq $7,%r11
notq %r10
leaq .Lmagic_masks(%rip),%rax
andq $3,%r10
leaq 96(%r12,%r11,8),%r12
movq 0(%rax,%r10,8),%xmm4
movq 8(%rax,%r10,8),%xmm5
movq 16(%rax,%r10,8),%xmm6
movq 24(%rax,%r10,8),%xmm7
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
.byte 102,72,15,126,195 # movq %xmm0,%rbx (gathered b word becomes the multiplier)
movq (%r8),%r8
movq (%rsi),%rax
xorq %r14,%r14
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 4(%r15),%r15
adcq $0,%rdx
movq %rdi,(%rsp)
movq %rdx,%r13
jmp .L1st4x
.align 16
.L1st4x:
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%r13
mulq %rbx
addq %rax,%r10
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx,%r15,8),%rax
adcq $0,%rdx
leaq 4(%r15),%r15
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq -16(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%r13
cmpq %r9,%r15
jl .L1st4x
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%r13
.byte 102,72,15,126,195 # movq %xmm0,%rbx (next gathered b word)
xorq %rdi,%rdi
addq %r10,%r13
adcq $0,%rdi
movq %r13,-8(%rsp,%r15,8)
movq %rdi,(%rsp,%r15,8)
leaq 1(%r14),%r14
.align 4
.Louter4x:
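# Outer loop of the 4x path; as in the 1x code, the gather of the next
# multiplier word is interleaved with the arithmetic.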
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq (%rsp),%r10
movq %r8,%rbp
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%rsp),%r11
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 4(%r15),%r15
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
.align 16
.Linner4x:
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -16(%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -8(%rsp,%r15,8),%r11
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%r13
mulq %rbx
addq %rax,%r10
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
addq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq 8(%rsp,%r15,8),%r11
adcq $0,%rdx
leaq 4(%r15),%r15
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq -16(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-40(%rsp,%r15,8)
movq %rdx,%r13
cmpq %r9,%r15
jl .Linner4x
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -16(%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -8(%rsp,%r15,8),%r11
adcq $0,%rdx
leaq 1(%r14),%r14
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%r13
.byte 102,72,15,126,195 # movq %xmm0,%rbx (next gathered b word)
movq %rdi,-16(%rsp,%r15,8)
xorq %rdi,%rdi
addq %r10,%r13
adcq $0,%rdi
addq (%rsp,%r9,8),%r13
adcq $0,%rdi
movq %r13,-8(%rsp,%r15,8)
movq %rdi,(%rsp,%r15,8)
cmpq %r9,%r14
jl .Louter4x
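# Final reduction for the 4x path: subtract the modulus four limbs per
# iteration, select tp or the difference branchlessly on the borrow,
# then copy the result out (zapping tp with %xmm0 zeros) 32 bytes at a
# time using SSE2 moves.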
movq 16(%rsp,%r9,8),%rdi
movq 0(%rsp),%rax
pxor %xmm0,%xmm0
movq 8(%rsp),%rdx
shrq $2,%r9
leaq (%rsp),%rsi
xorq %r14,%r14
subq 0(%rcx),%rax
movq 16(%rsi),%rbx
movq 24(%rsi),%rbp
sbbq 8(%rcx),%rdx
leaq -1(%r9),%r15
jmp .Lsub4x
.align 16
.Lsub4x:
movq %rax,0(%rdi,%r14,8)
movq %rdx,8(%rdi,%r14,8)
sbbq 16(%rcx,%r14,8),%rbx
movq 32(%rsi,%r14,8),%rax
movq 40(%rsi,%r14,8),%rdx
sbbq 24(%rcx,%r14,8),%rbp
movq %rbx,16(%rdi,%r14,8)
movq %rbp,24(%rdi,%r14,8)
sbbq 32(%rcx,%r14,8),%rax
movq 48(%rsi,%r14,8),%rbx
movq 56(%rsi,%r14,8),%rbp
sbbq 40(%rcx,%r14,8),%rdx
leaq 4(%r14),%r14
decq %r15
jnz .Lsub4x
movq %rax,0(%rdi,%r14,8)
movq 32(%rsi,%r14,8),%rax
sbbq 16(%rcx,%r14,8),%rbx
movq %rdx,8(%rdi,%r14,8)
sbbq 24(%rcx,%r14,8),%rbp
movq %rbx,16(%rdi,%r14,8)
sbbq $0,%rax
movq %rbp,24(%rdi,%r14,8)
xorq %r14,%r14
andq %rax,%rsi
notq %rax
movq %rdi,%rcx
andq %rax,%rcx
leaq -1(%r9),%r15
orq %rcx,%rsi
movdqu (%rsi),%xmm1
movdqa %xmm0,(%rsp)
movdqu %xmm1,(%rdi)
jmp .Lcopy4x
.align 16
.Lcopy4x:
movdqu 16(%rsi,%r14,1),%xmm2
movdqu 32(%rsi,%r14,1),%xmm1
movdqa %xmm0,16(%rsp,%r14,1)
movdqu %xmm2,16(%rdi,%r14,1)
movdqa %xmm0,32(%rsp,%r14,1)
movdqu %xmm1,32(%rdi,%r14,1)
leaq 32(%r14),%r14
decq %r15
jnz .Lcopy4x
shlq $2,%r9
movdqu 16(%rsi,%r14,1),%xmm2
movdqa %xmm0,16(%rsp,%r14,1)
movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
movq (%rsi),%r15
movq 8(%rsi),%r14
movq 16(%rsi),%r13
movq 24(%rsi),%r12
movq 32(%rsi),%rbp
movq 40(%rsi),%rbx
leaq 48(%rsi),%rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3 # rep ret
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.globl bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
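# bn_scatter5: %rdi = source words, %rsi = count, %rdx = table,
# %rcx = slot index. Stores one word every 256 bytes starting at the
# slot, so each power's limbs are spread across the table rows.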
cmpq $0,%rsi
jz .Lscatter_epilogue
leaq (%rdx,%rcx,8),%rdx
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx
subq $1,%rsi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3 # rep ret
.size bn_scatter5,.-bn_scatter5
.globl bn_gather5
.type bn_gather5,@function
.align 16
bn_gather5:
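# bn_gather5: the inverse of bn_scatter5 (same argument order), reading
# all four candidate cache lines per row and masking with xmm4-xmm7 so
# the access pattern does not depend on the index in %rcx.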
movq %rcx,%r11
shrq $3,%rcx
andq $7,%r11
notq %rcx
leaq .Lmagic_masks(%rip),%rax
andq $3,%rcx
leaq 96(%rdx,%r11,8),%rdx
movq 0(%rax,%rcx,8),%xmm4
movq 8(%rax,%rcx,8),%xmm5
movq 16(%rax,%rcx,8),%xmm6
movq 24(%rax,%rcx,8),%xmm7
jmp .Lgather
.align 16
.Lgather:
movq -96(%rdx),%xmm0
movq -32(%rdx),%xmm1
pand %xmm4,%xmm0
movq 32(%rdx),%xmm2
pand %xmm5,%xmm1
movq 96(%rdx),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%rdx),%rdx
por %xmm3,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subq $1,%rsi
jnz .Lgather
.byte 0xf3,0xc3 # rep ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
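# Four overlapping 64-bit mask sets: for each selector value in 0..3,
# exactly one of the masks loaded into xmm4-xmm7 is all-ones, picking
# which of the four cache lines read per gather survives the combine.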
.align 64
.Lmagic_masks:
.long 0,0, 0,0, 0,0, -1,-1
.long 0,0, 0,0, 0,0, 0,0
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
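# the string above reads "Montgomery Multiplication with scatter/gather
# for x86_64, CRYPTOGAMS by <appro@openssl.org>"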

File diff suppressed because it is too large