#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that the next *lake
# processor, Cannon Lake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#               IALU/gcc-4.8(*) AVX(**)     AVX2    AVX-512
# P4            4.46/+120%      -
# Core 2        2.41/+90%       -
# Westmere      1.88/+120%      -
# Sandy Bridge  1.39/+140%      1.10
# Haswell       1.14/+175%      1.11        0.65
# Skylake[-X]   1.13/+120%      0.96        0.51    [0.35]
# Silvermont    2.83/+95%       -
# Knights L     3.60/?          1.65        1.10    0.41(***)
# Goldmont      1.70/+180%      -
# VIA Nano      1.82/+150%      -
# Sledgehammer  1.38/+160%      -
# Bulldozer     2.30/+130%      0.97
# Ryzen         1.15/+200%      1.08        1.18
#
# (*)   improvement coefficients relative to clang are more modest and
#       are ~50% on most processors; in both cases we are comparing to
#       __int128 code;
# (**)  SSE2 implementation was attempted, but among non-AVX processors
#       it was faster than integer-only code only on older Intel P4 and
#       Core processors, by 30-50%, and the newer the processor, the
#       smaller the gain; it is slower on contemporary ones, for example
#       almost 2x slower on Atom; as the former are naturally
#       disappearing, SSE2 is deemed unnecessary;
# (***) strangely enough performance seems to vary from core to core,
#       listed result is best case;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
    $avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
       `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);   # *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
sub poly1305_iteration {
# input:    copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:   $h0-$h2 *= $r0-$r1
$code.=<<___;
    mulq    $h0         # h0*r1
    mov     %rax,$d2
     mov    $r0,%rax
    mov     %rdx,$d3

    mulq    $h0         # h0*r0
    mov     %rax,$h0    # future $h0
     mov    $r0,%rax
    mov     %rdx,$d1

    mulq    $h1         # h1*r0
    add     %rax,$d2
     mov    $s1,%rax
    adc     %rdx,$d3

    mulq    $h1         # h1*s1
     mov    $h2,$h1     # borrow $h1
    add     %rax,$h0
    adc     %rdx,$d1

    imulq   $s1,$h1     # h2*s1
    add     $h1,$d2
     mov    $d1,$h1
    adc     \$0,$d3

    imulq   $r0,$h2     # h2*r0
    add     $d2,$h1
    mov     \$-4,%rax   # mask value
    adc     $h2,$d3

    and     $d3,%rax    # last reduction step
    mov     $d3,$h2
    shr     \$2,$d3
    and     \$3,$h2
    add     $d3,%rax
    add     %rax,$h0
    adc     \$0,$h1
    adc     \$0,$h2
___
}
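
# A minimal reference sketch, not used by the generator: the exact-arithmetic
# operation that one call of the routine emitted above performs, written with
# the core Math::BigInt module.  The assembly keeps h in three 64-bit limbs
# and only partially reduces it, but the value stays congruent to this result.
sub poly1305_iteration_ref {            # illustrative only, never called
    require Math::BigInt;
    my ($h, $r) = @_;                   # Math::BigInt; r already clamped
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
    return ($h * $r) % $p;              # h *= r mod 2^130-5
}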
########################################################################
# Layout of opaque area is following.
#
#   unsigned __int64 h[3];      # current hash value base 2^64
#   unsigned __int64 r[2];      # key value base 2^64
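
# Reference sketch of the whole scheme that poly1305_init/_blocks/_emit below
# implement; not used by the generator, assumes the core Math::BigInt module
# and full 16-byte blocks (i.e. $padbit==1, which is how poly1305_blocks is
# driven).  r is the clamped key half, nonce the 128-bit value added at the end.
sub poly1305_ref {                      # illustrative only, never called
    require Math::BigInt;
    my ($r, $nonce, @blocks) = @_;      # Math::BigInt r, nonce; 16-byte strings
    my $p   = Math::BigInt->new(2)->bpow(130)->bsub(5);
    my $pad = Math::BigInt->new(2)->bpow(128);  # the per-block padbit
    my $h   = Math::BigInt->new(0);
    foreach my $m (@blocks) {
        my $t = Math::BigInt->from_hex("0x".unpack("H*",scalar reverse $m));
        $h = ($h + $t + $pad) * $r % $p;        # accumulate and multiply
    }
    return ($h + $nonce) % $pad;                # tag = low 128 bits
}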
  $code.=<<___; .text
  .extern    OPENSSL_ia32cap_P
  .globl    poly1305_init .hidden    poly1305_init .globl    poly1305_blocks .hidden    poly1305_blocks .globl    poly1305_emit .hidden    poly1305_emit
  .type    poly1305_init,\@function,3 .align    32 poly1305_init: .cfi_startproc     xor    %rax,%rax     mov    %rax,0($ctx)        # initialize hash value     mov    %rax,8($ctx)     mov    %rax,16($ctx)
      cmp    \$0,$inp     je    .Lno_key
      lea    poly1305_blocks(%rip),%r10     lea    poly1305_emit(%rip),%r11 ___ $code.=<<___    if ($avx);     mov    OPENSSL_ia32cap_P+4(%rip),%r9     lea    poly1305_blocks_avx(%rip),%rax     lea    poly1305_emit_avx(%rip),%rcx     bt    \$`60-32`,%r9        # AVX?     cmovc    %rax,%r10     cmovc    %rcx,%r11 ___ $code.=<<___    if ($avx>1);     lea    poly1305_blocks_avx2(%rip),%rax     bt    \$`5+32`,%r9        # AVX2?     cmovc    %rax,%r10 ___ $code.=<<___    if ($avx>3);     mov    \$`(1<<31|1<<21|1<<16)`,%rax     shr    \$32,%r9     and    %rax,%r9     cmp    %rax,%r9     je    .Linit_base2_44 ___ $code.=<<___;     mov    \$0x0ffffffc0fffffff,%rax     mov    \$0x0ffffffc0ffffffc,%rcx     and    0($inp),%rax     and    8($inp),%rcx     mov    %rax,24($ctx)     mov    %rcx,32($ctx) ___ $code.=<<___    if ($flavour !~ /elf32/);     mov    %r10,0(%rdx)     mov    %r11,8(%rdx) ___ $code.=<<___    if ($flavour =~ /elf32/);     mov    %r10d,0(%rdx)     mov    %r11d,4(%rdx) ___ $code.=<<___;     mov    \$1,%eax .Lno_key:     ret .cfi_endproc .size    poly1305_init,.-poly1305_init
  .type    poly1305_blocks,\@function,4 .align    32 poly1305_blocks: .cfi_startproc .Lblocks:     shr    \$4,$len     jz    .Lno_data        # too short
      push    %rbx .cfi_push    %rbx     push    %rbp .cfi_push    %rbp     push    %r12 .cfi_push    %r12     push    %r13 .cfi_push    %r13     push    %r14 .cfi_push    %r14     push    %r15 .cfi_push    %r15 .Lblocks_body:
      mov    $len,%r15        # reassign $len
      mov    24($ctx),$r0        # load r     mov    32($ctx),$s1
      mov    0($ctx),$h0        # load hash value     mov    8($ctx),$h1     mov    16($ctx),$h2
      mov    $s1,$r1     shr    \$2,$s1     mov    $r1,%rax     add    $r1,$s1            # s1 = r1 + (r1 >> 2)     jmp    .Loop
  .align    32 .Loop:     add    0($inp),$h0        # accumulate input     adc    8($inp),$h1     lea    16($inp),$inp     adc    $padbit,$h2 ___     &poly1305_iteration(); $code.=<<___;     mov    $r1,%rax     dec    %r15            # len-=16     jnz    .Loop
      mov    $h0,0($ctx)        # store hash value     mov    $h1,8($ctx)     mov    $h2,16($ctx)
      mov    0(%rsp),%r15 .cfi_restore    %r15     mov    8(%rsp),%r14 .cfi_restore    %r14     mov    16(%rsp),%r13 .cfi_restore    %r13     mov    24(%rsp),%r12 .cfi_restore    %r12     mov    32(%rsp),%rbp .cfi_restore    %rbp     mov    40(%rsp),%rbx .cfi_restore    %rbx     lea    48(%rsp),%rsp .cfi_adjust_cfa_offset    -48 .Lno_data: .Lblocks_epilogue:     ret .cfi_endproc .size    poly1305_blocks,.-poly1305_blocks
  .type    poly1305_emit,\@function,3 .align    32 poly1305_emit: .cfi_startproc .Lemit:     mov    0($ctx),%r8    # load hash value     mov    8($ctx),%r9     mov    16($ctx),%r10
      mov    %r8,%rax     add    \$5,%r8        # compare to modulus     mov    %r9,%rcx     adc    \$0,%r9     adc    \$0,%r10     shr    \$2,%r10    # did 130-bit value overflow?     cmovnz    %r8,%rax     cmovnz    %r9,%rcx
      add    0($nonce),%rax    # accumulate nonce     adc    8($nonce),%rcx     mov    %rax,0($mac)    # write result     mov    %rcx,8($mac)
      ret .cfi_endproc .size    poly1305_emit,.-poly1305_emit ___ if ($avx) {
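
# Plain-integer sketch of the constant-time final reduction that
# poly1305_emit above (and poly1305_emit_avx below) performs; illustrative
# only, assumes the core Math::BigInt module.  Adding 5 and testing bit 130
# reveals whether h >= 2^130-5; the assembly then selects the matching low
# 128 bits with cmovnz instead of branching.
sub poly1305_emit_ref {                 # illustrative only, never called
    require Math::BigInt;
    my ($h, $nonce) = @_;               # Math::BigInt; h partially reduced
    my $hp5 = $h + 5;
    $h = $hp5 if ($hp5 >> 130) != 0;    # i.e. h was >= 2^130-5
    return ($h + $nonce) % Math::BigInt->new(2)->bpow(128);
}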
########################################################################
# Layout of opaque area is following.
#
#   unsigned __int32 h[5];      # current hash value base 2^26
#   unsigned __int32 is_base2_26;
#   unsigned __int64 r[2];      # key value base 2^64
#   unsigned __int64 pad;
#   struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of powers of the multiplier key. There are
# 5 digits, but the last four are interleaved with multiples of 5, totalling
# 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
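
# Sketch of the base 2^26 splitting used by the table above; illustrative
# only, assumes the core Math::BigInt module.  The 5*r[1..4] entries exist
# because 2^130 = 5 (mod 2^130-5): product terms whose limb indices sum past
# 4 wrap around carrying a factor of 5, so those multiples are precomputed.
sub base2_26_ref {                      # illustrative only, never called
    require Math::BigInt;
    my ($x) = @_;                       # Math::BigInt, x < 2^130
    my @d;
    for (0..4) {
        push @d, ($x % (1<<26))->numify();      # next 26-bit digit
        $x = $x >> 26;
    }
    return @d;                          # x == sum d[i]*2^(26*i)
}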
  my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =     map("%xmm$_",(0..15));
  $code.=<<___; .type    __poly1305_block,\@abi-omnipotent .align    32 __poly1305_block: .cfi_startproc ___     &poly1305_iteration(); $code.=<<___;     ret .cfi_endproc .size    __poly1305_block,.-__poly1305_block
  .type    __poly1305_init_avx,\@abi-omnipotent .align    32 __poly1305_init_avx: .cfi_startproc     mov    $r0,$h0     mov    $r1,$h1     xor    $h2,$h2
      lea    48+64($ctx),$ctx    # size optimization
      mov    $r1,%rax     call    __poly1305_block    # r^2
      mov    \$0x3ffffff,%eax    # save interleaved r^2 and r base 2^26     mov    \$0x3ffffff,%edx     mov    $h0,$d1     and    $h0#d,%eax     mov    $r0,$d2     and    $r0#d,%edx     mov    %eax,`16*0+0-64`($ctx)     shr    \$26,$d1     mov    %edx,`16*0+4-64`($ctx)     shr    \$26,$d2
      mov    \$0x3ffffff,%eax     mov    \$0x3ffffff,%edx     and    $d1#d,%eax     and    $d2#d,%edx     mov    %eax,`16*1+0-64`($ctx)     lea    (%rax,%rax,4),%eax    # *5     mov    %edx,`16*1+4-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     mov    %eax,`16*2+0-64`($ctx)     shr    \$26,$d1     mov    %edx,`16*2+4-64`($ctx)     shr    \$26,$d2
      mov    $h1,%rax     mov    $r1,%rdx     shl    \$12,%rax     shl    \$12,%rdx     or    $d1,%rax     or    $d2,%rdx     and    \$0x3ffffff,%eax     and    \$0x3ffffff,%edx     mov    %eax,`16*3+0-64`($ctx)     lea    (%rax,%rax,4),%eax    # *5     mov    %edx,`16*3+4-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     mov    %eax,`16*4+0-64`($ctx)     mov    $h1,$d1     mov    %edx,`16*4+4-64`($ctx)     mov    $r1,$d2
      mov    \$0x3ffffff,%eax     mov    \$0x3ffffff,%edx     shr    \$14,$d1     shr    \$14,$d2     and    $d1#d,%eax     and    $d2#d,%edx     mov    %eax,`16*5+0-64`($ctx)     lea    (%rax,%rax,4),%eax    # *5     mov    %edx,`16*5+4-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     mov    %eax,`16*6+0-64`($ctx)     shr    \$26,$d1     mov    %edx,`16*6+4-64`($ctx)     shr    \$26,$d2
      mov    $h2,%rax     shl    \$24,%rax     or    %rax,$d1     mov    $d1#d,`16*7+0-64`($ctx)     lea    ($d1,$d1,4),$d1        # *5     mov    $d2#d,`16*7+4-64`($ctx)     lea    ($d2,$d2,4),$d2        # *5     mov    $d1#d,`16*8+0-64`($ctx)     mov    $d2#d,`16*8+4-64`($ctx)
      mov    $r1,%rax     call    __poly1305_block    # r^3
      mov    \$0x3ffffff,%eax    # save r^3 base 2^26     mov    $h0,$d1     and    $h0#d,%eax     shr    \$26,$d1     mov    %eax,`16*0+12-64`($ctx)
      mov    \$0x3ffffff,%edx     and    $d1#d,%edx     mov    %edx,`16*1+12-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     shr    \$26,$d1     mov    %edx,`16*2+12-64`($ctx)
      mov    $h1,%rax     shl    \$12,%rax     or    $d1,%rax     and    \$0x3ffffff,%eax     mov    %eax,`16*3+12-64`($ctx)     lea    (%rax,%rax,4),%eax    # *5     mov    $h1,$d1     mov    %eax,`16*4+12-64`($ctx)
      mov    \$0x3ffffff,%edx     shr    \$14,$d1     and    $d1#d,%edx     mov    %edx,`16*5+12-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     shr    \$26,$d1     mov    %edx,`16*6+12-64`($ctx)
      mov    $h2,%rax     shl    \$24,%rax     or    %rax,$d1     mov    $d1#d,`16*7+12-64`($ctx)     lea    ($d1,$d1,4),$d1        # *5     mov    $d1#d,`16*8+12-64`($ctx)
      mov    $r1,%rax     call    __poly1305_block    # r^4
      mov    \$0x3ffffff,%eax    # save r^4 base 2^26     mov    $h0,$d1     and    $h0#d,%eax     shr    \$26,$d1     mov    %eax,`16*0+8-64`($ctx)
      mov    \$0x3ffffff,%edx     and    $d1#d,%edx     mov    %edx,`16*1+8-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     shr    \$26,$d1     mov    %edx,`16*2+8-64`($ctx)
      mov    $h1,%rax     shl    \$12,%rax     or    $d1,%rax     and    \$0x3ffffff,%eax     mov    %eax,`16*3+8-64`($ctx)     lea    (%rax,%rax,4),%eax    # *5     mov    $h1,$d1     mov    %eax,`16*4+8-64`($ctx)
      mov    \$0x3ffffff,%edx     shr    \$14,$d1     and    $d1#d,%edx     mov    %edx,`16*5+8-64`($ctx)     lea    (%rdx,%rdx,4),%edx    # *5     shr    \$26,$d1     mov    %edx,`16*6+8-64`($ctx)
      mov    $h2,%rax     shl    \$24,%rax     or    %rax,$d1     mov    $d1#d,`16*7+8-64`($ctx)     lea    ($d1,$d1,4),$d1        # *5     mov    $d1#d,`16*8+8-64`($ctx)
      lea    -48-64($ctx),$ctx    # size [de-]optimization     ret .cfi_endproc .size    __poly1305_init_avx,.-__poly1305_init_avx
  .type    poly1305_blocks_avx,\@function,4 .align    32 poly1305_blocks_avx: .cfi_startproc     mov    20($ctx),%r8d        # is_base2_26     cmp    \$128,$len     jae    .Lblocks_avx     test    %r8d,%r8d     jz    .Lblocks
  .Lblocks_avx:     and    \$-16,$len     jz    .Lno_data_avx
      vzeroupper
      test    %r8d,%r8d     jz    .Lbase2_64_avx
      test    \$31,$len     jz    .Leven_avx
      push    %rbx .cfi_push    %rbx     push    %rbp .cfi_push    %rbp     push    %r12 .cfi_push    %r12     push    %r13 .cfi_push    %r13     push    %r14 .cfi_push    %r14     push    %r15 .cfi_push    %r15 .Lblocks_avx_body:
      mov    $len,%r15        # reassign $len
      mov    0($ctx),$d1        # load hash value     mov    8($ctx),$d2     mov    16($ctx),$h2#d
      mov    24($ctx),$r0        # load r     mov    32($ctx),$s1
      ################################# base 2^26 -> base 2^64     mov    $d1#d,$h0#d     and    \$`-1*(1<<31)`,$d1     mov    $d2,$r1            # borrow $r1     mov    $d2#d,$h1#d     and    \$`-1*(1<<31)`,$d2
      shr    \$6,$d1     shl    \$52,$r1     add    $d1,$h0     shr    \$12,$h1     shr    \$18,$d2     add    $r1,$h0     adc    $d2,$h1
      mov    $h2,$d1     shl    \$40,$d1     shr    \$24,$h2     add    $d1,$h1     adc    \$0,$h2            # can be partially reduced...
      mov    \$-4,$d2        # ... so reduce     mov    $h2,$d1     and    $h2,$d2     shr    \$2,$d1     and    \$3,$h2     add    $d2,$d1            # =*5     add    $d1,$h0     adc    \$0,$h1     adc    \$0,$h2
      mov    $s1,$r1     mov    $s1,%rax     shr    \$2,$s1     add    $r1,$s1            # s1 = r1 + (r1 >> 2)
      add    0($inp),$h0        # accumulate input     adc    8($inp),$h1     lea    16($inp),$inp     adc    $padbit,$h2
      call    __poly1305_block
      test    $padbit,$padbit        # if $padbit is zero,     jz    .Lstore_base2_64_avx    # store hash in base 2^64 format
      ################################# base 2^64 -> base 2^26     mov    $h0,%rax     mov    $h0,%rdx     shr    \$52,$h0     mov    $h1,$r0     mov    $h1,$r1     shr    \$26,%rdx     and    \$0x3ffffff,%rax    # h[0]     shl    \$12,$r0     and    \$0x3ffffff,%rdx    # h[1]     shr    \$14,$h1     or    $r0,$h0     shl    \$24,$h2     and    \$0x3ffffff,$h0        # h[2]     shr    \$40,$r1     and    \$0x3ffffff,$h1        # h[3]     or    $r1,$h2            # h[4]
      sub    \$16,%r15     jz    .Lstore_base2_26_avx
      vmovd    %rax#d,$H0     vmovd    %rdx#d,$H1     vmovd    $h0#d,$H2     vmovd    $h1#d,$H3     vmovd    $h2#d,$H4     jmp    .Lproceed_avx
  .align    32 .Lstore_base2_64_avx:     mov    $h0,0($ctx)     mov    $h1,8($ctx)     mov    $h2,16($ctx)        # note that is_base2_26 is zeroed     jmp    .Ldone_avx
  .align    16 .Lstore_base2_26_avx:     mov    %rax#d,0($ctx)        # store hash value base 2^26     mov    %rdx#d,4($ctx)     mov    $h0#d,8($ctx)     mov    $h1#d,12($ctx)     mov    $h2#d,16($ctx) .align    16 .Ldone_avx:     mov    0(%rsp),%r15 .cfi_restore    %r15     mov    8(%rsp),%r14 .cfi_restore    %r14     mov    16(%rsp),%r13 .cfi_restore    %r13     mov    24(%rsp),%r12 .cfi_restore    %r12     mov    32(%rsp),%rbp .cfi_restore    %rbp     mov    40(%rsp),%rbx .cfi_restore    %rbx     lea    48(%rsp),%rsp .cfi_adjust_cfa_offset    -48 .Lno_data_avx: .Lblocks_avx_epilogue:     ret .cfi_endproc
  .align    32 .Lbase2_64_avx: .cfi_startproc     push    %rbx .cfi_push    %rbx     push    %rbp .cfi_push    %rbp     push    %r12 .cfi_push    %r12     push    %r13 .cfi_push    %r13     push    %r14 .cfi_push    %r14     push    %r15 .cfi_push    %r15 .Lbase2_64_avx_body:
      mov    $len,%r15        # reassign $len
      mov    24($ctx),$r0        # load r     mov    32($ctx),$s1
      mov    0($ctx),$h0        # load hash value     mov    8($ctx),$h1     mov    16($ctx),$h2#d
      mov    $s1,$r1     mov    $s1,%rax     shr    \$2,$s1     add    $r1,$s1            # s1 = r1 + (r1 >> 2)
      test    \$31,$len     jz    .Linit_avx
      add    0($inp),$h0        # accumulate input     adc    8($inp),$h1     lea    16($inp),$inp     adc    $padbit,$h2     sub    \$16,%r15
      call    __poly1305_block
  .Linit_avx:     ################################# base 2^64 -> base 2^26     mov    $h0,%rax     mov    $h0,%rdx     shr    \$52,$h0     mov    $h1,$d1     mov    $h1,$d2     shr    \$26,%rdx     and    \$0x3ffffff,%rax    # h[0]     shl    \$12,$d1     and    \$0x3ffffff,%rdx    # h[1]     shr    \$14,$h1     or    $d1,$h0     shl    \$24,$h2     and    \$0x3ffffff,$h0        # h[2]     shr    \$40,$d2     and    \$0x3ffffff,$h1        # h[3]     or    $d2,$h2            # h[4]
      vmovd    %rax#d,$H0     vmovd    %rdx#d,$H1     vmovd    $h0#d,$H2     vmovd    $h1#d,$H3     vmovd    $h2#d,$H4     movl    \$1,20($ctx)        # set is_base2_26
      call    __poly1305_init_avx
  .Lproceed_avx:     mov    %r15,$len
      mov    0(%rsp),%r15 .cfi_restore    %r15     mov    8(%rsp),%r14 .cfi_restore    %r14     mov    16(%rsp),%r13 .cfi_restore    %r13     mov    24(%rsp),%r12 .cfi_restore    %r12     mov    32(%rsp),%rbp .cfi_restore    %rbp     mov    40(%rsp),%rbx .cfi_restore    %rbx     lea    48(%rsp),%rax     lea    48(%rsp),%rsp .cfi_adjust_cfa_offset    -48 .Lbase2_64_avx_epilogue:     jmp    .Ldo_avx .cfi_endproc
  .align    32 .Leven_avx: .cfi_startproc     vmovd        4*0($ctx),$H0        # load hash value     vmovd        4*1($ctx),$H1     vmovd        4*2($ctx),$H2     vmovd        4*3($ctx),$H3     vmovd        4*4($ctx),$H4
  .Ldo_avx: ___ $code.=<<___    if (!$win64);     lea        -0x58(%rsp),%r11 .cfi_def_cfa        %r11,0x60     sub        \$0x178,%rsp ___ $code.=<<___    if ($win64);     lea        -0xf8(%rsp),%r11     sub        \$0x218,%rsp     vmovdqa        %xmm6,0x50(%r11)     vmovdqa        %xmm7,0x60(%r11)     vmovdqa        %xmm8,0x70(%r11)     vmovdqa        %xmm9,0x80(%r11)     vmovdqa        %xmm10,0x90(%r11)     vmovdqa        %xmm11,0xa0(%r11)     vmovdqa        %xmm12,0xb0(%r11)     vmovdqa        %xmm13,0xc0(%r11)     vmovdqa        %xmm14,0xd0(%r11)     vmovdqa        %xmm15,0xe0(%r11) .Ldo_avx_body: ___ $code.=<<___;     sub        \$64,$len     lea        -32($inp),%rax     cmovc        %rax,$inp
      vmovdqu        `16*3`($ctx),$D4    # preload r0^2     lea        `16*3+64`($ctx),$ctx    # size optimization     lea        .Lconst(%rip),%rcx
      ################################################################     # load input     vmovdqu        16*2($inp),$T0     vmovdqu        16*3($inp),$T1     vmovdqa        64(%rcx),$MASK        # .Lmask26
      vpsrldq        \$6,$T0,$T2        # splat input     vpsrldq        \$6,$T1,$T3     vpunpckhqdq    $T1,$T0,$T4        # 4     vpunpcklqdq    $T1,$T0,$T0        # 0:1     vpunpcklqdq    $T3,$T2,$T3        # 2:3
      vpsrlq        \$40,$T4,$T4        # 4     vpsrlq        \$26,$T0,$T1     vpand        $MASK,$T0,$T0        # 0     vpsrlq        \$4,$T3,$T2     vpand        $MASK,$T1,$T1        # 1     vpsrlq        \$30,$T3,$T3     vpand        $MASK,$T2,$T2        # 2     vpand        $MASK,$T3,$T3        # 3     vpor        32(%rcx),$T4,$T4    # padbit, yes, always
      jbe        .Lskip_loop_avx
      # expand and copy pre-calculated table to stack     vmovdqu        `16*1-64`($ctx),$D1     vmovdqu        `16*2-64`($ctx),$D2     vpshufd        \$0xEE,$D4,$D3        # 34xx -> 3434     vpshufd        \$0x44,$D4,$D0        # xx12 -> 1212     vmovdqa        $D3,-0x90(%r11)     vmovdqa        $D0,0x00(%rsp)     vpshufd        \$0xEE,$D1,$D4     vmovdqu        `16*3-64`($ctx),$D0     vpshufd        \$0x44,$D1,$D1     vmovdqa        $D4,-0x80(%r11)     vmovdqa        $D1,0x10(%rsp)     vpshufd        \$0xEE,$D2,$D3     vmovdqu        `16*4-64`($ctx),$D1     vpshufd        \$0x44,$D2,$D2     vmovdqa        $D3,-0x70(%r11)     vmovdqa        $D2,0x20(%rsp)     vpshufd        \$0xEE,$D0,$D4     vmovdqu        `16*5-64`($ctx),$D2     vpshufd        \$0x44,$D0,$D0     vmovdqa        $D4,-0x60(%r11)     vmovdqa        $D0,0x30(%rsp)     vpshufd        \$0xEE,$D1,$D3     vmovdqu        `16*6-64`($ctx),$D0     vpshufd        \$0x44,$D1,$D1     vmovdqa        $D3,-0x50(%r11)     vmovdqa        $D1,0x40(%rsp)     vpshufd        \$0xEE,$D2,$D4     vmovdqu        `16*7-64`($ctx),$D1     vpshufd        \$0x44,$D2,$D2     vmovdqa        $D4,-0x40(%r11)     vmovdqa        $D2,0x50(%rsp)     vpshufd        \$0xEE,$D0,$D3     vmovdqu        `16*8-64`($ctx),$D2     vpshufd        \$0x44,$D0,$D0     vmovdqa        $D3,-0x30(%r11)     vmovdqa        $D0,0x60(%rsp)     vpshufd        \$0xEE,$D1,$D4     vpshufd        \$0x44,$D1,$D1     vmovdqa        $D4,-0x20(%r11)     vmovdqa        $D1,0x70(%rsp)     vpshufd        \$0xEE,$D2,$D3      vmovdqa    0x00(%rsp),$D4        # preload r0^2     vpshufd        \$0x44,$D2,$D2     vmovdqa        $D3,-0x10(%r11)     vmovdqa        $D2,0x80(%rsp)
      jmp        .Loop_avx
.align  32
.Loop_avx:
    ################################################################
    # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
    # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
    #   \___________________/
    # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
    # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
    #   \___________________/ \____________________/
    #
    # Note that we start with inp[2:3]*r^2. This is because it
    # doesn't depend on reduction in previous iteration.
    ################################################################
    # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
    #
    # though note that $Tx and $Hx are "reversed" in this section,
    # and $D4 is preloaded with r0^2...
    vpmuludq    $T0,$D4,$D0         # d0 = h0*r0
    vpmuludq    $T1,$D4,$D1         # d1 = h1*r0
      vmovdqa   $H2,0x20(%r11)              # offload hash
    vpmuludq    $T2,$D4,$D2         # d2 = h2*r0
      vmovdqa   0x10(%rsp),$H2      # r1^2
    vpmuludq    $T3,$D4,$D3         # d3 = h3*r0
    vpmuludq    $T4,$D4,$D4         # d4 = h4*r0
        vmovdqa    $H0,0x00(%r11)                #     vpmuludq    0x20(%rsp),$T4,$H0    # h4*s1       vmovdqa    $H1,0x10(%r11)                #     vpmuludq    $T3,$H2,$H1        # h3*r1     vpaddq        $H0,$D0,$D0        # d0 += h4*s1     vpaddq        $H1,$D4,$D4        # d4 += h3*r1       vmovdqa    $H3,0x30(%r11)                #     vpmuludq    $T2,$H2,$H0        # h2*r1     vpmuludq    $T1,$H2,$H1        # h1*r1     vpaddq        $H0,$D3,$D3        # d3 += h2*r1      vmovdqa    0x30(%rsp),$H3        # r2^2     vpaddq        $H1,$D2,$D2        # d2 += h1*r1       vmovdqa    $H4,0x40(%r11)                #     vpmuludq    $T0,$H2,$H2        # h0*r1      vpmuludq    $T2,$H3,$H0        # h2*r2     vpaddq        $H2,$D1,$D1        # d1 += h0*r1
       vmovdqa    0x40(%rsp),$H4        # s2^2     vpaddq        $H0,$D4,$D4        # d4 += h2*r2     vpmuludq    $T1,$H3,$H1        # h1*r2     vpmuludq    $T0,$H3,$H3        # h0*r2     vpaddq        $H1,$D3,$D3        # d3 += h1*r2      vmovdqa    0x50(%rsp),$H2        # r3^2     vpaddq        $H3,$D2,$D2        # d2 += h0*r2     vpmuludq    $T4,$H4,$H0        # h4*s2     vpmuludq    $T3,$H4,$H4        # h3*s2     vpaddq        $H0,$D1,$D1        # d1 += h4*s2      vmovdqa    0x60(%rsp),$H3        # s3^2     vpaddq        $H4,$D0,$D0        # d0 += h3*s2
       vmovdqa    0x80(%rsp),$H4        # s4^2     vpmuludq    $T1,$H2,$H1        # h1*r3     vpmuludq    $T0,$H2,$H2        # h0*r3     vpaddq        $H1,$D4,$D4        # d4 += h1*r3     vpaddq        $H2,$D3,$D3        # d3 += h0*r3     vpmuludq    $T4,$H3,$H0        # h4*s3     vpmuludq    $T3,$H3,$H1        # h3*s3     vpaddq        $H0,$D2,$D2        # d2 += h4*s3      vmovdqu    16*0($inp),$H0                # load input     vpaddq        $H1,$D1,$D1        # d1 += h3*s3     vpmuludq    $T2,$H3,$H3        # h2*s3      vpmuludq    $T2,$H4,$T2        # h2*s4     vpaddq        $H3,$D0,$D0        # d0 += h2*s3
       vmovdqu    16*1($inp),$H1                #     vpaddq        $T2,$D1,$D1        # d1 += h2*s4     vpmuludq    $T3,$H4,$T3        # h3*s4     vpmuludq    $T4,$H4,$T4        # h4*s4      vpsrldq    \$6,$H0,$H2                # splat input     vpaddq        $T3,$D2,$D2        # d2 += h3*s4     vpaddq        $T4,$D3,$D3        # d3 += h4*s4      vpsrldq    \$6,$H1,$H3                #     vpmuludq    0x70(%rsp),$T0,$T4    # h0*r4     vpmuludq    $T1,$H4,$T0        # h1*s4      vpunpckhqdq    $H1,$H0,$H4        # 4     vpaddq        $T4,$D4,$D4        # d4 += h0*r4      vmovdqa    -0x90(%r11),$T4        # r0^4     vpaddq        $T0,$D0,$D0        # d0 += h1*s4
      vpunpcklqdq    $H1,$H0,$H0        # 0:1     vpunpcklqdq    $H3,$H2,$H3        # 2:3
      #vpsrlq        \$40,$H4,$H4        # 4     vpsrldq        \$`40/8`,$H4,$H4    # 4     vpsrlq        \$26,$H0,$H1     vpand        $MASK,$H0,$H0        # 0     vpsrlq        \$4,$H3,$H2     vpand        $MASK,$H1,$H1        # 1     vpand        0(%rcx),$H4,$H4        # .Lmask24     vpsrlq        \$30,$H3,$H3     vpand        $MASK,$H2,$H2        # 2     vpand        $MASK,$H3,$H3        # 3     vpor        32(%rcx),$H4,$H4    # padbit, yes, always
      vpaddq        0x00(%r11),$H0,$H0    # add hash value     vpaddq        0x10(%r11),$H1,$H1     vpaddq        0x20(%r11),$H2,$H2     vpaddq        0x30(%r11),$H3,$H3     vpaddq        0x40(%r11),$H4,$H4
      lea        16*2($inp),%rax     lea        16*4($inp),$inp     sub        \$64,$len     cmovc        %rax,$inp
    ################################################################
    # Now we accumulate (inp[0:1]+hash)*r^4
    ################################################################
    # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
      vpmuludq    $H0,$T4,$T0        # h0*r0     vpmuludq    $H1,$T4,$T1        # h1*r0     vpaddq        $T0,$D0,$D0     vpaddq        $T1,$D1,$D1      vmovdqa    -0x80(%r11),$T2        # r1^4     vpmuludq    $H2,$T4,$T0        # h2*r0     vpmuludq    $H3,$T4,$T1        # h3*r0     vpaddq        $T0,$D2,$D2     vpaddq        $T1,$D3,$D3     vpmuludq    $H4,$T4,$T4        # h4*r0      vpmuludq    -0x70(%r11),$H4,$T0    # h4*s1     vpaddq        $T4,$D4,$D4
      vpaddq        $T0,$D0,$D0        # d0 += h4*s1     vpmuludq    $H2,$T2,$T1        # h2*r1     vpmuludq    $H3,$T2,$T0        # h3*r1     vpaddq        $T1,$D3,$D3        # d3 += h2*r1      vmovdqa    -0x60(%r11),$T3        # r2^4     vpaddq        $T0,$D4,$D4        # d4 += h3*r1     vpmuludq    $H1,$T2,$T1        # h1*r1     vpmuludq    $H0,$T2,$T2        # h0*r1     vpaddq        $T1,$D2,$D2        # d2 += h1*r1     vpaddq        $T2,$D1,$D1        # d1 += h0*r1
       vmovdqa    -0x50(%r11),$T4        # s2^4     vpmuludq    $H2,$T3,$T0        # h2*r2     vpmuludq    $H1,$T3,$T1        # h1*r2     vpaddq        $T0,$D4,$D4        # d4 += h2*r2     vpaddq        $T1,$D3,$D3        # d3 += h1*r2      vmovdqa    -0x40(%r11),$T2        # r3^4     vpmuludq    $H0,$T3,$T3        # h0*r2     vpmuludq    $H4,$T4,$T0        # h4*s2     vpaddq        $T3,$D2,$D2        # d2 += h0*r2     vpaddq        $T0,$D1,$D1        # d1 += h4*s2      vmovdqa    -0x30(%r11),$T3        # s3^4     vpmuludq    $H3,$T4,$T4        # h3*s2      vpmuludq    $H1,$T2,$T1        # h1*r3     vpaddq        $T4,$D0,$D0        # d0 += h3*s2
       vmovdqa    -0x10(%r11),$T4        # s4^4     vpaddq        $T1,$D4,$D4        # d4 += h1*r3     vpmuludq    $H0,$T2,$T2        # h0*r3     vpmuludq    $H4,$T3,$T0        # h4*s3     vpaddq        $T2,$D3,$D3        # d3 += h0*r3     vpaddq        $T0,$D2,$D2        # d2 += h4*s3      vmovdqu    16*2($inp),$T0                # load input     vpmuludq    $H3,$T3,$T2        # h3*s3     vpmuludq    $H2,$T3,$T3        # h2*s3     vpaddq        $T2,$D1,$D1        # d1 += h3*s3      vmovdqu    16*3($inp),$T1                #     vpaddq        $T3,$D0,$D0        # d0 += h2*s3
      vpmuludq    $H2,$T4,$H2        # h2*s4     vpmuludq    $H3,$T4,$H3        # h3*s4      vpsrldq    \$6,$T0,$T2                # splat input     vpaddq        $H2,$D1,$D1        # d1 += h2*s4     vpmuludq    $H4,$T4,$H4        # h4*s4      vpsrldq    \$6,$T1,$T3                #     vpaddq        $H3,$D2,$H2        # h2 = d2 + h3*s4     vpaddq        $H4,$D3,$H3        # h3 = d3 + h4*s4     vpmuludq    -0x20(%r11),$H0,$H4    # h0*r4     vpmuludq    $H1,$T4,$H0      vpunpckhqdq    $T1,$T0,$T4        # 4     vpaddq        $H4,$D4,$H4        # h4 = d4 + h0*r4     vpaddq        $H0,$D0,$H0        # h0 = d0 + h1*s4
      vpunpcklqdq    $T1,$T0,$T0        # 0:1     vpunpcklqdq    $T3,$T2,$T3        # 2:3
      #vpsrlq        \$40,$T4,$T4        # 4     vpsrldq        \$`40/8`,$T4,$T4    # 4     vpsrlq        \$26,$T0,$T1      vmovdqa    0x00(%rsp),$D4        # preload r0^2     vpand        $MASK,$T0,$T0        # 0     vpsrlq        \$4,$T3,$T2     vpand        $MASK,$T1,$T1        # 1     vpand        0(%rcx),$T4,$T4        # .Lmask24     vpsrlq        \$30,$T3,$T3     vpand        $MASK,$T2,$T2        # 2     vpand        $MASK,$T3,$T3        # 3     vpor        32(%rcx),$T4,$T4    # padbit, yes, always
    ################################################################
    # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
    # and P. Schwabe
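    #
    # The chain below makes two interleaved carry passes, h0->h1->h2->h3->h4
    # and h3->h4->h0->h1, so every limb drops back to 26 bits plus a small
    # excess.  The carry out of h4 re-enters at h0 multiplied by 5 (add,
    # then shift left by 2 and add again), because 2^130 = 5 mod 2^130-5.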
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$D1,$H1        # h0 -> h1
      vpsrlq        \$26,$H4,$D0     vpand        $MASK,$H4,$H4
      vpsrlq        \$26,$H1,$D1     vpand        $MASK,$H1,$H1     vpaddq        $D1,$H2,$H2        # h1 -> h2
      vpaddq        $D0,$H0,$H0     vpsllq        \$2,$D0,$D0     vpaddq        $D0,$H0,$H0        # h4 -> h0
      vpsrlq        \$26,$H2,$D2     vpand        $MASK,$H2,$H2     vpaddq        $D2,$H3,$H3        # h2 -> h3
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$H1,$H1        # h0 -> h1
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
      ja        .Loop_avx
  .Lskip_loop_avx:     ################################################################     # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
      vpshufd        \$0x10,$D4,$D4        # r0^n, xx12 -> x1x2     add        \$32,$len     jnz        .Long_tail_avx
      vpaddq        $H2,$T2,$T2     vpaddq        $H0,$T0,$T0     vpaddq        $H1,$T1,$T1     vpaddq        $H3,$T3,$T3     vpaddq        $H4,$T4,$T4
  .Long_tail_avx:     vmovdqa        $H2,0x20(%r11)     vmovdqa        $H0,0x00(%r11)     vmovdqa        $H1,0x10(%r11)     vmovdqa        $H3,0x30(%r11)     vmovdqa        $H4,0x40(%r11)
    # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
      vpmuludq    $T2,$D4,$D2        # d2 = h2*r0     vpmuludq    $T0,$D4,$D0        # d0 = h0*r0      vpshufd    \$0x10,`16*1-64`($ctx),$H2        # r1^n     vpmuludq    $T1,$D4,$D1        # d1 = h1*r0     vpmuludq    $T3,$D4,$D3        # d3 = h3*r0     vpmuludq    $T4,$D4,$D4        # d4 = h4*r0
      vpmuludq    $T3,$H2,$H0        # h3*r1     vpaddq        $H0,$D4,$D4        # d4 += h3*r1      vpshufd    \$0x10,`16*2-64`($ctx),$H3        # s1^n     vpmuludq    $T2,$H2,$H1        # h2*r1     vpaddq        $H1,$D3,$D3        # d3 += h2*r1      vpshufd    \$0x10,`16*3-64`($ctx),$H4        # r2^n     vpmuludq    $T1,$H2,$H0        # h1*r1     vpaddq        $H0,$D2,$D2        # d2 += h1*r1     vpmuludq    $T0,$H2,$H2        # h0*r1     vpaddq        $H2,$D1,$D1        # d1 += h0*r1     vpmuludq    $T4,$H3,$H3        # h4*s1     vpaddq        $H3,$D0,$D0        # d0 += h4*s1
       vpshufd    \$0x10,`16*4-64`($ctx),$H2        # s2^n     vpmuludq    $T2,$H4,$H1        # h2*r2     vpaddq        $H1,$D4,$D4        # d4 += h2*r2     vpmuludq    $T1,$H4,$H0        # h1*r2     vpaddq        $H0,$D3,$D3        # d3 += h1*r2      vpshufd    \$0x10,`16*5-64`($ctx),$H3        # r3^n     vpmuludq    $T0,$H4,$H4        # h0*r2     vpaddq        $H4,$D2,$D2        # d2 += h0*r2     vpmuludq    $T4,$H2,$H1        # h4*s2     vpaddq        $H1,$D1,$D1        # d1 += h4*s2      vpshufd    \$0x10,`16*6-64`($ctx),$H4        # s3^n     vpmuludq    $T3,$H2,$H2        # h3*s2     vpaddq        $H2,$D0,$D0        # d0 += h3*s2
      vpmuludq    $T1,$H3,$H0        # h1*r3     vpaddq        $H0,$D4,$D4        # d4 += h1*r3     vpmuludq    $T0,$H3,$H3        # h0*r3     vpaddq        $H3,$D3,$D3        # d3 += h0*r3      vpshufd    \$0x10,`16*7-64`($ctx),$H2        # r4^n     vpmuludq    $T4,$H4,$H1        # h4*s3     vpaddq        $H1,$D2,$D2        # d2 += h4*s3      vpshufd    \$0x10,`16*8-64`($ctx),$H3        # s4^n     vpmuludq    $T3,$H4,$H0        # h3*s3     vpaddq        $H0,$D1,$D1        # d1 += h3*s3     vpmuludq    $T2,$H4,$H4        # h2*s3     vpaddq        $H4,$D0,$D0        # d0 += h2*s3
      vpmuludq    $T0,$H2,$H2        # h0*r4     vpaddq        $H2,$D4,$D4        # h4 = d4 + h0*r4     vpmuludq    $T4,$H3,$H1        # h4*s4     vpaddq        $H1,$D3,$D3        # h3 = d3 + h4*s4     vpmuludq    $T3,$H3,$H0        # h3*s4     vpaddq        $H0,$D2,$D2        # h2 = d2 + h3*s4     vpmuludq    $T2,$H3,$H1        # h2*s4     vpaddq        $H1,$D1,$D1        # h1 = d1 + h2*s4     vpmuludq    $T1,$H3,$H3        # h1*s4     vpaddq        $H3,$D0,$D0        # h0 = d0 + h1*s4
      jz        .Lshort_tail_avx
      vmovdqu        16*0($inp),$H0        # load input     vmovdqu        16*1($inp),$H1
      vpsrldq        \$6,$H0,$H2        # splat input     vpsrldq        \$6,$H1,$H3     vpunpckhqdq    $H1,$H0,$H4        # 4     vpunpcklqdq    $H1,$H0,$H0        # 0:1     vpunpcklqdq    $H3,$H2,$H3        # 2:3
      vpsrlq        \$40,$H4,$H4        # 4     vpsrlq        \$26,$H0,$H1     vpand        $MASK,$H0,$H0        # 0     vpsrlq        \$4,$H3,$H2     vpand        $MASK,$H1,$H1        # 1     vpsrlq        \$30,$H3,$H3     vpand        $MASK,$H2,$H2        # 2     vpand        $MASK,$H3,$H3        # 3     vpor        32(%rcx),$H4,$H4    # padbit, yes, always
      vpshufd        \$0x32,`16*0-64`($ctx),$T4    # r0^n, 34xx -> x3x4     vpaddq        0x00(%r11),$H0,$H0     vpaddq        0x10(%r11),$H1,$H1     vpaddq        0x20(%r11),$H2,$H2     vpaddq        0x30(%r11),$H3,$H3     vpaddq        0x40(%r11),$H4,$H4
      ################################################################     # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
      vpmuludq    $H0,$T4,$T0        # h0*r0     vpaddq        $T0,$D0,$D0        # d0 += h0*r0     vpmuludq    $H1,$T4,$T1        # h1*r0     vpaddq        $T1,$D1,$D1        # d1 += h1*r0     vpmuludq    $H2,$T4,$T0        # h2*r0     vpaddq        $T0,$D2,$D2        # d2 += h2*r0      vpshufd    \$0x32,`16*1-64`($ctx),$T2        # r1^n     vpmuludq    $H3,$T4,$T1        # h3*r0     vpaddq        $T1,$D3,$D3        # d3 += h3*r0     vpmuludq    $H4,$T4,$T4        # h4*r0     vpaddq        $T4,$D4,$D4        # d4 += h4*r0
      vpmuludq    $H3,$T2,$T0        # h3*r1     vpaddq        $T0,$D4,$D4        # d4 += h3*r1      vpshufd    \$0x32,`16*2-64`($ctx),$T3        # s1     vpmuludq    $H2,$T2,$T1        # h2*r1     vpaddq        $T1,$D3,$D3        # d3 += h2*r1      vpshufd    \$0x32,`16*3-64`($ctx),$T4        # r2     vpmuludq    $H1,$T2,$T0        # h1*r1     vpaddq        $T0,$D2,$D2        # d2 += h1*r1     vpmuludq    $H0,$T2,$T2        # h0*r1     vpaddq        $T2,$D1,$D1        # d1 += h0*r1     vpmuludq    $H4,$T3,$T3        # h4*s1     vpaddq        $T3,$D0,$D0        # d0 += h4*s1
       vpshufd    \$0x32,`16*4-64`($ctx),$T2        # s2     vpmuludq    $H2,$T4,$T1        # h2*r2     vpaddq        $T1,$D4,$D4        # d4 += h2*r2     vpmuludq    $H1,$T4,$T0        # h1*r2     vpaddq        $T0,$D3,$D3        # d3 += h1*r2      vpshufd    \$0x32,`16*5-64`($ctx),$T3        # r3     vpmuludq    $H0,$T4,$T4        # h0*r2     vpaddq        $T4,$D2,$D2        # d2 += h0*r2     vpmuludq    $H4,$T2,$T1        # h4*s2     vpaddq        $T1,$D1,$D1        # d1 += h4*s2      vpshufd    \$0x32,`16*6-64`($ctx),$T4        # s3     vpmuludq    $H3,$T2,$T2        # h3*s2     vpaddq        $T2,$D0,$D0        # d0 += h3*s2
      vpmuludq    $H1,$T3,$T0        # h1*r3     vpaddq        $T0,$D4,$D4        # d4 += h1*r3     vpmuludq    $H0,$T3,$T3        # h0*r3     vpaddq        $T3,$D3,$D3        # d3 += h0*r3      vpshufd    \$0x32,`16*7-64`($ctx),$T2        # r4     vpmuludq    $H4,$T4,$T1        # h4*s3     vpaddq        $T1,$D2,$D2        # d2 += h4*s3      vpshufd    \$0x32,`16*8-64`($ctx),$T3        # s4     vpmuludq    $H3,$T4,$T0        # h3*s3     vpaddq        $T0,$D1,$D1        # d1 += h3*s3     vpmuludq    $H2,$T4,$T4        # h2*s3     vpaddq        $T4,$D0,$D0        # d0 += h2*s3
      vpmuludq    $H0,$T2,$T2        # h0*r4     vpaddq        $T2,$D4,$D4        # d4 += h0*r4     vpmuludq    $H4,$T3,$T1        # h4*s4     vpaddq        $T1,$D3,$D3        # d3 += h4*s4     vpmuludq    $H3,$T3,$T0        # h3*s4     vpaddq        $T0,$D2,$D2        # d2 += h3*s4     vpmuludq    $H2,$T3,$T1        # h2*s4     vpaddq        $T1,$D1,$D1        # d1 += h2*s4     vpmuludq    $H1,$T3,$T3        # h1*s4     vpaddq        $T3,$D0,$D0        # d0 += h1*s4
  .Lshort_tail_avx:     ################################################################     # horizontal addition
      vpsrldq        \$8,$D4,$T4     vpsrldq        \$8,$D3,$T3     vpsrldq        \$8,$D1,$T1     vpsrldq        \$8,$D0,$T0     vpsrldq        \$8,$D2,$T2     vpaddq        $T3,$D3,$D3     vpaddq        $T4,$D4,$D4     vpaddq        $T0,$D0,$D0     vpaddq        $T1,$D1,$D1     vpaddq        $T2,$D2,$D2
      ################################################################     # lazy reduction
      vpsrlq        \$26,$D3,$H3     vpand        $MASK,$D3,$D3     vpaddq        $H3,$D4,$D4        # h3 -> h4
      vpsrlq        \$26,$D0,$H0     vpand        $MASK,$D0,$D0     vpaddq        $H0,$D1,$D1        # h0 -> h1
      vpsrlq        \$26,$D4,$H4     vpand        $MASK,$D4,$D4
      vpsrlq        \$26,$D1,$H1     vpand        $MASK,$D1,$D1     vpaddq        $H1,$D2,$D2        # h1 -> h2
      vpaddq        $H4,$D0,$D0     vpsllq        \$2,$H4,$H4     vpaddq        $H4,$D0,$D0        # h4 -> h0
      vpsrlq        \$26,$D2,$H2     vpand        $MASK,$D2,$D2     vpaddq        $H2,$D3,$D3        # h2 -> h3
      vpsrlq        \$26,$D0,$H0     vpand        $MASK,$D0,$D0     vpaddq        $H0,$D1,$D1        # h0 -> h1
      vpsrlq        \$26,$D3,$H3     vpand        $MASK,$D3,$D3     vpaddq        $H3,$D4,$D4        # h3 -> h4
      vmovd        $D0,`4*0-48-64`($ctx)    # save partially reduced     vmovd        $D1,`4*1-48-64`($ctx)     vmovd        $D2,`4*2-48-64`($ctx)     vmovd        $D3,`4*3-48-64`($ctx)     vmovd        $D4,`4*4-48-64`($ctx) ___ $code.=<<___    if ($win64);     vmovdqa        0x50(%r11),%xmm6     vmovdqa        0x60(%r11),%xmm7     vmovdqa        0x70(%r11),%xmm8     vmovdqa        0x80(%r11),%xmm9     vmovdqa        0x90(%r11),%xmm10     vmovdqa        0xa0(%r11),%xmm11     vmovdqa        0xb0(%r11),%xmm12     vmovdqa        0xc0(%r11),%xmm13     vmovdqa        0xd0(%r11),%xmm14     vmovdqa        0xe0(%r11),%xmm15     lea        0xf8(%r11),%rsp .Ldo_avx_epilogue: ___ $code.=<<___    if (!$win64);     lea        0x58(%r11),%rsp .cfi_def_cfa        %rsp,8 ___ $code.=<<___;     vzeroupper     ret .cfi_endproc .size    poly1305_blocks_avx,.-poly1305_blocks_avx
  .type    poly1305_emit_avx,\@function,3 .align    32 poly1305_emit_avx: .cfi_startproc     cmpl    \$0,20($ctx)    # is_base2_26?     je    .Lemit
      mov    0($ctx),%eax    # load hash value base 2^26     mov    4($ctx),%ecx     mov    8($ctx),%r8d     mov    12($ctx),%r11d     mov    16($ctx),%r10d
      shl    \$26,%rcx    # base 2^26 -> base 2^64     mov    %r8,%r9     shl    \$52,%r8     add    %rcx,%rax     shr    \$12,%r9     add    %rax,%r8    # h0     adc    \$0,%r9
      shl    \$14,%r11     mov    %r10,%rax     shr    \$24,%r10     add    %r11,%r9     shl    \$40,%rax     add    %rax,%r9    # h1     adc    \$0,%r10    # h2
      mov    %r10,%rax    # could be partially reduced, so reduce     mov    %r10,%rcx     and    \$3,%r10     shr    \$2,%rax     and    \$-4,%rcx     add    %rcx,%rax     add    %rax,%r8     adc    \$0,%r9     adc    \$0,%r10
      mov    %r8,%rax     add    \$5,%r8        # compare to modulus     mov    %r9,%rcx     adc    \$0,%r9     adc    \$0,%r10     shr    \$2,%r10    # did 130-bit value overflow?     cmovnz    %r8,%rax     cmovnz    %r9,%rcx
      add    0($nonce),%rax    # accumulate nonce     adc    8($nonce),%rcx     mov    %rax,0($mac)    # write result     mov    %rcx,8($mac)
      ret .cfi_endproc .size    poly1305_emit_avx,.-poly1305_emit_avx ___
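
# Sketch of the base 2^26 -> base 2^64 recombination done by poly1305_emit_avx
# above (and by the .Lbase2_64 prologues of the blocks routines); illustrative
# only, assumes the core Math::BigInt module.  The top digit may hold a small
# excess, which the assembly folds back in multiplied by 5; here the value is
# simply reduced mod 2^130-5, to which it is congruent.
sub base2_26_to_int_ref {               # illustrative only, never called
    require Math::BigInt;
    my @d = @_;                         # five digits, least significant first
    my $h = Math::BigInt->new(0);
    for (my $i = 4; $i >= 0; $i--) {
        $h = ($h << 26) + $d[$i];       # h = sum d[i]*2^(26*i)
    }
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
    return $h % $p;
}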
  if ($avx>1) { my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =     map("%ymm$_",(0..15)); my $S4=$MASK;
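
# Sketch of the 4-way split the AVX2 code below relies on, cf. the comment at
# .Loop_avx2; illustrative only, assumes the core Math::BigInt module and a
# block count that is a multiple of 4.  Lane i accumulates blocks i, i+4,
# i+8, ... with multiplications by r^4, and the lanes are finally weighted by
# r^4..r^1, which equals the serial Horner evaluation.
sub poly1305_4way_ref {                 # illustrative only, never called
    require Math::BigInt;
    my ($r, @m) = @_;                   # Math::BigInt r and padded block values
    my $p  = Math::BigInt->new(2)->bpow(130)->bsub(5);
    my $r4 = $r->copy->bpow(4)->bmod($p);
    my @lane = map { Math::BigInt->new(0) } 0..3;
    for (my $i = 0; $i < @m; $i += 4) {
        $lane[$_] = ($lane[$_] * $r4 + $m[$i+$_]) % $p for 0..3;
    }
    my $h = Math::BigInt->new(0);
    $h = ($h + $lane[$_] * $r->copy->bpow(4-$_)->bmod($p)) % $p for 0..3;
    return $h;                          # matches the serial evaluation
}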
  $code.=<<___; .type    poly1305_blocks_avx2,\@function,4 .align    32 poly1305_blocks_avx2: .cfi_startproc     mov    20($ctx),%r8d        # is_base2_26     cmp    \$128,$len     jae    .Lblocks_avx2     test    %r8d,%r8d     jz    .Lblocks
  .Lblocks_avx2:     and    \$-16,$len     jz    .Lno_data_avx2
      vzeroupper
      test    %r8d,%r8d     jz    .Lbase2_64_avx2
      test    \$63,$len     jz    .Leven_avx2
      push    %rbx .cfi_push    %rbx     push    %rbp .cfi_push    %rbp     push    %r12 .cfi_push    %r12     push    %r13 .cfi_push    %r13     push    %r14 .cfi_push    %r14     push    %r15 .cfi_push    %r15 .Lblocks_avx2_body:
      mov    $len,%r15        # reassign $len
      mov    0($ctx),$d1        # load hash value     mov    8($ctx),$d2     mov    16($ctx),$h2#d
      mov    24($ctx),$r0        # load r     mov    32($ctx),$s1
      ################################# base 2^26 -> base 2^64     mov    $d1#d,$h0#d     and    \$`-1*(1<<31)`,$d1     mov    $d2,$r1            # borrow $r1     mov    $d2#d,$h1#d     and    \$`-1*(1<<31)`,$d2
      shr    \$6,$d1     shl    \$52,$r1     add    $d1,$h0     shr    \$12,$h1     shr    \$18,$d2     add    $r1,$h0     adc    $d2,$h1
      mov    $h2,$d1     shl    \$40,$d1     shr    \$24,$h2     add    $d1,$h1     adc    \$0,$h2            # can be partially reduced...
      mov    \$-4,$d2        # ... so reduce     mov    $h2,$d1     and    $h2,$d2     shr    \$2,$d1     and    \$3,$h2     add    $d2,$d1            # =*5     add    $d1,$h0     adc    \$0,$h1     adc    \$0,$h2
      mov    $s1,$r1     mov    $s1,%rax     shr    \$2,$s1     add    $r1,$s1            # s1 = r1 + (r1 >> 2)
  .Lbase2_26_pre_avx2:     add    0($inp),$h0        # accumulate input     adc    8($inp),$h1     lea    16($inp),$inp     adc    $padbit,$h2     sub    \$16,%r15
      call    __poly1305_block     mov    $r1,%rax
      test    \$63,%r15     jnz    .Lbase2_26_pre_avx2
      test    $padbit,$padbit        # if $padbit is zero,     jz    .Lstore_base2_64_avx2    # store hash in base 2^64 format
      ################################# base 2^64 -> base 2^26     mov    $h0,%rax     mov    $h0,%rdx     shr    \$52,$h0     mov    $h1,$r0     mov    $h1,$r1     shr    \$26,%rdx     and    \$0x3ffffff,%rax    # h[0]     shl    \$12,$r0     and    \$0x3ffffff,%rdx    # h[1]     shr    \$14,$h1     or    $r0,$h0     shl    \$24,$h2     and    \$0x3ffffff,$h0        # h[2]     shr    \$40,$r1     and    \$0x3ffffff,$h1        # h[3]     or    $r1,$h2            # h[4]
      test    %r15,%r15     jz    .Lstore_base2_26_avx2
      vmovd    %rax#d,%x#$H0     vmovd    %rdx#d,%x#$H1     vmovd    $h0#d,%x#$H2     vmovd    $h1#d,%x#$H3     vmovd    $h2#d,%x#$H4     jmp    .Lproceed_avx2
  .align    32 .Lstore_base2_64_avx2:     mov    $h0,0($ctx)     mov    $h1,8($ctx)     mov    $h2,16($ctx)        # note that is_base2_26 is zeroed     jmp    .Ldone_avx2
  .align    16 .Lstore_base2_26_avx2:     mov    %rax#d,0($ctx)        # store hash value base 2^26     mov    %rdx#d,4($ctx)     mov    $h0#d,8($ctx)     mov    $h1#d,12($ctx)     mov    $h2#d,16($ctx) .align    16 .Ldone_avx2:     mov    0(%rsp),%r15 .cfi_restore    %r15     mov    8(%rsp),%r14 .cfi_restore    %r14     mov    16(%rsp),%r13 .cfi_restore    %r13     mov    24(%rsp),%r12 .cfi_restore    %r12     mov    32(%rsp),%rbp .cfi_restore    %rbp     mov    40(%rsp),%rbx .cfi_restore    %rbx     lea    48(%rsp),%rsp .cfi_adjust_cfa_offset    -48 .Lno_data_avx2: .Lblocks_avx2_epilogue:     ret .cfi_endproc
  .align    32 .Lbase2_64_avx2: .cfi_startproc     push    %rbx .cfi_push    %rbx     push    %rbp .cfi_push    %rbp     push    %r12 .cfi_push    %r12     push    %r13 .cfi_push    %r13     push    %r14 .cfi_push    %r14     push    %r15 .cfi_push    %r15 .Lbase2_64_avx2_body:
      mov    $len,%r15        # reassign $len
      mov    24($ctx),$r0        # load r     mov    32($ctx),$s1
      mov    0($ctx),$h0        # load hash value     mov    8($ctx),$h1     mov    16($ctx),$h2#d
      mov    $s1,$r1     mov    $s1,%rax     shr    \$2,$s1     add    $r1,$s1            # s1 = r1 + (r1 >> 2)
      test    \$63,$len     jz    .Linit_avx2
  .Lbase2_64_pre_avx2:     add    0($inp),$h0        # accumulate input     adc    8($inp),$h1     lea    16($inp),$inp     adc    $padbit,$h2     sub    \$16,%r15
      call    __poly1305_block     mov    $r1,%rax
      test    \$63,%r15     jnz    .Lbase2_64_pre_avx2
  .Linit_avx2:     ################################# base 2^64 -> base 2^26     mov    $h0,%rax     mov    $h0,%rdx     shr    \$52,$h0     mov    $h1,$d1     mov    $h1,$d2     shr    \$26,%rdx     and    \$0x3ffffff,%rax    # h[0]     shl    \$12,$d1     and    \$0x3ffffff,%rdx    # h[1]     shr    \$14,$h1     or    $d1,$h0     shl    \$24,$h2     and    \$0x3ffffff,$h0        # h[2]     shr    \$40,$d2     and    \$0x3ffffff,$h1        # h[3]     or    $d2,$h2            # h[4]
      vmovd    %rax#d,%x#$H0     vmovd    %rdx#d,%x#$H1     vmovd    $h0#d,%x#$H2     vmovd    $h1#d,%x#$H3     vmovd    $h2#d,%x#$H4     movl    \$1,20($ctx)        # set is_base2_26
      call    __poly1305_init_avx
  .Lproceed_avx2:     mov    %r15,$len            # restore $len     mov    OPENSSL_ia32cap_P+8(%rip),%r10d     mov    \$`(1<<31|1<<30|1<<16)`,%r11d
      mov    0(%rsp),%r15 .cfi_restore    %r15     mov    8(%rsp),%r14 .cfi_restore    %r14     mov    16(%rsp),%r13 .cfi_restore    %r13     mov    24(%rsp),%r12 .cfi_restore    %r12     mov    32(%rsp),%rbp .cfi_restore    %rbp     mov    40(%rsp),%rbx .cfi_restore    %rbx     lea    48(%rsp),%rax     lea    48(%rsp),%rsp .cfi_adjust_cfa_offset    -48 .Lbase2_64_avx2_epilogue:     jmp    .Ldo_avx2 .cfi_endproc
  .align    32 .Leven_avx2: .cfi_startproc     mov        OPENSSL_ia32cap_P+8(%rip),%r10d     vmovd        4*0($ctx),%x#$H0    # load hash value base 2^26     vmovd        4*1($ctx),%x#$H1     vmovd        4*2($ctx),%x#$H2     vmovd        4*3($ctx),%x#$H3     vmovd        4*4($ctx),%x#$H4
  .Ldo_avx2: ___ $code.=<<___        if ($avx>2);     cmp        \$512,$len     jb        .Lskip_avx512     and        %r11d,%r10d     test        \$`1<<16`,%r10d        # check for AVX512F     jnz        .Lblocks_avx512 .Lskip_avx512: ___ $code.=<<___    if (!$win64);     lea        -8(%rsp),%r11 .cfi_def_cfa        %r11,16     sub        \$0x128,%rsp ___ $code.=<<___    if ($win64);     lea        -0xf8(%rsp),%r11     sub        \$0x1c8,%rsp     vmovdqa        %xmm6,0x50(%r11)     vmovdqa        %xmm7,0x60(%r11)     vmovdqa        %xmm8,0x70(%r11)     vmovdqa        %xmm9,0x80(%r11)     vmovdqa        %xmm10,0x90(%r11)     vmovdqa        %xmm11,0xa0(%r11)     vmovdqa        %xmm12,0xb0(%r11)     vmovdqa        %xmm13,0xc0(%r11)     vmovdqa        %xmm14,0xd0(%r11)     vmovdqa        %xmm15,0xe0(%r11) .Ldo_avx2_body: ___ $code.=<<___;     lea        .Lconst(%rip),%rcx     lea        48+64($ctx),$ctx    # size optimization     vmovdqa        96(%rcx),$T0        # .Lpermd_avx2
      # expand and copy pre-calculated table to stack     vmovdqu        `16*0-64`($ctx),%x#$T2     and        \$-512,%rsp     vmovdqu        `16*1-64`($ctx),%x#$T3     vmovdqu        `16*2-64`($ctx),%x#$T4     vmovdqu        `16*3-64`($ctx),%x#$D0     vmovdqu        `16*4-64`($ctx),%x#$D1     vmovdqu        `16*5-64`($ctx),%x#$D2     lea        0x90(%rsp),%rax        # size optimization     vmovdqu        `16*6-64`($ctx),%x#$D3     vpermd        $T2,$T0,$T2        # 00003412 -> 14243444     vmovdqu        `16*7-64`($ctx),%x#$D4     vpermd        $T3,$T0,$T3     vmovdqu        `16*8-64`($ctx),%x#$MASK     vpermd        $T4,$T0,$T4     vmovdqa        $T2,0x00(%rsp)     vpermd        $D0,$T0,$D0     vmovdqa        $T3,0x20-0x90(%rax)     vpermd        $D1,$T0,$D1     vmovdqa        $T4,0x40-0x90(%rax)     vpermd        $D2,$T0,$D2     vmovdqa        $D0,0x60-0x90(%rax)     vpermd        $D3,$T0,$D3     vmovdqa        $D1,0x80-0x90(%rax)     vpermd        $D4,$T0,$D4     vmovdqa        $D2,0xa0-0x90(%rax)     vpermd        $MASK,$T0,$MASK     vmovdqa        $D3,0xc0-0x90(%rax)     vmovdqa        $D4,0xe0-0x90(%rax)     vmovdqa        $MASK,0x100-0x90(%rax)     vmovdqa        64(%rcx),$MASK        # .Lmask26
      ################################################################     # load input     vmovdqu        16*0($inp),%x#$T0     vmovdqu        16*1($inp),%x#$T1     vinserti128    \$1,16*2($inp),$T0,$T0     vinserti128    \$1,16*3($inp),$T1,$T1     lea        16*4($inp),$inp
      vpsrldq        \$6,$T0,$T2        # splat input     vpsrldq        \$6,$T1,$T3     vpunpckhqdq    $T1,$T0,$T4        # 4     vpunpcklqdq    $T3,$T2,$T2        # 2:3     vpunpcklqdq    $T1,$T0,$T0        # 0:1
      vpsrlq        \$30,$T2,$T3     vpsrlq        \$4,$T2,$T2     vpsrlq        \$26,$T0,$T1     vpsrlq        \$40,$T4,$T4        # 4     vpand        $MASK,$T2,$T2        # 2     vpand        $MASK,$T0,$T0        # 0     vpand        $MASK,$T1,$T1        # 1     vpand        $MASK,$T3,$T3        # 3     vpor        32(%rcx),$T4,$T4    # padbit, yes, always
      vpaddq        $H2,$T2,$H2        # accumulate input     sub        \$64,$len     jz        .Ltail_avx2     jmp        .Loop_avx2
.align  32
.Loop_avx2:
    ################################################################
    # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
    # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
    # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
    # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
    #   \________/\__________/
    ################################################################
    #vpaddq     $H2,$T2,$H2         # accumulate input
    vpaddq      $H0,$T0,$H0
    vmovdqa     `32*0`(%rsp),$T0    # r0^4
    vpaddq      $H1,$T1,$H1
    vmovdqa     `32*1`(%rsp),$T1    # r1^4
    vpaddq      $H3,$T3,$H3
    vmovdqa     `32*3`(%rsp),$T2    # r2^4
    vpaddq      $H4,$T4,$H4
    vmovdqa     `32*6-0x90`(%rax),$T3   # s3^4
    vmovdqa     `32*8-0x90`(%rax),$S4   # s4^4
    # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
    # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
    # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
    # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
    #
    # however, as h2 is "chronologically" the first one available, pull
    # the corresponding operations up, so it's
    #
    # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
    # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
    # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
    # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
    # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
      vpmuludq    $H2,$T0,$D2        # d2 = h2*r0     vpmuludq    $H2,$T1,$D3        # d3 = h2*r1     vpmuludq    $H2,$T2,$D4        # d4 = h2*r2     vpmuludq    $H2,$T3,$D0        # d0 = h2*s3     vpmuludq    $H2,$S4,$D1        # d1 = h2*s4
      vpmuludq    $H0,$T1,$T4        # h0*r1     vpmuludq    $H1,$T1,$H2        # h1*r1, borrow $H2 as temp     vpaddq        $T4,$D1,$D1        # d1 += h0*r1     vpaddq        $H2,$D2,$D2        # d2 += h1*r1     vpmuludq    $H3,$T1,$T4        # h3*r1     vpmuludq    `32*2`(%rsp),$H4,$H2    # h4*s1     vpaddq        $T4,$D4,$D4        # d4 += h3*r1     vpaddq        $H2,$D0,$D0        # d0 += h4*s1      vmovdqa    `32*4-0x90`(%rax),$T1    # s2
      vpmuludq    $H0,$T0,$T4        # h0*r0     vpmuludq    $H1,$T0,$H2        # h1*r0     vpaddq        $T4,$D0,$D0        # d0 += h0*r0     vpaddq        $H2,$D1,$D1        # d1 += h1*r0     vpmuludq    $H3,$T0,$T4        # h3*r0     vpmuludq    $H4,$T0,$H2        # h4*r0      vmovdqu    16*0($inp),%x#$T0    # load input     vpaddq        $T4,$D3,$D3        # d3 += h3*r0     vpaddq        $H2,$D4,$D4        # d4 += h4*r0      vinserti128    \$1,16*2($inp),$T0,$T0
      vpmuludq    $H3,$T1,$T4        # h3*s2     vpmuludq    $H4,$T1,$H2        # h4*s2      vmovdqu    16*1($inp),%x#$T1     vpaddq        $T4,$D0,$D0        # d0 += h3*s2     vpaddq        $H2,$D1,$D1        # d1 += h4*s2      vmovdqa    `32*5-0x90`(%rax),$H2    # r3     vpmuludq    $H1,$T2,$T4        # h1*r2     vpmuludq    $H0,$T2,$T2        # h0*r2     vpaddq        $T4,$D3,$D3        # d3 += h1*r2     vpaddq        $T2,$D2,$D2        # d2 += h0*r2      vinserti128    \$1,16*3($inp),$T1,$T1      lea        16*4($inp),$inp
      vpmuludq    $H1,$H2,$T4        # h1*r3     vpmuludq    $H0,$H2,$H2        # h0*r3      vpsrldq    \$6,$T0,$T2        # splat input     vpaddq        $T4,$D4,$D4        # d4 += h1*r3     vpaddq        $H2,$D3,$D3        # d3 += h0*r3     vpmuludq    $H3,$T3,$T4        # h3*s3     vpmuludq    $H4,$T3,$H2        # h4*s3      vpsrldq    \$6,$T1,$T3     vpaddq        $T4,$D1,$D1        # d1 += h3*s3     vpaddq        $H2,$D2,$D2        # d2 += h4*s3      vpunpckhqdq    $T1,$T0,$T4        # 4
    vpmuludq    $H3,$S4,$H3        # h3*s4     vpmuludq    $H4,$S4,$H4        # h4*s4      vpunpcklqdq    $T1,$T0,$T0        # 0:1     vpaddq        $H3,$D2,$H2        # h2 = d2 + h3*s4     vpaddq        $H4,$D3,$H3        # h3 = d3 + h4*s4      vpunpcklqdq    $T3,$T2,$T3        # 2:3     vpmuludq    `32*7-0x90`(%rax),$H0,$H4    # h0*r4     vpmuludq    $H1,$S4,$H0        # h1*s4     vmovdqa        64(%rcx),$MASK        # .Lmask26     vpaddq        $H4,$D4,$H4        # h4 = d4 + h0*r4     vpaddq        $H0,$D0,$H0        # h0 = d0 + h1*s4
      ################################################################     # lazy reduction (interleaved with tail of input splat)
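    ################################################################
    # Roughly, the carry chain below is
    #
    #   h3->h4, h0->h1, h1->h2, h4->h0, h2->h3, h0->h1, h3->h4
    #
    # where the h4->h0 step is the 2^130 wrap, i.e. carry*5 computed
    # as carry + (carry<<2).  After one pass every limb is back under
    # 26 bits plus a small spill, which is enough headroom for the
    # next round of multiplications.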
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$D1,$H1        # h0 -> h1
      vpsrlq        \$26,$H4,$D4     vpand        $MASK,$H4,$H4
       vpsrlq        \$4,$T3,$T2
      vpsrlq        \$26,$H1,$D1     vpand        $MASK,$H1,$H1     vpaddq        $D1,$H2,$H2        # h1 -> h2
      vpaddq        $D4,$H0,$H0     vpsllq        \$2,$D4,$D4     vpaddq        $D4,$H0,$H0        # h4 -> h0
       vpand        $MASK,$T2,$T2        # 2      vpsrlq        \$26,$T0,$T1
      vpsrlq        \$26,$H2,$D2     vpand        $MASK,$H2,$H2     vpaddq        $D2,$H3,$H3        # h2 -> h3
       vpaddq        $T2,$H2,$H2        # modulo-scheduled      vpsrlq        \$30,$T3,$T3
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$H1,$H1        # h0 -> h1
       vpsrlq        \$40,$T4,$T4        # 4
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
       vpand        $MASK,$T0,$T0        # 0      vpand        $MASK,$T1,$T1        # 1      vpand        $MASK,$T3,$T3        # 3      vpor        32(%rcx),$T4,$T4    # padbit, yes, always
      sub        \$64,$len     jnz        .Loop_avx2
    .byte        0x66,0x90 .Ltail_avx2:     ################################################################     # while the above multiplications were by r^4 in all lanes, in the     # last iteration we multiply the least significant lane by r^4 and     # the most significant one by r, so this is a copy of the above,     # except that references to the precomputed table are displaced by 4...
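    ################################################################
    # Worked example with a four-block message m0..m3, one block per
    # lane: Poly1305 computes
    #
    #   H = (((m0*r + m1)*r + m2)*r + m3)*r
    #     = m0*r^4 + m1*r^3 + m2*r^2 + m3*r^1    (mod 2^130-5)
    #
    # so the final pass multiplies lane 0 by r^4, lane 1 by r^3,
    # lane 2 by r^2 and lane 3 by r before the horizontal addition.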
      #vpaddq        $H2,$T2,$H2        # accumulate input     vpaddq        $H0,$T0,$H0     vmovdqu        `32*0+4`(%rsp),$T0    # r0^4     vpaddq        $H1,$T1,$H1     vmovdqu        `32*1+4`(%rsp),$T1    # r1^4     vpaddq        $H3,$T3,$H3     vmovdqu        `32*3+4`(%rsp),$T2    # r2^4     vpaddq        $H4,$T4,$H4     vmovdqu        `32*6+4-0x90`(%rax),$T3    # s3^4     vmovdqu        `32*8+4-0x90`(%rax),$S4    # s4^4
      vpmuludq    $H2,$T0,$D2        # d2 = h2*r0     vpmuludq    $H2,$T1,$D3        # d3 = h2*r1     vpmuludq    $H2,$T2,$D4        # d4 = h2*r2     vpmuludq    $H2,$T3,$D0        # d0 = h2*s3     vpmuludq    $H2,$S4,$D1        # d1 = h2*s4
      vpmuludq    $H0,$T1,$T4        # h0*r1     vpmuludq    $H1,$T1,$H2        # h1*r1     vpaddq        $T4,$D1,$D1        # d1 += h0*r1     vpaddq        $H2,$D2,$D2        # d2 += h1*r1     vpmuludq    $H3,$T1,$T4        # h3*r1     vpmuludq    `32*2+4`(%rsp),$H4,$H2    # h4*s1     vpaddq        $T4,$D4,$D4        # d4 += h3*r1     vpaddq        $H2,$D0,$D0        # d0 += h4*s1
      vpmuludq    $H0,$T0,$T4        # h0*r0     vpmuludq    $H1,$T0,$H2        # h1*r0     vpaddq        $T4,$D0,$D0        # d0 += h0*r0      vmovdqu    `32*4+4-0x90`(%rax),$T1    # s2     vpaddq        $H2,$D1,$D1        # d1 += h1*r0     vpmuludq    $H3,$T0,$T4        # h3*r0     vpmuludq    $H4,$T0,$H2        # h4*r0     vpaddq        $T4,$D3,$D3        # d3 += h3*r0     vpaddq        $H2,$D4,$D4        # d4 += h4*r0
      vpmuludq    $H3,$T1,$T4        # h3*s2     vpmuludq    $H4,$T1,$H2        # h4*s2     vpaddq        $T4,$D0,$D0        # d0 += h3*s2     vpaddq        $H2,$D1,$D1        # d1 += h4*s2      vmovdqu    `32*5+4-0x90`(%rax),$H2    # r3     vpmuludq    $H1,$T2,$T4        # h1*r2     vpmuludq    $H0,$T2,$T2        # h0*r2     vpaddq        $T4,$D3,$D3        # d3 += h1*r2     vpaddq        $T2,$D2,$D2        # d2 += h0*r2
      vpmuludq    $H1,$H2,$T4        # h1*r3     vpmuludq    $H0,$H2,$H2        # h0*r3     vpaddq        $T4,$D4,$D4        # d4 += h1*r3     vpaddq        $H2,$D3,$D3        # d3 += h0*r3     vpmuludq    $H3,$T3,$T4        # h3*s3     vpmuludq    $H4,$T3,$H2        # h4*s3     vpaddq        $T4,$D1,$D1        # d1 += h3*s3     vpaddq        $H2,$D2,$D2        # d2 += h4*s3
    vpmuludq    $H3,$S4,$H3        # h3*s4     vpmuludq    $H4,$S4,$H4        # h4*s4     vpaddq        $H3,$D2,$H2        # h2 = d2 + h3*s4     vpaddq        $H4,$D3,$H3        # h3 = d3 + h4*s4     vpmuludq    `32*7+4-0x90`(%rax),$H0,$H4        # h0*r4     vpmuludq    $H1,$S4,$H0        # h1*s4     vmovdqa        64(%rcx),$MASK        # .Lmask26     vpaddq        $H4,$D4,$H4        # h4 = d4 + h0*r4     vpaddq        $H0,$D0,$H0        # h0 = d0 + h1*s4
      ################################################################     # horizontal addition
      vpsrldq        \$8,$D1,$T1     vpsrldq        \$8,$H2,$T2     vpsrldq        \$8,$H3,$T3     vpsrldq        \$8,$H4,$T4     vpsrldq        \$8,$H0,$T0     vpaddq        $T1,$D1,$D1     vpaddq        $T2,$H2,$H2     vpaddq        $T3,$H3,$H3     vpaddq        $T4,$H4,$H4     vpaddq        $T0,$H0,$H0
      vpermq        \$0x2,$H3,$T3     vpermq        \$0x2,$H4,$T4     vpermq        \$0x2,$H0,$T0     vpermq        \$0x2,$D1,$T1     vpermq        \$0x2,$H2,$T2     vpaddq        $T3,$H3,$H3     vpaddq        $T4,$H4,$H4     vpaddq        $T0,$H0,$H0     vpaddq        $T1,$D1,$D1     vpaddq        $T2,$H2,$H2
      ################################################################     # lazy reduction
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$D1,$H1        # h0 -> h1
      vpsrlq        \$26,$H4,$D4     vpand        $MASK,$H4,$H4
      vpsrlq        \$26,$H1,$D1     vpand        $MASK,$H1,$H1     vpaddq        $D1,$H2,$H2        # h1 -> h2
      vpaddq        $D4,$H0,$H0     vpsllq        \$2,$D4,$D4     vpaddq        $D4,$H0,$H0        # h4 -> h0
      vpsrlq        \$26,$H2,$D2     vpand        $MASK,$H2,$H2     vpaddq        $D2,$H3,$H3        # h2 -> h3
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0     vpaddq        $D0,$H1,$H1        # h0 -> h1
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
      vmovd        %x#$H0,`4*0-48-64`($ctx)# save partially reduced     vmovd        %x#$H1,`4*1-48-64`($ctx)     vmovd        %x#$H2,`4*2-48-64`($ctx)     vmovd        %x#$H3,`4*3-48-64`($ctx)     vmovd        %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___    if ($win64);     vmovdqa        0x50(%r11),%xmm6     vmovdqa        0x60(%r11),%xmm7     vmovdqa        0x70(%r11),%xmm8     vmovdqa        0x80(%r11),%xmm9     vmovdqa        0x90(%r11),%xmm10     vmovdqa        0xa0(%r11),%xmm11     vmovdqa        0xb0(%r11),%xmm12     vmovdqa        0xc0(%r11),%xmm13     vmovdqa        0xd0(%r11),%xmm14     vmovdqa        0xe0(%r11),%xmm15     lea        0xf8(%r11),%rsp .Ldo_avx2_epilogue: ___ $code.=<<___    if (!$win64);     lea        8(%r11),%rsp .cfi_def_cfa        %rsp,8 ___ $code.=<<___;     vzeroupper     ret .cfi_endproc .size    poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ ####################################################################### if ($avx>2) { # On entry we have input length divisible by 64. But since inner loop # processes 128 bytes per iteration, cases when length is not divisible # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame...
  my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30";
  map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));        # switch to %zmm domain map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK));
  $code.=<<___; .type    poly1305_blocks_avx512,\@function,4 .align    32 poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512:     mov        \$15,%eax     kmovw        %eax,%k2 ___ $code.=<<___    if (!$win64);     lea        -8(%rsp),%r11 .cfi_def_cfa        %r11,16     sub        \$0x128,%rsp ___ $code.=<<___    if ($win64);     lea        -0xf8(%rsp),%r11     sub        \$0x1c8,%rsp     vmovdqa        %xmm6,0x50(%r11)     vmovdqa        %xmm7,0x60(%r11)     vmovdqa        %xmm8,0x70(%r11)     vmovdqa        %xmm9,0x80(%r11)     vmovdqa        %xmm10,0x90(%r11)     vmovdqa        %xmm11,0xa0(%r11)     vmovdqa        %xmm12,0xb0(%r11)     vmovdqa        %xmm13,0xc0(%r11)     vmovdqa        %xmm14,0xd0(%r11)     vmovdqa        %xmm15,0xe0(%r11) .Ldo_avx512_body: ___ $code.=<<___;     lea        .Lconst(%rip),%rcx     lea        48+64($ctx),$ctx    # size optimization     vmovdqa        96(%rcx),%y#$T2        # .Lpermd_avx2
      # expand pre-calculated table     vmovdqu        `16*0-64`($ctx),%x#$D0    # will become expanded ${R0}     and        \$-512,%rsp     vmovdqu        `16*1-64`($ctx),%x#$D1    # will become ... ${R1}     mov        \$0x20,%rax     vmovdqu        `16*2-64`($ctx),%x#$T0    # ... ${S1}     vmovdqu        `16*3-64`($ctx),%x#$D2    # ... ${R2}     vmovdqu        `16*4-64`($ctx),%x#$T1    # ... ${S2}     vmovdqu        `16*5-64`($ctx),%x#$D3    # ... ${R3}     vmovdqu        `16*6-64`($ctx),%x#$T3    # ... ${S3}     vmovdqu        `16*7-64`($ctx),%x#$D4    # ... ${R4}     vmovdqu        `16*8-64`($ctx),%x#$T4    # ... ${S4}     vpermd        $D0,$T2,$R0        # 00003412 -> 14243444     vpbroadcastq    64(%rcx),$MASK        # .Lmask26     vpermd        $D1,$T2,$R1     vpermd        $T0,$T2,$S1     vpermd        $D2,$T2,$R2     vmovdqa64    $R0,0x00(%rsp){%k2}    # save in case $len%128 != 0      vpsrlq        \$32,$R0,$T0        # 14243444 -> 01020304     vpermd        $T1,$T2,$S2     vmovdqu64    $R1,0x00(%rsp,%rax){%k2}      vpsrlq        \$32,$R1,$T1     vpermd        $D3,$T2,$R3     vmovdqa64    $S1,0x40(%rsp){%k2}     vpermd        $T3,$T2,$S3     vpermd        $D4,$T2,$R4     vmovdqu64    $R2,0x40(%rsp,%rax){%k2}     vpermd        $T4,$T2,$S4     vmovdqa64    $S2,0x80(%rsp){%k2}     vmovdqu64    $R3,0x80(%rsp,%rax){%k2}     vmovdqa64    $S3,0xc0(%rsp){%k2}     vmovdqu64    $R4,0xc0(%rsp,%rax){%k2}     vmovdqa64    $S4,0x100(%rsp){%k2}
      ################################################################     # calculate 5th through 8th powers of the key     #     # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1     # d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2     # d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3     # d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4     # d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
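    ################################################################
    # The multiplication below pairs each lane's power r^k, k=1..4,
    # with r^4, so that after the lazy reduction the $D0-$D4 lanes
    # hold
    #
    #   r^k * r^4 = r^(k+4)    (mod 2^130-5)
    #
    # i.e. the 5th through 8th powers needed for the 8-way loop.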
      vpmuludq    $T0,$R0,$D0        # d0 = r0'*r0     vpmuludq    $T0,$R1,$D1        # d1 = r0'*r1     vpmuludq    $T0,$R2,$D2        # d2 = r0'*r2     vpmuludq    $T0,$R3,$D3        # d3 = r0'*r3     vpmuludq    $T0,$R4,$D4        # d4 = r0'*r4      vpsrlq        \$32,$R2,$T2
      vpmuludq    $T1,$S4,$M0     vpmuludq    $T1,$R0,$M1     vpmuludq    $T1,$R1,$M2     vpmuludq    $T1,$R2,$M3     vpmuludq    $T1,$R3,$M4      vpsrlq        \$32,$R3,$T3     vpaddq        $M0,$D0,$D0        # d0 += r1'*5*r4     vpaddq        $M1,$D1,$D1        # d1 += r1'*r0     vpaddq        $M2,$D2,$D2        # d2 += r1'*r1     vpaddq        $M3,$D3,$D3        # d3 += r1'*r2     vpaddq        $M4,$D4,$D4        # d4 += r1'*r3
      vpmuludq    $T2,$S3,$M0     vpmuludq    $T2,$S4,$M1     vpmuludq    $T2,$R1,$M3     vpmuludq    $T2,$R2,$M4     vpmuludq    $T2,$R0,$M2      vpsrlq        \$32,$R4,$T4     vpaddq        $M0,$D0,$D0        # d0 += r2'*5*r3     vpaddq        $M1,$D1,$D1        # d1 += r2'*5*r4     vpaddq        $M3,$D3,$D3        # d3 += r2'*r1     vpaddq        $M4,$D4,$D4        # d4 += r2'*r2     vpaddq        $M2,$D2,$D2        # d2 += r2'*r0
      vpmuludq    $T3,$S2,$M0     vpmuludq    $T3,$R0,$M3     vpmuludq    $T3,$R1,$M4     vpmuludq    $T3,$S3,$M1     vpmuludq    $T3,$S4,$M2     vpaddq        $M0,$D0,$D0        # d0 += r3'*5*r2     vpaddq        $M3,$D3,$D3        # d3 += r3'*r0     vpaddq        $M4,$D4,$D4        # d4 += r3'*r1     vpaddq        $M1,$D1,$D1        # d1 += r3'*5*r3     vpaddq        $M2,$D2,$D2        # d2 += r3'*5*r4
    vpmuludq    $T4,$S4,$M3     vpmuludq    $T4,$R0,$M4     vpmuludq    $T4,$S1,$M0     vpmuludq    $T4,$S2,$M1     vpmuludq    $T4,$S3,$M2     vpaddq        $M3,$D3,$D3        # d3 += r4'*5*r4     vpaddq        $M4,$D4,$D4        # d4 += r4'*r0     vpaddq        $M0,$D0,$D0        # d0 += r4'*5*r1     vpaddq        $M1,$D1,$D1        # d1 += r4'*5*r2     vpaddq        $M2,$D2,$D2        # d2 += r4'*5*r3
      ################################################################     # load input     vmovdqu64    16*0($inp),%z#$T3     vmovdqu64    16*4($inp),%z#$T4     lea        16*8($inp),$inp
      ################################################################     # lazy reduction
      vpsrlq        \$26,$D3,$M3     vpandq        $MASK,$D3,$D3     vpaddq        $M3,$D4,$D4        # d3 -> d4
      vpsrlq        \$26,$D0,$M0     vpandq        $MASK,$D0,$D0     vpaddq        $M0,$D1,$D1        # d0 -> d1
      vpsrlq        \$26,$D4,$M4     vpandq        $MASK,$D4,$D4
      vpsrlq        \$26,$D1,$M1     vpandq        $MASK,$D1,$D1     vpaddq        $M1,$D2,$D2        # d1 -> d2
      vpaddq        $M4,$D0,$D0     vpsllq        \$2,$M4,$M4     vpaddq        $M4,$D0,$D0        # d4 -> d0
      vpsrlq        \$26,$D2,$M2     vpandq        $MASK,$D2,$D2     vpaddq        $M2,$D3,$D3        # d2 -> d3
      vpsrlq        \$26,$D0,$M0     vpandq        $MASK,$D0,$D0     vpaddq        $M0,$D1,$D1        # d0 -> d1
      vpsrlq        \$26,$D3,$M3     vpandq        $MASK,$D3,$D3     vpaddq        $M3,$D4,$D4        # d3 -> d4
      ################################################################     # at this point we have 14243444 in $R0-$S4 and 05060708 in     # $D0-$D4, ...
      vpunpcklqdq    $T4,$T3,$T0    # transpose input     vpunpckhqdq    $T4,$T3,$T4
      # ... since input 64-bit lanes are ordered as 73625140, we could     # "vperm" it to 76543210 (here and in each loop iteration), *or*     # we could just flow along, hence the goal for $R0-$S4 is     # 1858286838784888 ...
      vmovdqa32    128(%rcx),$M0        # .Lpermd_avx512:     mov        \$0x7777,%eax     kmovw        %eax,%k1
      vpermd        $R0,$M0,$R0        # 14243444 -> 1---2---3---4---     vpermd        $R1,$M0,$R1     vpermd        $R2,$M0,$R2     vpermd        $R3,$M0,$R3     vpermd        $R4,$M0,$R4
      vpermd        $D0,$M0,${R0}{%k1}    # 05060708 -> 1858286838784888     vpermd        $D1,$M0,${R1}{%k1}     vpermd        $D2,$M0,${R2}{%k1}     vpermd        $D3,$M0,${R3}{%k1}     vpermd        $D4,$M0,${R4}{%k1}
      vpslld        \$2,$R1,$S1        # *5     vpslld        \$2,$R2,$S2     vpslld        \$2,$R3,$S3     vpslld        \$2,$R4,$S4     vpaddd        $R1,$S1,$S1     vpaddd        $R2,$S2,$S2     vpaddd        $R3,$S3,$S3     vpaddd        $R4,$S4,$S4
      vpbroadcastq    32(%rcx),$PADBIT    # .L129
      vpsrlq        \$52,$T0,$T2        # splat input     vpsllq        \$12,$T4,$T3     vporq        $T3,$T2,$T2     vpsrlq        \$26,$T0,$T1     vpsrlq        \$14,$T4,$T3     vpsrlq        \$40,$T4,$T4        # 4     vpandq        $MASK,$T2,$T2        # 2     vpandq        $MASK,$T0,$T0        # 0     #vpandq        $MASK,$T1,$T1        # 1     #vpandq        $MASK,$T3,$T3        # 3     #vporq        $PADBIT,$T4,$T4        # padbit, yes, always
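    ################################################################
    # For reference, the splat above breaks each 128-bit block
    # (lo = bits 0..63 in $T0, hi = bits 64..127 in $T4 after the
    # transpose) into five 26-bit limbs:
    #
    #   limb0 =  lo                & (2^26-1)    # bits   0..25
    #   limb1 = (lo>>26)           & (2^26-1)    # bits  26..51
    #   limb2 = (lo>>52 | hi<<12)  & (2^26-1)    # bits  52..77
    #   limb3 = (hi>>14)           & (2^26-1)    # bits  78..103
    #   limb4 = (hi>>40) | 1<<24                 # bits 104..127 + padbit
    #
    # with the masks on limb1/limb3 and the padbit OR deferred into
    # the loop.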
      vpaddq        $H2,$T2,$H2        # accumulate input     sub        \$192,$len     jbe        .Ltail_avx512     jmp        .Loop_avx512
  .align    32 .Loop_avx512:     ################################################################     # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8     # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7     # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6     # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5     # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4     # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3     # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2     # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1     #   \________/\___________/     ################################################################     #vpaddq        $H2,$T2,$H2        # accumulate input
      # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4     # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4     # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4     # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4     # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4     #     # however, as h2 is "chronologically" first one available pull     # corresponding operations up, so it's     #     # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4     # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0     # d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1     # d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2     # d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
      vpmuludq    $H2,$R1,$D3        # d3 = h2*r1      vpaddq        $H0,$T0,$H0     vpmuludq    $H2,$R2,$D4        # d4 = h2*r2      vpandq        $MASK,$T1,$T1        # 1     vpmuludq    $H2,$S3,$D0        # d0 = h2*s3      vpandq        $MASK,$T3,$T3        # 3     vpmuludq    $H2,$S4,$D1        # d1 = h2*s4      vporq        $PADBIT,$T4,$T4        # padbit, yes, always     vpmuludq    $H2,$R0,$D2        # d2 = h2*r0      vpaddq        $H1,$T1,$H1        # accumulate input      vpaddq        $H3,$T3,$H3      vpaddq        $H4,$T4,$H4
        vmovdqu64    16*0($inp),$T3        # load input       vmovdqu64    16*4($inp),$T4       lea        16*8($inp),$inp     vpmuludq    $H0,$R3,$M3     vpmuludq    $H0,$R4,$M4     vpmuludq    $H0,$R0,$M0     vpmuludq    $H0,$R1,$M1     vpaddq        $M3,$D3,$D3        # d3 += h0*r3     vpaddq        $M4,$D4,$D4        # d4 += h0*r4     vpaddq        $M0,$D0,$D0        # d0 += h0*r0     vpaddq        $M1,$D1,$D1        # d1 += h0*r1
      vpmuludq    $H1,$R2,$M3     vpmuludq    $H1,$R3,$M4     vpmuludq    $H1,$S4,$M0     vpmuludq    $H0,$R2,$M2     vpaddq        $M3,$D3,$D3        # d3 += h1*r2     vpaddq        $M4,$D4,$D4        # d4 += h1*r3     vpaddq        $M0,$D0,$D0        # d0 += h1*s4     vpaddq        $M2,$D2,$D2        # d2 += h0*r2
        vpunpcklqdq    $T4,$T3,$T0        # transpose input       vpunpckhqdq    $T4,$T3,$T4
      vpmuludq    $H3,$R0,$M3     vpmuludq    $H3,$R1,$M4     vpmuludq    $H1,$R0,$M1     vpmuludq    $H1,$R1,$M2     vpaddq        $M3,$D3,$D3        # d3 += h3*r0     vpaddq        $M4,$D4,$D4        # d4 += h3*r1     vpaddq        $M1,$D1,$D1        # d1 += h1*r0     vpaddq        $M2,$D2,$D2        # d2 += h1*r1
      vpmuludq    $H4,$S4,$M3     vpmuludq    $H4,$R0,$M4     vpmuludq    $H3,$S2,$M0     vpmuludq    $H3,$S3,$M1     vpaddq        $M3,$D3,$D3        # d3 += h4*s4     vpmuludq    $H3,$S4,$M2     vpaddq        $M4,$D4,$D4        # d4 += h4*r0     vpaddq        $M0,$D0,$D0        # d0 += h3*s2     vpaddq        $M1,$D1,$D1        # d1 += h3*s3     vpaddq        $M2,$D2,$D2        # d2 += h3*s4
    vpmuludq    $H4,$S1,$M0     vpmuludq    $H4,$S2,$M1     vpmuludq    $H4,$S3,$M2     vpaddq        $M0,$D0,$H0        # h0 = d0 + h4*s1     vpaddq        $M1,$D1,$H1        # h1 = d1 + h4*s2     vpaddq        $M2,$D2,$H2        # h2 = d2 + h4*s3
      ################################################################     # lazy reduction (interleaved with input splat)
       vpsrlq        \$52,$T0,$T2        # splat input      vpsllq        \$12,$T4,$T3
      vpsrlq        \$26,$D3,$H3     vpandq        $MASK,$D3,$D3     vpaddq        $H3,$D4,$H4        # h3 -> h4
       vporq        $T3,$T2,$T2
      vpsrlq        \$26,$H0,$D0     vpandq        $MASK,$H0,$H0     vpaddq        $D0,$H1,$H1        # h0 -> h1
       vpandq        $MASK,$T2,$T2        # 2
      vpsrlq        \$26,$H4,$D4     vpandq        $MASK,$H4,$H4
      vpsrlq        \$26,$H1,$D1     vpandq        $MASK,$H1,$H1     vpaddq        $D1,$H2,$H2        # h1 -> h2
      vpaddq        $D4,$H0,$H0     vpsllq        \$2,$D4,$D4     vpaddq        $D4,$H0,$H0        # h4 -> h0
       vpaddq        $T2,$H2,$H2        # modulo-scheduled      vpsrlq        \$26,$T0,$T1
      vpsrlq        \$26,$H2,$D2     vpandq        $MASK,$H2,$H2     vpaddq        $D2,$D3,$H3        # h2 -> h3
       vpsrlq        \$14,$T4,$T3
      vpsrlq        \$26,$H0,$D0     vpandq        $MASK,$H0,$H0     vpaddq        $D0,$H1,$H1        # h0 -> h1
       vpsrlq        \$40,$T4,$T4        # 4
      vpsrlq        \$26,$H3,$D3     vpandq        $MASK,$H3,$H3     vpaddq        $D3,$H4,$H4        # h3 -> h4
       vpandq        $MASK,$T0,$T0        # 0      #vpandq    $MASK,$T1,$T1        # 1      #vpandq    $MASK,$T3,$T3        # 3      #vporq        $PADBIT,$T4,$T4        # padbit, yes, always
      sub        \$128,$len     ja        .Loop_avx512
.Ltail_avx512:     ################################################################     # while the above multiplications were by r^8 in all lanes, in the     # last iteration we multiply the least significant lane by r^8 and     # the most significant one by r, which is why the table gets shifted...
      vpsrlq        \$32,$R0,$R0        # 0105020603070408     vpsrlq        \$32,$R1,$R1     vpsrlq        \$32,$R2,$R2     vpsrlq        \$32,$S3,$S3     vpsrlq        \$32,$S4,$S4     vpsrlq        \$32,$R3,$R3     vpsrlq        \$32,$R4,$R4     vpsrlq        \$32,$S1,$S1     vpsrlq        \$32,$S2,$S2
    ################################################################     # load either the next or the last 64 bytes of input     lea        ($inp,$len),$inp
      #vpaddq        $H2,$T2,$H2        # accumulate input     vpaddq        $H0,$T0,$H0
      vpmuludq    $H2,$R1,$D3        # d3 = h2*r1     vpmuludq    $H2,$R2,$D4        # d4 = h2*r2     vpmuludq    $H2,$S3,$D0        # d0 = h2*s3      vpandq        $MASK,$T1,$T1        # 1     vpmuludq    $H2,$S4,$D1        # d1 = h2*s4      vpandq        $MASK,$T3,$T3        # 3     vpmuludq    $H2,$R0,$D2        # d2 = h2*r0      vporq        $PADBIT,$T4,$T4        # padbit, yes, always      vpaddq        $H1,$T1,$H1        # accumulate input      vpaddq        $H3,$T3,$H3      vpaddq        $H4,$T4,$H4
        vmovdqu    16*0($inp),%x#$T0     vpmuludq    $H0,$R3,$M3     vpmuludq    $H0,$R4,$M4     vpmuludq    $H0,$R0,$M0     vpmuludq    $H0,$R1,$M1     vpaddq        $M3,$D3,$D3        # d3 += h0*r3     vpaddq        $M4,$D4,$D4        # d4 += h0*r4     vpaddq        $M0,$D0,$D0        # d0 += h0*r0     vpaddq        $M1,$D1,$D1        # d1 += h0*r1
        vmovdqu    16*1($inp),%x#$T1     vpmuludq    $H1,$R2,$M3     vpmuludq    $H1,$R3,$M4     vpmuludq    $H1,$S4,$M0     vpmuludq    $H0,$R2,$M2     vpaddq        $M3,$D3,$D3        # d3 += h1*r2     vpaddq        $M4,$D4,$D4        # d4 += h1*r3     vpaddq        $M0,$D0,$D0        # d0 += h1*s4     vpaddq        $M2,$D2,$D2        # d2 += h0*r2
        vinserti128    \$1,16*2($inp),%y#$T0,%y#$T0     vpmuludq    $H3,$R0,$M3     vpmuludq    $H3,$R1,$M4     vpmuludq    $H1,$R0,$M1     vpmuludq    $H1,$R1,$M2     vpaddq        $M3,$D3,$D3        # d3 += h3*r0     vpaddq        $M4,$D4,$D4        # d4 += h3*r1     vpaddq        $M1,$D1,$D1        # d1 += h1*r0     vpaddq        $M2,$D2,$D2        # d2 += h1*r1
        vinserti128    \$1,16*3($inp),%y#$T1,%y#$T1     vpmuludq    $H4,$S4,$M3     vpmuludq    $H4,$R0,$M4     vpmuludq    $H3,$S2,$M0     vpmuludq    $H3,$S3,$M1     vpmuludq    $H3,$S4,$M2     vpaddq        $M3,$D3,$H3        # h3 = d3 + h4*s4     vpaddq        $M4,$D4,$D4        # d4 += h4*r0     vpaddq        $M0,$D0,$D0        # d0 += h3*s2     vpaddq        $M1,$D1,$D1        # d1 += h3*s3     vpaddq        $M2,$D2,$D2        # d2 += h3*s4
    vpmuludq    $H4,$S1,$M0     vpmuludq    $H4,$S2,$M1     vpmuludq    $H4,$S3,$M2     vpaddq        $M0,$D0,$H0        # h0 = d0 + h4*s1     vpaddq        $M1,$D1,$H1        # h1 = d1 + h4*s2     vpaddq        $M2,$D2,$H2        # h2 = d2 + h4*s3
      ################################################################     # horizontal addition
      mov        \$1,%eax     vpermq        \$0xb1,$H3,$D3     vpermq        \$0xb1,$D4,$H4     vpermq        \$0xb1,$H0,$D0     vpermq        \$0xb1,$H1,$D1     vpermq        \$0xb1,$H2,$D2     vpaddq        $D3,$H3,$H3     vpaddq        $D4,$H4,$H4     vpaddq        $D0,$H0,$H0     vpaddq        $D1,$H1,$H1     vpaddq        $D2,$H2,$H2
      kmovw        %eax,%k3     vpermq        \$0x2,$H3,$D3     vpermq        \$0x2,$H4,$D4     vpermq        \$0x2,$H0,$D0     vpermq        \$0x2,$H1,$D1     vpermq        \$0x2,$H2,$D2     vpaddq        $D3,$H3,$H3     vpaddq        $D4,$H4,$H4     vpaddq        $D0,$H0,$H0     vpaddq        $D1,$H1,$H1     vpaddq        $D2,$H2,$H2
      vextracti64x4    \$0x1,$H3,%y#$D3     vextracti64x4    \$0x1,$H4,%y#$D4     vextracti64x4    \$0x1,$H0,%y#$D0     vextracti64x4    \$0x1,$H1,%y#$D1     vextracti64x4    \$0x1,$H2,%y#$D2     vpaddq        $D3,$H3,${H3}{%k3}{z}    # keep single qword in case     vpaddq        $D4,$H4,${H4}{%k3}{z}    # it's passed to .Ltail_avx2     vpaddq        $D0,$H0,${H0}{%k3}{z}     vpaddq        $D1,$H1,${H1}{%k3}{z}     vpaddq        $D2,$H2,${H2}{%k3}{z} ___ map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); $code.=<<___;     ################################################################     # lazy reduction (interleaved with input splat)
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3      vpsrldq    \$6,$T0,$T2        # splat input      vpsrldq    \$6,$T1,$T3      vpunpckhqdq    $T1,$T0,$T4        # 4     vpaddq        $D3,$H4,$H4        # h3 -> h4
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0      vpunpcklqdq    $T3,$T2,$T2        # 2:3      vpunpcklqdq    $T1,$T0,$T0        # 0:1     vpaddq        $D0,$H1,$H1        # h0 -> h1
      vpsrlq        \$26,$H4,$D4     vpand        $MASK,$H4,$H4
      vpsrlq        \$26,$H1,$D1     vpand        $MASK,$H1,$H1      vpsrlq        \$30,$T2,$T3      vpsrlq        \$4,$T2,$T2     vpaddq        $D1,$H2,$H2        # h1 -> h2
      vpaddq        $D4,$H0,$H0     vpsllq        \$2,$D4,$D4      vpsrlq        \$26,$T0,$T1      vpsrlq        \$40,$T4,$T4        # 4     vpaddq        $D4,$H0,$H0        # h4 -> h0
      vpsrlq        \$26,$H2,$D2     vpand        $MASK,$H2,$H2      vpand        $MASK,$T2,$T2        # 2      vpand        $MASK,$T0,$T0        # 0     vpaddq        $D2,$H3,$H3        # h2 -> h3
      vpsrlq        \$26,$H0,$D0     vpand        $MASK,$H0,$H0      vpaddq        $H2,$T2,$H2        # accumulate input for .Ltail_avx2      vpand        $MASK,$T1,$T1        # 1     vpaddq        $D0,$H1,$H1        # h0 -> h1
      vpsrlq        \$26,$H3,$D3     vpand        $MASK,$H3,$H3      vpand        $MASK,$T3,$T3        # 3      vpor        32(%rcx),$T4,$T4    # padbit, yes, always     vpaddq        $D3,$H4,$H4        # h3 -> h4
      lea        0x90(%rsp),%rax        # size optimization for .Ltail_avx2     add        \$64,$len     jnz        .Ltail_avx2
    vpsubq        $T2,$H2,$H2        # undo input accumulation     vmovd        %x#$H0,`4*0-48-64`($ctx)# save partially reduced     vmovd        %x#$H1,`4*1-48-64`($ctx)     vmovd        %x#$H2,`4*2-48-64`($ctx)     vmovd        %x#$H3,`4*3-48-64`($ctx)     vmovd        %x#$H4,`4*4-48-64`($ctx)     vzeroall ___ $code.=<<___    if ($win64);     movdqa        0x50(%r11),%xmm6     movdqa        0x60(%r11),%xmm7     movdqa        0x70(%r11),%xmm8     movdqa        0x80(%r11),%xmm9     movdqa        0x90(%r11),%xmm10     movdqa        0xa0(%r11),%xmm11     movdqa        0xb0(%r11),%xmm12     movdqa        0xc0(%r11),%xmm13     movdqa        0xd0(%r11),%xmm14     movdqa        0xe0(%r11),%xmm15     lea        0xf8(%r11),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___    if (!$win64);     lea        8(%r11),%rsp .cfi_def_cfa        %rsp,8 ___ $code.=<<___;     ret .cfi_endproc .size    poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ if ($avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # # One can argue that base 2^52 would be more natural. Well, even though # some operations would be more natural, one has to recognize a couple # of things. Base 2^52 doesn't provide an advantage over base 2^44 if # you look at the amount of multiply-n-accumulate operations. Secondly, # it makes it impossible to pre-compute multiples of 5 [referred to as # s[]/sN in reference implementations], which means that more such # operations would have to be performed in the inner loop, which in # turn makes the critical path longer. In other words, even though base # 2^44 reduction might look less elegant, the overall critical path is # actually shorter...
######################################################################## # Layout of the opaque area is as follows. # #    unsigned __int64 h[3];        # current hash value base 2^44 #    unsigned __int64 s[2];        # key value*20 base 2^44 #    unsigned __int64 r[3];        # key value base 2^44 #    struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; #                    # r^n positions reflect #                    # placement in register, not #                    # memory, R[3] is R[1]*20
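########################################################################
# A short worked justification for the "*20": with limbs at 2^0, 2^44
# and 2^88 (44+44+42 = 130 bits), any product term whose weight reaches
# 2^132 wraps around as
#
#	2^132 = 4*2^130 = 4*5 = 20	(mod 2^130-5)
#
# so s1 = r1*20 and s2 = r2*20 absorb both the 5 from the modulus and
# the extra factor of 4 left over from the 44/44/42 split, which is
# what the "*5" followed by the "magic <<2" below computes.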
  $code.=<<___; .type    poly1305_init_base2_44,\@function,3 .align    32 poly1305_init_base2_44: .cfi_startproc     xor    %rax,%rax     mov    %rax,0($ctx)        # initialize hash value     mov    %rax,8($ctx)     mov    %rax,16($ctx)
  .Linit_base2_44:     lea    poly1305_blocks_vpmadd52(%rip),%r10     lea    poly1305_emit_base2_44(%rip),%r11
      mov    \$0x0ffffffc0fffffff,%rax     mov    \$0x0ffffffc0ffffffc,%rcx     and    0($inp),%rax     mov    \$0x00000fffffffffff,%r8     and    8($inp),%rcx     mov    \$0x00000fffffffffff,%r9     and    %rax,%r8     shrd    \$44,%rcx,%rax     mov    %r8,40($ctx)        # r0     and    %r9,%rax     shr    \$24,%rcx     mov    %rax,48($ctx)        # r1     lea    (%rax,%rax,4),%rax    # *5     mov    %rcx,56($ctx)        # r2     shl    \$2,%rax        # magic <<2     lea    (%rcx,%rcx,4),%rcx    # *5     shl    \$2,%rcx        # magic <<2     mov    %rax,24($ctx)        # s1     mov    %rcx,32($ctx)        # s2     movq    \$-1,64($ctx)        # write impossible value ___ $code.=<<___    if ($flavour !~ /elf32/);     mov    %r10,0(%rdx)     mov    %r11,8(%rdx) ___ $code.=<<___    if ($flavour =~ /elf32/);     mov    %r10d,0(%rdx)     mov    %r11d,4(%rdx) ___ $code.=<<___;     mov    \$1,%eax     ret .cfi_endproc .size    poly1305_init_base2_44,.-poly1305_init_base2_44 ___ { my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
  $code.=<<___; .type    poly1305_blocks_vpmadd52,\@function,4 .align    32 poly1305_blocks_vpmadd52: .cfi_startproc     endbranch     shr    \$4,$len     jz    .Lno_data_vpmadd52        # too short
      shl    \$40,$padbit     mov    64($ctx),%r8            # peek on power of the key
    # if the powers of the key have not been calculated yet, process up     # to 3 blocks with this single-block subroutine; otherwise consume     # just enough blocks to make the remaining length a multiple of 2     # blocks and pass the rest down to the next subroutine...
      mov    \$3,%rax     mov    \$1,%r10     cmp    \$4,$len            # is input long     cmovae    %r10,%rax     test    %r8,%r8                # is power value impossible?     cmovns    %r10,%rax
      and    $len,%rax            # is input of favourable length?     jz    .Lblocks_vpmadd52_4x
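    ################################################################
    # i.e. %rax ends up as
    #
    #   key powers not computed yet and $len < 4:   $len & 3
    #   otherwise:                                  $len & 1
    #
    # so the single-block loop below consumes just enough blocks to
    # leave a multiple of 2 for the 4x code, or the whole input if it
    # is too short to be worth computing key powers for.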
      sub        %rax,$len     mov        \$7,%r10d     mov        \$1,%r11d     kmovw        %r10d,%k7     lea        .L2_44_inp_permd(%rip),%r10     kmovw        %r11d,%k1
      vmovq        $padbit,%x#$PAD     vmovdqa64    0(%r10),$inp_permd    # .L2_44_inp_permd     vmovdqa64    32(%r10),$inp_shift    # .L2_44_inp_shift     vpermq        \$0xcf,$PAD,$PAD     vmovdqa64    64(%r10),$reduc_mask    # .L2_44_mask
      vmovdqu64    0($ctx),${Dlo}{%k7}{z}        # load hash value     vmovdqu64    40($ctx),${r2r1r0}{%k7}{z}    # load keys     vmovdqu64    32($ctx),${r1r0s2}{%k7}{z}     vmovdqu64    24($ctx),${r0s2s1}{%k7}{z}
      vmovdqa64    96(%r10),$reduc_rght    # .L2_44_shift_rgt     vmovdqa64    128(%r10),$reduc_left    # .L2_44_shift_lft
      jmp        .Loop_vpmadd52
  .align    32 .Loop_vpmadd52:     vmovdqu32    0($inp),%x#$T0        # load input as ----3210     lea        16($inp),$inp
      vpermd        $T0,$inp_permd,$T0    # ----3210 -> --322110     vpsrlvq        $inp_shift,$T0,$T0     vpandq        $reduc_mask,$T0,$T0     vporq        $PAD,$T0,$T0
      vpaddq        $T0,$Dlo,$Dlo        # accumulate input
      vpermq        \$0,$Dlo,${H0}{%k7}{z}    # smash hash value     vpermq        \$0b01010101,$Dlo,${H1}{%k7}{z}     vpermq        \$0b10101010,$Dlo,${H2}{%k7}{z}
      vpxord        $Dlo,$Dlo,$Dlo     vpxord        $Dhi,$Dhi,$Dhi
      vpmadd52luq    $r2r1r0,$H0,$Dlo     vpmadd52huq    $r2r1r0,$H0,$Dhi
      vpmadd52luq    $r1r0s2,$H1,$Dlo     vpmadd52huq    $r1r0s2,$H1,$Dhi
      vpmadd52luq    $r0s2s1,$H2,$Dlo     vpmadd52huq    $r0s2s1,$H2,$Dhi
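    ################################################################
    # Taken together, the three IFMA pairs above compute, column by
    # column in base 2^44,
    #
    #   d0 = h0*r0 + h1*s2 + h2*s1
    #   d1 = h0*r1 + h1*r0 + h2*s2
    #   d2 = h0*r2 + h1*r1 + h2*r0
    #
    # with $Dlo accumulating the low 52 bits of each 52x52-bit partial
    # product and $Dhi the corresponding high bits (weight 2^52).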
      vpsrlvq        $reduc_rght,$Dlo,$T0    # 0 in topmost qword     vpsllvq        $reduc_left,$Dhi,$Dhi    # 0 in topmost qword     vpandq        $reduc_mask,$Dlo,$Dlo
      vpaddq        $T0,$Dhi,$Dhi
      vpermq        \$0b10010011,$Dhi,$Dhi    # 0 in lowest qword
      vpaddq        $Dhi,$Dlo,$Dlo        # note topmost qword :-)
      vpsrlvq        $reduc_rght,$Dlo,$T0    # 0 in topmost word     vpandq        $reduc_mask,$Dlo,$Dlo
      vpermq        \$0b10010011,$T0,$T0
      vpaddq        $T0,$Dlo,$Dlo
      vpermq        \$0b10010011,$Dlo,${T0}{%k1}{z}
      vpaddq        $T0,$Dlo,$Dlo     vpsllq        \$2,$T0,$T0
      vpaddq        $T0,$Dlo,$Dlo
      dec        %rax            # len-=16     jnz        .Loop_vpmadd52
      vmovdqu64    $Dlo,0($ctx){%k7}    # store hash value
      test        $len,$len     jnz        .Lblocks_vpmadd52_4x
.Lno_data_vpmadd52:     ret .cfi_endproc .size    poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } { ######################################################################## # As implied by its name, the 4x subroutine processes 4 blocks in # parallel (but it also handles lengths of 4*n+2 blocks). It takes up # to the 4th key power and works in 256-bit %ymm registers.
  my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
  $code.=<<___; .type    poly1305_blocks_vpmadd52_4x,\@function,4 .align    32 poly1305_blocks_vpmadd52_4x: .cfi_startproc     shr    \$4,$len     jz    .Lno_data_vpmadd52_4x        # too short
      shl    \$40,$padbit     mov    64($ctx),%r8            # peek on power of the key
  .Lblocks_vpmadd52_4x:     vpbroadcastq    $padbit,$PAD
      vmovdqa64    .Lx_mask44(%rip),$mask44     mov        \$5,%eax     vmovdqa64    .Lx_mask42(%rip),$mask42     kmovw        %eax,%k1        # used in 2x path
      test        %r8,%r8            # is power value impossible?     js        .Linit_vpmadd52        # if it is, then init R[4]
      vmovq        0($ctx),%x#$H0        # load current hash value     vmovq        8($ctx),%x#$H1     vmovq        16($ctx),%x#$H2
      test        \$3,$len        # is length 4*n+2?     jnz        .Lblocks_vpmadd52_2x_do
  .Lblocks_vpmadd52_4x_do:     vpbroadcastq    64($ctx),$R0        # load 4th power of the key     vpbroadcastq    96($ctx),$R1     vpbroadcastq    128($ctx),$R2     vpbroadcastq    160($ctx),$S1
  .Lblocks_vpmadd52_4x_key_loaded:     vpsllq        \$2,$R2,$S2        # S2 = R2*5*4     vpaddq        $R2,$S2,$S2     vpsllq        \$2,$S2,$S2
      test        \$7,$len        # is len 8*n?     jz        .Lblocks_vpmadd52_8x
      vmovdqu64    16*0($inp),$T2        # load data     vmovdqu64    16*2($inp),$T3     lea        16*4($inp),$inp
      vpunpcklqdq    $T3,$T2,$T1        # transpose data     vpunpckhqdq    $T3,$T2,$T3
      # at this point 64-bit lanes are ordered as 3-1-2-0
      vpsrlq        \$24,$T3,$T2        # splat the data     vporq        $PAD,$T2,$T2      vpaddq        $T2,$H2,$H2        # accumulate input     vpandq        $mask44,$T1,$T0     vpsrlq        \$44,$T1,$T1     vpsllq        \$20,$T3,$T3     vporq        $T3,$T1,$T1     vpandq        $mask44,$T1,$T1
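    ################################################################
    # For reference, each 128-bit block (lo = bits 0..63 in $T1,
    # hi = bits 64..127 in $T3 after the transpose) is split into
    # three base 2^44 limbs:
    #
    #   limb0 =  lo                & (2^44-1)    # bits  0..43
    #   limb1 = (lo>>44 | hi<<20)  & (2^44-1)    # bits 44..87
    #   limb2 = (hi>>24) | padbit<<40            # bits 88..127 + 2^128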
      sub        \$4,$len     jz        .Ltail_vpmadd52_4x     jmp        .Loop_vpmadd52_4x     ud2
  .align    32 .Linit_vpmadd52:     vmovq        24($ctx),%x#$S1        # load key     vmovq        56($ctx),%x#$H2     vmovq        32($ctx),%x#$S2     vmovq        40($ctx),%x#$R0     vmovq        48($ctx),%x#$R1
      vmovdqa        $R0,$H0     vmovdqa        $R1,$H1     vmovdqa        $H2,$R2
      mov        \$2,%eax
  .Lmul_init_vpmadd52:     vpxorq        $D0lo,$D0lo,$D0lo     vpmadd52luq    $H2,$S1,$D0lo     vpxorq        $D0hi,$D0hi,$D0hi     vpmadd52huq    $H2,$S1,$D0hi     vpxorq        $D1lo,$D1lo,$D1lo     vpmadd52luq    $H2,$S2,$D1lo     vpxorq        $D1hi,$D1hi,$D1hi     vpmadd52huq    $H2,$S2,$D1hi     vpxorq        $D2lo,$D2lo,$D2lo     vpmadd52luq    $H2,$R0,$D2lo     vpxorq        $D2hi,$D2hi,$D2hi     vpmadd52huq    $H2,$R0,$D2hi
      vpmadd52luq    $H0,$R0,$D0lo     vpmadd52huq    $H0,$R0,$D0hi     vpmadd52luq    $H0,$R1,$D1lo     vpmadd52huq    $H0,$R1,$D1hi     vpmadd52luq    $H0,$R2,$D2lo     vpmadd52huq    $H0,$R2,$D2hi
      vpmadd52luq    $H1,$S2,$D0lo     vpmadd52huq    $H1,$S2,$D0hi     vpmadd52luq    $H1,$R0,$D1lo     vpmadd52huq    $H1,$R0,$D1hi     vpmadd52luq    $H1,$R1,$D2lo     vpmadd52huq    $H1,$R1,$D2hi
      ################################################################     # partial reduction     vpsrlq        \$44,$D0lo,$tmp     vpsllq        \$8,$D0hi,$D0hi     vpandq        $mask44,$D0lo,$H0     vpaddq        $tmp,$D0hi,$D0hi
      vpaddq        $D0hi,$D1lo,$D1lo
      vpsrlq        \$44,$D1lo,$tmp     vpsllq        \$8,$D1hi,$D1hi     vpandq        $mask44,$D1lo,$H1     vpaddq        $tmp,$D1hi,$D1hi
      vpaddq        $D1hi,$D2lo,$D2lo
      vpsrlq        \$42,$D2lo,$tmp     vpsllq        \$10,$D2hi,$D2hi     vpandq        $mask42,$D2lo,$H2     vpaddq        $tmp,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0     vpsllq        \$2,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0
      vpsrlq        \$44,$H0,$tmp        # additional step     vpandq        $mask44,$H0,$H0
      vpaddq        $tmp,$H1,$H1
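    ################################################################
    # Sketch of the partial reduction above, with each d_i split by
    # the IFMA instructions into a low and a high 52-bit half:
    #
    #   carry0 = (d0lo>>44) + (d0hi<<8)     -> added to d1lo
    #   carry1 = (d1lo>>44) + (d1hi<<8)     -> added to d2lo
    #   carry2 = (d2lo>>42) + (d2hi<<10)    -> h0 += carry2*5
    #   h1 += h0>>44, h0 &= 2^44-1          # final short carry
    #
    # The <<8 and <<10 realign the high halves (weight 2^52) to the
    # 44- and 42-bit limb boundaries, and the *5 (carry2 + carry2<<2)
    # is the usual 2^130 = 5 (mod 2^130-5) wrap-around.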
      dec        %eax     jz        .Ldone_init_vpmadd52
      vpunpcklqdq    $R1,$H1,$R1        # 1,2     vpbroadcastq    %x#$H1,%x#$H1        # 2,2     vpunpcklqdq    $R2,$H2,$R2     vpbroadcastq    %x#$H2,%x#$H2     vpunpcklqdq    $R0,$H0,$R0     vpbroadcastq    %x#$H0,%x#$H0
      vpsllq        \$2,$R1,$S1        # S1 = R1*5*4     vpsllq        \$2,$R2,$S2        # S2 = R2*5*4     vpaddq        $R1,$S1,$S1     vpaddq        $R2,$S2,$S2     vpsllq        \$2,$S1,$S1     vpsllq        \$2,$S2,$S2
      jmp        .Lmul_init_vpmadd52     ud2
  .align    32 .Ldone_init_vpmadd52:     vinserti128    \$1,%x#$R1,$H1,$R1    # 1,2,3,4     vinserti128    \$1,%x#$R2,$H2,$R2     vinserti128    \$1,%x#$R0,$H0,$R0
      vpermq        \$0b11011000,$R1,$R1    # 1,3,2,4     vpermq        \$0b11011000,$R2,$R2     vpermq        \$0b11011000,$R0,$R0
      vpsllq        \$2,$R1,$S1        # S1 = R1*5*4     vpaddq        $R1,$S1,$S1     vpsllq        \$2,$S1,$S1
      vmovq        0($ctx),%x#$H0        # load current hash value     vmovq        8($ctx),%x#$H1     vmovq        16($ctx),%x#$H2
      test        \$3,$len        # is length 4*n+2?     jnz        .Ldone_init_vpmadd52_2x
      vmovdqu64    $R0,64($ctx)        # save key powers     vpbroadcastq    %x#$R0,$R0        # broadcast 4th power     vmovdqu64    $R1,96($ctx)     vpbroadcastq    %x#$R1,$R1     vmovdqu64    $R2,128($ctx)     vpbroadcastq    %x#$R2,$R2     vmovdqu64    $S1,160($ctx)     vpbroadcastq    %x#$S1,$S1
      jmp        .Lblocks_vpmadd52_4x_key_loaded     ud2
  .align    32 .Ldone_init_vpmadd52_2x:     vmovdqu64    $R0,64($ctx)        # save key powers     vpsrldq        \$8,$R0,$R0        # 0-1-0-2     vmovdqu64    $R1,96($ctx)     vpsrldq        \$8,$R1,$R1     vmovdqu64    $R2,128($ctx)     vpsrldq        \$8,$R2,$R2     vmovdqu64    $S1,160($ctx)     vpsrldq        \$8,$S1,$S1     jmp        .Lblocks_vpmadd52_2x_key_loaded     ud2
  .align    32 .Lblocks_vpmadd52_2x_do:     vmovdqu64    128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers     vmovdqu64    160+8($ctx),${S1}{%k1}{z}     vmovdqu64    64+8($ctx),${R0}{%k1}{z}     vmovdqu64    96+8($ctx),${R1}{%k1}{z}
  .Lblocks_vpmadd52_2x_key_loaded:     vmovdqu64    16*0($inp),$T2        # load data     vpxorq        $T3,$T3,$T3     lea        16*2($inp),$inp
      vpunpcklqdq    $T3,$T2,$T1        # transpose data     vpunpckhqdq    $T3,$T2,$T3
      # at this point 64-bit lanes are ordered as x-1-x-0
      vpsrlq        \$24,$T3,$T2        # splat the data     vporq        $PAD,$T2,$T2      vpaddq        $T2,$H2,$H2        # accumulate input     vpandq        $mask44,$T1,$T0     vpsrlq        \$44,$T1,$T1     vpsllq        \$20,$T3,$T3     vporq        $T3,$T1,$T1     vpandq        $mask44,$T1,$T1
      jmp        .Ltail_vpmadd52_2x     ud2
  .align    32 .Loop_vpmadd52_4x:     #vpaddq        $T2,$H2,$H2        # accumulate input     vpaddq        $T0,$H0,$H0     vpaddq        $T1,$H1,$H1
      vpxorq        $D0lo,$D0lo,$D0lo     vpmadd52luq    $H2,$S1,$D0lo     vpxorq        $D0hi,$D0hi,$D0hi     vpmadd52huq    $H2,$S1,$D0hi     vpxorq        $D1lo,$D1lo,$D1lo     vpmadd52luq    $H2,$S2,$D1lo     vpxorq        $D1hi,$D1hi,$D1hi     vpmadd52huq    $H2,$S2,$D1hi     vpxorq        $D2lo,$D2lo,$D2lo     vpmadd52luq    $H2,$R0,$D2lo     vpxorq        $D2hi,$D2hi,$D2hi     vpmadd52huq    $H2,$R0,$D2hi
       vmovdqu64    16*0($inp),$T2        # load data      vmovdqu64    16*2($inp),$T3      lea        16*4($inp),$inp     vpmadd52luq    $H0,$R0,$D0lo     vpmadd52huq    $H0,$R0,$D0hi     vpmadd52luq    $H0,$R1,$D1lo     vpmadd52huq    $H0,$R1,$D1hi     vpmadd52luq    $H0,$R2,$D2lo     vpmadd52huq    $H0,$R2,$D2hi
       vpunpcklqdq    $T3,$T2,$T1        # transpose data      vpunpckhqdq    $T3,$T2,$T3     vpmadd52luq    $H1,$S2,$D0lo     vpmadd52huq    $H1,$S2,$D0hi     vpmadd52luq    $H1,$R0,$D1lo     vpmadd52huq    $H1,$R0,$D1hi     vpmadd52luq    $H1,$R1,$D2lo     vpmadd52huq    $H1,$R1,$D2hi
      ################################################################     # partial reduction (interleaved with data splat)     vpsrlq        \$44,$D0lo,$tmp     vpsllq        \$8,$D0hi,$D0hi     vpandq        $mask44,$D0lo,$H0     vpaddq        $tmp,$D0hi,$D0hi
       vpsrlq        \$24,$T3,$T2      vporq        $PAD,$T2,$T2     vpaddq        $D0hi,$D1lo,$D1lo
      vpsrlq        \$44,$D1lo,$tmp     vpsllq        \$8,$D1hi,$D1hi     vpandq        $mask44,$D1lo,$H1     vpaddq        $tmp,$D1hi,$D1hi
       vpandq        $mask44,$T1,$T0      vpsrlq        \$44,$T1,$T1      vpsllq        \$20,$T3,$T3     vpaddq        $D1hi,$D2lo,$D2lo
      vpsrlq        \$42,$D2lo,$tmp     vpsllq        \$10,$D2hi,$D2hi     vpandq        $mask42,$D2lo,$H2     vpaddq        $tmp,$D2hi,$D2hi
        vpaddq    $T2,$H2,$H2        # accumulate input     vpaddq        $D2hi,$H0,$H0     vpsllq        \$2,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0      vporq        $T3,$T1,$T1      vpandq        $mask44,$T1,$T1
      vpsrlq        \$44,$H0,$tmp        # additional step     vpandq        $mask44,$H0,$H0
      vpaddq        $tmp,$H1,$H1
      sub        \$4,$len        # len-=64     jnz        .Loop_vpmadd52_4x
  .Ltail_vpmadd52_4x:     vmovdqu64    128($ctx),$R2        # load all key powers     vmovdqu64    160($ctx),$S1     vmovdqu64    64($ctx),$R0     vmovdqu64    96($ctx),$R1
  .Ltail_vpmadd52_2x:     vpsllq        \$2,$R2,$S2        # S2 = R2*5*4     vpaddq        $R2,$S2,$S2     vpsllq        \$2,$S2,$S2
      #vpaddq        $T2,$H2,$H2        # accumulate input     vpaddq        $T0,$H0,$H0     vpaddq        $T1,$H1,$H1
      vpxorq        $D0lo,$D0lo,$D0lo     vpmadd52luq    $H2,$S1,$D0lo     vpxorq        $D0hi,$D0hi,$D0hi     vpmadd52huq    $H2,$S1,$D0hi     vpxorq        $D1lo,$D1lo,$D1lo     vpmadd52luq    $H2,$S2,$D1lo     vpxorq        $D1hi,$D1hi,$D1hi     vpmadd52huq    $H2,$S2,$D1hi     vpxorq        $D2lo,$D2lo,$D2lo     vpmadd52luq    $H2,$R0,$D2lo     vpxorq        $D2hi,$D2hi,$D2hi     vpmadd52huq    $H2,$R0,$D2hi
      vpmadd52luq    $H0,$R0,$D0lo     vpmadd52huq    $H0,$R0,$D0hi     vpmadd52luq    $H0,$R1,$D1lo     vpmadd52huq    $H0,$R1,$D1hi     vpmadd52luq    $H0,$R2,$D2lo     vpmadd52huq    $H0,$R2,$D2hi
      vpmadd52luq    $H1,$S2,$D0lo     vpmadd52huq    $H1,$S2,$D0hi     vpmadd52luq    $H1,$R0,$D1lo     vpmadd52huq    $H1,$R0,$D1hi     vpmadd52luq    $H1,$R1,$D2lo     vpmadd52huq    $H1,$R1,$D2hi
      ################################################################     # horizontal addition
      mov        \$1,%eax     kmovw        %eax,%k1     vpsrldq        \$8,$D0lo,$T0     vpsrldq        \$8,$D0hi,$H0     vpsrldq        \$8,$D1lo,$T1     vpsrldq        \$8,$D1hi,$H1     vpaddq        $T0,$D0lo,$D0lo     vpaddq        $H0,$D0hi,$D0hi     vpsrldq        \$8,$D2lo,$T2     vpsrldq        \$8,$D2hi,$H2     vpaddq        $T1,$D1lo,$D1lo     vpaddq        $H1,$D1hi,$D1hi      vpermq        \$0x2,$D0lo,$T0      vpermq        \$0x2,$D0hi,$H0     vpaddq        $T2,$D2lo,$D2lo     vpaddq        $H2,$D2hi,$D2hi
      vpermq        \$0x2,$D1lo,$T1     vpermq        \$0x2,$D1hi,$H1     vpaddq        $T0,$D0lo,${D0lo}{%k1}{z}     vpaddq        $H0,$D0hi,${D0hi}{%k1}{z}     vpermq        \$0x2,$D2lo,$T2     vpermq        \$0x2,$D2hi,$H2     vpaddq        $T1,$D1lo,${D1lo}{%k1}{z}     vpaddq        $H1,$D1hi,${D1hi}{%k1}{z}     vpaddq        $T2,$D2lo,${D2lo}{%k1}{z}     vpaddq        $H2,$D2hi,${D2hi}{%k1}{z}
      ################################################################     # partial reduction     vpsrlq        \$44,$D0lo,$tmp     vpsllq        \$8,$D0hi,$D0hi     vpandq        $mask44,$D0lo,$H0     vpaddq        $tmp,$D0hi,$D0hi
      vpaddq        $D0hi,$D1lo,$D1lo
      vpsrlq        \$44,$D1lo,$tmp     vpsllq        \$8,$D1hi,$D1hi     vpandq        $mask44,$D1lo,$H1     vpaddq        $tmp,$D1hi,$D1hi
      vpaddq        $D1hi,$D2lo,$D2lo
      vpsrlq        \$42,$D2lo,$tmp     vpsllq        \$10,$D2hi,$D2hi     vpandq        $mask42,$D2lo,$H2     vpaddq        $tmp,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0     vpsllq        \$2,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0
      vpsrlq        \$44,$H0,$tmp        # additional step     vpandq        $mask44,$H0,$H0
      vpaddq        $tmp,$H1,$H1                         # at this point $len is                         # either 4*n+2 or 0...     sub        \$2,$len        # len-=32     ja        .Lblocks_vpmadd52_4x_do
      vmovq        %x#$H0,0($ctx)     vmovq        %x#$H1,8($ctx)     vmovq        %x#$H2,16($ctx)     vzeroall
.Lno_data_vpmadd52_4x:     ret .cfi_endproc .size    poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x ___ } { ######################################################################## # As implied by its name, the 8x subroutine processes 8 blocks in # parallel... This is an intermediate version, as it's used only in # cases when the input length is either 8*n, 8*n+1 or 8*n+2...
  my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
  $code.=<<___; .type    poly1305_blocks_vpmadd52_8x,\@function,4 .align    32 poly1305_blocks_vpmadd52_8x: .cfi_startproc     shr    \$4,$len     jz    .Lno_data_vpmadd52_8x        # too short
      shl    \$40,$padbit     mov    64($ctx),%r8            # peek on power of the key
      vmovdqa64    .Lx_mask44(%rip),$mask44     vmovdqa64    .Lx_mask42(%rip),$mask42
      test    %r8,%r8                # is power value impossible?     js    .Linit_vpmadd52            # if it is, then init R[4]
      vmovq    0($ctx),%x#$H0            # load current hash value     vmovq    8($ctx),%x#$H1     vmovq    16($ctx),%x#$H2
.Lblocks_vpmadd52_8x:     ################################################################     # first we calculate more key powers
      vmovdqu64    128($ctx),$R2        # load 1-3-2-4 powers     vmovdqu64    160($ctx),$S1     vmovdqu64    64($ctx),$R0     vmovdqu64    96($ctx),$R1
      vpsllq        \$2,$R2,$S2        # S2 = R2*5*4     vpaddq        $R2,$S2,$S2     vpsllq        \$2,$S2,$S2
      vpbroadcastq    %x#$R2,$RR2        # broadcast 4th power     vpbroadcastq    %x#$R0,$RR0     vpbroadcastq    %x#$R1,$RR1
      vpxorq        $D0lo,$D0lo,$D0lo     vpmadd52luq    $RR2,$S1,$D0lo     vpxorq        $D0hi,$D0hi,$D0hi     vpmadd52huq    $RR2,$S1,$D0hi     vpxorq        $D1lo,$D1lo,$D1lo     vpmadd52luq    $RR2,$S2,$D1lo     vpxorq        $D1hi,$D1hi,$D1hi     vpmadd52huq    $RR2,$S2,$D1hi     vpxorq        $D2lo,$D2lo,$D2lo     vpmadd52luq    $RR2,$R0,$D2lo     vpxorq        $D2hi,$D2hi,$D2hi     vpmadd52huq    $RR2,$R0,$D2hi
      vpmadd52luq    $RR0,$R0,$D0lo     vpmadd52huq    $RR0,$R0,$D0hi     vpmadd52luq    $RR0,$R1,$D1lo     vpmadd52huq    $RR0,$R1,$D1hi     vpmadd52luq    $RR0,$R2,$D2lo     vpmadd52huq    $RR0,$R2,$D2hi
      vpmadd52luq    $RR1,$S2,$D0lo     vpmadd52huq    $RR1,$S2,$D0hi     vpmadd52luq    $RR1,$R0,$D1lo     vpmadd52huq    $RR1,$R0,$D1hi     vpmadd52luq    $RR1,$R1,$D2lo     vpmadd52huq    $RR1,$R1,$D2hi
      ################################################################     # partial reduction     vpsrlq        \$44,$D0lo,$tmp     vpsllq        \$8,$D0hi,$D0hi     vpandq        $mask44,$D0lo,$RR0     vpaddq        $tmp,$D0hi,$D0hi
      vpaddq        $D0hi,$D1lo,$D1lo
      vpsrlq        \$44,$D1lo,$tmp     vpsllq        \$8,$D1hi,$D1hi     vpandq        $mask44,$D1lo,$RR1     vpaddq        $tmp,$D1hi,$D1hi
      vpaddq        $D1hi,$D2lo,$D2lo
      vpsrlq        \$42,$D2lo,$tmp     vpsllq        \$10,$D2hi,$D2hi     vpandq        $mask42,$D2lo,$RR2     vpaddq        $tmp,$D2hi,$D2hi
      vpaddq        $D2hi,$RR0,$RR0     vpsllq        \$2,$D2hi,$D2hi
      vpaddq        $D2hi,$RR0,$RR0
      vpsrlq        \$44,$RR0,$tmp        # additional step     vpandq        $mask44,$RR0,$RR0
      vpaddq        $tmp,$RR1,$RR1
      ################################################################     # At this point Rx holds 1324 powers, RRx - 5768, and the goal     # is 15263748, which reflects how data is loaded...
      vpunpcklqdq    $R2,$RR2,$T2        # 3748     vpunpckhqdq    $R2,$RR2,$R2        # 1526     vpunpcklqdq    $R0,$RR0,$T0     vpunpckhqdq    $R0,$RR0,$R0     vpunpcklqdq    $R1,$RR1,$T1     vpunpckhqdq    $R1,$RR1,$R1 ___ ######## switch to %zmm map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
  $code.=<<___;     vshufi64x2    \$0x44,$R2,$T2,$RR2    # 15263748     vshufi64x2    \$0x44,$R0,$T0,$RR0     vshufi64x2    \$0x44,$R1,$T1,$RR1
      vmovdqu64    16*0($inp),$T2        # load data     vmovdqu64    16*4($inp),$T3     lea        16*8($inp),$inp
      vpsllq        \$2,$RR2,$SS2        # S2 = R2*5*4     vpsllq        \$2,$RR1,$SS1        # S1 = R1*5*4     vpaddq        $RR2,$SS2,$SS2     vpaddq        $RR1,$SS1,$SS1     vpsllq        \$2,$SS2,$SS2     vpsllq        \$2,$SS1,$SS1
      vpbroadcastq    $padbit,$PAD     vpbroadcastq    %x#$mask44,$mask44     vpbroadcastq    %x#$mask42,$mask42
      vpbroadcastq    %x#$SS1,$S1        # broadcast 8th power     vpbroadcastq    %x#$SS2,$S2     vpbroadcastq    %x#$RR0,$R0     vpbroadcastq    %x#$RR1,$R1     vpbroadcastq    %x#$RR2,$R2
      vpunpcklqdq    $T3,$T2,$T1        # transpose data     vpunpckhqdq    $T3,$T2,$T3
      # at this point 64-bit lanes are ordered as 73625140
      vpsrlq        \$24,$T3,$T2        # splat the data     vporq        $PAD,$T2,$T2      vpaddq        $T2,$H2,$H2        # accumulate input     vpandq        $mask44,$T1,$T0     vpsrlq        \$44,$T1,$T1     vpsllq        \$20,$T3,$T3     vporq        $T3,$T1,$T1     vpandq        $mask44,$T1,$T1
      sub        \$8,$len     jz        .Ltail_vpmadd52_8x     jmp        .Loop_vpmadd52_8x
  .align    32 .Loop_vpmadd52_8x:     #vpaddq        $T2,$H2,$H2        # accumulate input     vpaddq        $T0,$H0,$H0     vpaddq        $T1,$H1,$H1
      vpxorq        $D0lo,$D0lo,$D0lo     vpmadd52luq    $H2,$S1,$D0lo     vpxorq        $D0hi,$D0hi,$D0hi     vpmadd52huq    $H2,$S1,$D0hi     vpxorq        $D1lo,$D1lo,$D1lo     vpmadd52luq    $H2,$S2,$D1lo     vpxorq        $D1hi,$D1hi,$D1hi     vpmadd52huq    $H2,$S2,$D1hi     vpxorq        $D2lo,$D2lo,$D2lo     vpmadd52luq    $H2,$R0,$D2lo     vpxorq        $D2hi,$D2hi,$D2hi     vpmadd52huq    $H2,$R0,$D2hi
       vmovdqu64    16*0($inp),$T2        # load data      vmovdqu64    16*4($inp),$T3      lea        16*8($inp),$inp     vpmadd52luq    $H0,$R0,$D0lo     vpmadd52huq    $H0,$R0,$D0hi     vpmadd52luq    $H0,$R1,$D1lo     vpmadd52huq    $H0,$R1,$D1hi     vpmadd52luq    $H0,$R2,$D2lo     vpmadd52huq    $H0,$R2,$D2hi
       vpunpcklqdq    $T3,$T2,$T1        # transpose data      vpunpckhqdq    $T3,$T2,$T3     vpmadd52luq    $H1,$S2,$D0lo     vpmadd52huq    $H1,$S2,$D0hi     vpmadd52luq    $H1,$R0,$D1lo     vpmadd52huq    $H1,$R0,$D1hi     vpmadd52luq    $H1,$R1,$D2lo     vpmadd52huq    $H1,$R1,$D2hi
      ################################################################     # partial reduction (interleaved with data splat)     vpsrlq        \$44,$D0lo,$tmp     vpsllq        \$8,$D0hi,$D0hi     vpandq        $mask44,$D0lo,$H0     vpaddq        $tmp,$D0hi,$D0hi
       vpsrlq        \$24,$T3,$T2      vporq        $PAD,$T2,$T2     vpaddq        $D0hi,$D1lo,$D1lo
      vpsrlq        \$44,$D1lo,$tmp     vpsllq        \$8,$D1hi,$D1hi     vpandq        $mask44,$D1lo,$H1     vpaddq        $tmp,$D1hi,$D1hi
       vpandq        $mask44,$T1,$T0      vpsrlq        \$44,$T1,$T1      vpsllq        \$20,$T3,$T3     vpaddq        $D1hi,$D2lo,$D2lo
      vpsrlq        \$42,$D2lo,$tmp     vpsllq        \$10,$D2hi,$D2hi     vpandq        $mask42,$D2lo,$H2     vpaddq        $tmp,$D2hi,$D2hi
        vpaddq    $T2,$H2,$H2        # accumulate input     vpaddq        $D2hi,$H0,$H0     vpsllq        \$2,$D2hi,$D2hi
      vpaddq        $D2hi,$H0,$H0      vporq        $T3,$T1,$T1      vpandq        $mask44,$T1,$T1
      vpsrlq        \$44,$H0,$tmp        # additional step     vpandq        $mask44,$H0,$H0
      vpaddq        $tmp,$H1,$H1
    sub        \$8,$len        # len-=128
    jnz        .Loop_vpmadd52_8x

.Ltail_vpmadd52_8x:
    #vpaddq       $T2,$H2,$H2        # accumulate input
    vpaddq        $T0,$H0,$H0
    vpaddq        $T1,$H1,$H1

    vpxorq        $D0lo,$D0lo,$D0lo
    vpmadd52luq   $H2,$SS1,$D0lo
    vpxorq        $D0hi,$D0hi,$D0hi
    vpmadd52huq   $H2,$SS1,$D0hi
    vpxorq        $D1lo,$D1lo,$D1lo
    vpmadd52luq   $H2,$SS2,$D1lo
    vpxorq        $D1hi,$D1hi,$D1hi
    vpmadd52huq   $H2,$SS2,$D1hi
    vpxorq        $D2lo,$D2lo,$D2lo
    vpmadd52luq   $H2,$RR0,$D2lo
    vpxorq        $D2hi,$D2hi,$D2hi
    vpmadd52huq   $H2,$RR0,$D2hi

    vpmadd52luq   $H0,$RR0,$D0lo
    vpmadd52huq   $H0,$RR0,$D0hi
    vpmadd52luq   $H0,$RR1,$D1lo
    vpmadd52huq   $H0,$RR1,$D1hi
    vpmadd52luq   $H0,$RR2,$D2lo
    vpmadd52huq   $H0,$RR2,$D2hi

    vpmadd52luq   $H1,$SS2,$D0lo
    vpmadd52huq   $H1,$SS2,$D0hi
    vpmadd52luq   $H1,$RR0,$D1lo
    vpmadd52huq   $H1,$RR0,$D1hi
    vpmadd52luq   $H1,$RR1,$D2lo
    vpmadd52huq   $H1,$RR1,$D2hi

    ################################################################
    # horizontal addition

    mov           \$1,%eax
    kmovw         %eax,%k1
    vpsrldq       \$8,$D0lo,$T0
    vpsrldq       \$8,$D0hi,$H0
    vpsrldq       \$8,$D1lo,$T1
    vpsrldq       \$8,$D1hi,$H1
    vpaddq        $T0,$D0lo,$D0lo
    vpaddq        $H0,$D0hi,$D0hi
    vpsrldq       \$8,$D2lo,$T2
    vpsrldq       \$8,$D2hi,$H2
    vpaddq        $T1,$D1lo,$D1lo
    vpaddq        $H1,$D1hi,$D1hi
     vpermq       \$0x2,$D0lo,$T0
     vpermq       \$0x2,$D0hi,$H0
    vpaddq        $T2,$D2lo,$D2lo
    vpaddq        $H2,$D2hi,$D2hi

    vpermq        \$0x2,$D1lo,$T1
    vpermq        \$0x2,$D1hi,$H1
    vpaddq        $T0,$D0lo,$D0lo
    vpaddq        $H0,$D0hi,$D0hi
    vpermq        \$0x2,$D2lo,$T2
    vpermq        \$0x2,$D2hi,$H2
    vpaddq        $T1,$D1lo,$D1lo
    vpaddq        $H1,$D1hi,$D1hi
     vextracti64x4 \$1,$D0lo,%y#$T0
     vextracti64x4 \$1,$D0hi,%y#$H0
    vpaddq        $T2,$D2lo,$D2lo
    vpaddq        $H2,$D2hi,$D2hi

    vextracti64x4 \$1,$D1lo,%y#$T1
    vextracti64x4 \$1,$D1hi,%y#$H1
    vextracti64x4 \$1,$D2lo,%y#$T2
    vextracti64x4 \$1,$D2hi,%y#$H2
___
######## switch back to %ymm
map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
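######## The horizontal addition above folds the per-lane partial sums of
######## each accumulator down to lane 0 before the final reduction. A
######## scalar Perl sketch of that pairwise folding (illustrative only,
######## not used by the generator):
sub _fold_lanes_sketch {
	my @lane = @_;			# per-lane 64-bit partial sums
	while (@lane > 1) {
		my @half;
		push @half, $lane[2*$_] + $lane[2*$_+1] for 0 .. $#lane/2;
		@lane = @half;		# halve the lane count each pass
	}
	return $lane[0];
}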
$code.=<<___;
    vpaddq        $T0,$D0lo,${D0lo}{%k1}{z}
    vpaddq        $H0,$D0hi,${D0hi}{%k1}{z}
    vpaddq        $T1,$D1lo,${D1lo}{%k1}{z}
    vpaddq        $H1,$D1hi,${D1hi}{%k1}{z}
    vpaddq        $T2,$D2lo,${D2lo}{%k1}{z}
    vpaddq        $H2,$D2hi,${D2hi}{%k1}{z}

    ################################################################
    # partial reduction
    vpsrlq        \$44,$D0lo,$tmp
    vpsllq        \$8,$D0hi,$D0hi
    vpandq        $mask44,$D0lo,$H0
    vpaddq        $tmp,$D0hi,$D0hi

    vpaddq        $D0hi,$D1lo,$D1lo

    vpsrlq        \$44,$D1lo,$tmp
    vpsllq        \$8,$D1hi,$D1hi
    vpandq        $mask44,$D1lo,$H1
    vpaddq        $tmp,$D1hi,$D1hi

    vpaddq        $D1hi,$D2lo,$D2lo

    vpsrlq        \$42,$D2lo,$tmp
    vpsllq        \$10,$D2hi,$D2hi
    vpandq        $mask42,$D2lo,$H2
    vpaddq        $tmp,$D2hi,$D2hi

    vpaddq        $D2hi,$H0,$H0
    vpsllq        \$2,$D2hi,$D2hi

    vpaddq        $D2hi,$H0,$H0

    vpsrlq        \$44,$H0,$tmp        # additional step
    vpandq        $mask44,$H0,$H0

    vpaddq        $tmp,$H1,$H1

    ################################################################

    vmovq         %x#$H0,0($ctx)
    vmovq         %x#$H1,8($ctx)
    vmovq         %x#$H2,16($ctx)
    vzeroall

.Lno_data_vpmadd52_8x:
    ret
.cfi_endproc
.size    poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
___
}
$code.=<<___;
.type    poly1305_emit_base2_44,\@function,3
.align    32
poly1305_emit_base2_44:
.cfi_startproc
    endbranch
    mov    0($ctx),%r8    # load hash value
    mov    8($ctx),%r9
    mov    16($ctx),%r10
    mov    %r9,%rax
    shr    \$20,%r9
    shl    \$44,%rax
    mov    %r10,%rcx
    shr    \$40,%r10
    shl    \$24,%rcx

    add    %rax,%r8
    adc    %rcx,%r9
    adc    \$0,%r10
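    # %r8:%r9 now hold the low 128 bits of the hash recombined from its
    # base 2^44 limbs (44+44+42 bits); the bits above 2^128 are left
    # in %r10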
    mov    %r8,%rax
    add    \$5,%r8        # compare to modulus
    mov    %r9,%rcx
    adc    \$0,%r9
    adc    \$0,%r10
    shr    \$2,%r10    # did 130-bit value overflow?
    cmovnz    %r8,%rax
    cmovnz    %r9,%rcx
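    # branchless reduction mod 2^130-5: if h >= 2^130-5, then h+5 carries
    # into bit 130 (bit 2 of %r10) and the incremented value is selected,
    # with the 2^130 carry simply dropped; otherwise h is kept as-is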
    add    0($nonce),%rax    # accumulate nonce
    adc    8($nonce),%rcx
    mov    %rax,0($mac)    # write result
    mov    %rcx,8($mac)

    ret
.cfi_endproc
.size    poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
}    }    }
$code.=<<___;
.align    64
.Lconst:
.Lmask24:
.long    0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long    `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long    0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long    2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long    0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long    0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad    0,12,24,64
.L2_44_mask:
.quad    0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad    44,44,42,64
.L2_44_shift_lft:
.quad    8,8,10,64

.align    64
.Lx_mask44:
.quad    0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad    0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad    0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad    0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___;
.asciz    "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align    16
___
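# The .L2_44_* and .Lx_mask* tables above serve the base 2^44 code paths.
# As an illustration only (this sub is not used by the generator and
# assumes a perl built with 64-bit integers), splitting one 16-byte block
# into 44/44/40-bit limbs plus the pad bit works out to:
sub _split_base2_44_sketch {
	my ($lo, $hi) = @_;				# two little-endian 64-bit words
	my $mask44 = (1<<44) - 1;
	my $l0 =   $lo & $mask44;			# message bits   0..43
	my $l1 = (($lo>>44) | ($hi<<20)) & $mask44;	# message bits  44..87
	my $l2 =  ($hi>>24) | (1<<40);			# message bits 88..127 + pad bit
	return ($l0, $l1, $l2);
}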
{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
$code.=<<___;
.globl    xor128_encrypt_n_pad
.type    xor128_encrypt_n_pad,\@abi-omnipotent
.align    16
xor128_encrypt_n_pad:
.cfi_startproc
    sub    $otp,$inp
    sub    $otp,$out
    mov    $len,%r10        # put len aside
    shr    \$4,$len        # len / 16
    jz    .Ltail_enc
    nop
.Loop_enc_xmm:
    movdqu    ($inp,$otp),%xmm0
    pxor    ($otp),%xmm0
    movdqu    %xmm0,($out,$otp)
    movdqa    %xmm0,($otp)
    lea    16($otp),$otp
    dec    $len
    jnz    .Loop_enc_xmm

    and    \$15,%r10        # len % 16
    jz    .Ldone_enc

.Ltail_enc:
    mov    \$16,$len
    sub    %r10,$len
    xor    %eax,%eax
.Loop_enc_byte:
    mov    ($inp,$otp),%al
    xor    ($otp),%al
    mov    %al,($out,$otp)
    mov    %al,($otp)
    lea    1($otp),$otp
    dec    %r10
    jnz    .Loop_enc_byte

    xor    %eax,%eax
.Loop_enc_pad:
    mov    %al,($otp)
    lea    1($otp),$otp
    dec    $len
    jnz    .Loop_enc_pad

.Ldone_enc:
    mov    $otp,%rax
    ret
.cfi_endproc
.size    xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl    xor128_decrypt_n_pad
.type    xor128_decrypt_n_pad,\@abi-omnipotent
.align    16
xor128_decrypt_n_pad:
.cfi_startproc
    sub    $otp,$inp
    sub    $otp,$out
    mov    $len,%r10        # put len aside
    shr    \$4,$len        # len / 16
    jz    .Ltail_dec
    nop
.Loop_dec_xmm:
    movdqu    ($inp,$otp),%xmm0
    movdqa    ($otp),%xmm1
    pxor    %xmm0,%xmm1
    movdqu    %xmm1,($out,$otp)
    movdqa    %xmm0,($otp)
    lea    16($otp),$otp
    dec    $len
    jnz    .Loop_dec_xmm

    pxor    %xmm1,%xmm1
    and    \$15,%r10        # len % 16
    jz    .Ldone_dec

.Ltail_dec:
    mov    \$16,$len
    sub    %r10,$len
    xor    %eax,%eax
    xor    %r11,%r11
.Loop_dec_byte:
    mov    ($inp,$otp),%r11b
    mov    ($otp),%al
    xor    %r11b,%al
    mov    %al,($out,$otp)
    mov    %r11b,($otp)
    lea    1($otp),$otp
    dec    %r10
    jnz    .Loop_dec_byte

    xor    %eax,%eax
.Loop_dec_pad:
    mov    %al,($otp)
    lea    1($otp),$otp
    dec    $len
    jnz    .Loop_dec_pad

.Ldone_dec:
    mov    $otp,%rax
    ret
.cfi_endproc
.size    xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}
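# Hedged Perl model of xor128_encrypt_n_pad above (illustrative only, not
# used by the generator): the output is input XOR pad, the pad buffer is
# overwritten with the ciphertext, and the trailing partial block is
# zero-padded to a 16-byte boundary for the Poly1305 pass that follows.
# xor128_decrypt_n_pad differs only in that the ciphertext stored back
# into the pad buffer is the input rather than the output.
sub _xor128_encrypt_n_pad_model {
	my ($inp, $otp) = @_;				# byte strings of equal length
	my @i = unpack("C*", $inp);
	my @k = unpack("C*", $otp);
	my @c = map { $i[$_] ^ $k[$_] } 0 .. $#i;	# ciphertext bytes
	my $pad = (16 - @c % 16) % 16;			# zero padding, if any
	return (pack("C*", @c),				# what goes to the output
		pack("C*", @c) . ("\0" x $pad));	# what lands in the pad buffer
}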
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#        CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9";

$code.=<<___;
.extern    __imp_RtlVirtualUnwind
.type    se_handler,\@abi-omnipotent
.align    16
se_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub    \$64,%rsp

    mov    120($context),%rax    # pull context->Rax
    mov    248($context),%rbx    # pull context->Rip

    mov    8($disp),%rsi        # disp->ImageBase
    mov    56($disp),%r11        # disp->HandlerData

    mov    0(%r11),%r10d        # HandlerData[0]
    lea    (%rsi,%r10),%r10    # prologue label
    cmp    %r10,%rbx        # context->Rip<.Lprologue
    jb    .Lcommon_seh_tail

    mov    152($context),%rax    # pull context->Rsp

    mov    4(%r11),%r10d        # HandlerData[1]
    lea    (%rsi,%r10),%r10    # epilogue label
    cmp    %r10,%rbx        # context->Rip>=.Lepilogue
    jae    .Lcommon_seh_tail

    lea    48(%rax),%rax
    mov    -8(%rax),%rbx
    mov    -16(%rax),%rbp
    mov    -24(%rax),%r12
    mov    -32(%rax),%r13
    mov    -40(%rax),%r14
    mov    -48(%rax),%r15
    mov    %rbx,144($context)    # restore context->Rbx
    mov    %rbp,160($context)    # restore context->Rbp
    mov    %r12,216($context)    # restore context->R12
    mov    %r13,224($context)    # restore context->R13
    mov    %r14,232($context)    # restore context->R14
    mov    %r15,240($context)    # restore context->R15
    jmp    .Lcommon_seh_tail
.size    se_handler,.-se_handler

.type    avx_handler,\@abi-omnipotent
.align    16
avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub    \$64,%rsp

    mov    120($context),%rax    # pull context->Rax
    mov    248($context),%rbx    # pull context->Rip

    mov    8($disp),%rsi        # disp->ImageBase
    mov    56($disp),%r11        # disp->HandlerData

    mov    0(%r11),%r10d        # HandlerData[0]
    lea    (%rsi,%r10),%r10    # prologue label
    cmp    %r10,%rbx        # context->Rip<prologue label
    jb    .Lcommon_seh_tail

    mov    152($context),%rax    # pull context->Rsp

    mov    4(%r11),%r10d        # HandlerData[1]
    lea    (%rsi,%r10),%r10    # epilogue label
    cmp    %r10,%rbx        # context->Rip>=epilogue label
    jae    .Lcommon_seh_tail

    mov    208($context),%rax    # pull context->R11

    lea    0x50(%rax),%rsi
    lea    0xf8(%rax),%rax
    lea    512($context),%rdi    # &context.Xmm6
    mov    \$20,%ecx
    .long    0xa548f3fc        # cld; rep movsq
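    # the 20 quadwords copied above are the ten non-volatile XMM registers
    # (xmm6-xmm15) that the AVX code paths saved on the stack; writing them
    # into the CONTEXT record lets the unwinder restore them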
.Lcommon_seh_tail:
    mov    8(%rax),%rdi
    mov    16(%rax),%rsi
    mov    %rax,152($context)    # restore context->Rsp
    mov    %rsi,168($context)    # restore context->Rsi
    mov    %rdi,176($context)    # restore context->Rdi

    mov    40($disp),%rdi        # disp->ContextRecord
    mov    $context,%rsi        # context
    mov    \$154,%ecx        # sizeof(CONTEXT)
    .long    0xa548f3fc        # cld; rep movsq

    mov    $disp,%rsi
    xor    %rcx,%rcx        # arg1, UNW_FLAG_NHANDLER
    mov    8(%rsi),%rdx        # arg2, disp->ImageBase
    mov    0(%rsi),%r8        # arg3, disp->ControlPc
    mov    16(%rsi),%r9        # arg4, disp->FunctionEntry
    mov    40(%rsi),%r10        # disp->ContextRecord
    lea    56(%rsi),%r11        # &disp->HandlerData
    lea    24(%rsi),%r12        # &disp->EstablisherFrame
    mov    %r10,32(%rsp)        # arg5
    mov    %r11,40(%rsp)        # arg6
    mov    %r12,48(%rsp)        # arg7
    mov    %rcx,56(%rsp)        # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov    \$1,%eax        # ExceptionContinueSearch
    add    \$64,%rsp
    popfq
    pop    %r15
    pop    %r14
    pop    %r13
    pop    %r12
    pop    %rbp
    pop    %rbx
    pop    %rdi
    pop    %rsi
    ret
.size    avx_handler,.-avx_handler
.section    .pdata
.align    4
    .rva    .LSEH_begin_poly1305_init
    .rva    .LSEH_end_poly1305_init
    .rva    .LSEH_info_poly1305_init

    .rva    .LSEH_begin_poly1305_blocks
    .rva    .LSEH_end_poly1305_blocks
    .rva    .LSEH_info_poly1305_blocks

    .rva    .LSEH_begin_poly1305_emit
    .rva    .LSEH_end_poly1305_emit
    .rva    .LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
    .rva    .LSEH_begin_poly1305_blocks_avx
    .rva    .Lbase2_64_avx
    .rva    .LSEH_info_poly1305_blocks_avx_1

    .rva    .Lbase2_64_avx
    .rva    .Leven_avx
    .rva    .LSEH_info_poly1305_blocks_avx_2

    .rva    .Leven_avx
    .rva    .LSEH_end_poly1305_blocks_avx
    .rva    .LSEH_info_poly1305_blocks_avx_3

    .rva    .LSEH_begin_poly1305_emit_avx
    .rva    .LSEH_end_poly1305_emit_avx
    .rva    .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
    .rva    .LSEH_begin_poly1305_blocks_avx2
    .rva    .Lbase2_64_avx2
    .rva    .LSEH_info_poly1305_blocks_avx2_1

    .rva    .Lbase2_64_avx2
    .rva    .Leven_avx2
    .rva    .LSEH_info_poly1305_blocks_avx2_2

    .rva    .Leven_avx2
    .rva    .LSEH_end_poly1305_blocks_avx2
    .rva    .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
    .rva    .LSEH_begin_poly1305_blocks_avx512
    .rva    .LSEH_end_poly1305_blocks_avx512
    .rva    .LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section    .xdata
.align    8
.LSEH_info_poly1305_init:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
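# each .LSEH_info_* entry is a minimal UNWIND_INFO structure: ".byte 9,0,0,0"
# encodes version 1 with UNW_FLAG_EHANDLER set and no unwind codes, followed
# by the language-specific handler and its HandlerData (the prologue and
# epilogue labels the handlers above compare against context->Rip)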
.LSEH_info_poly1305_blocks:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .Lblocks_avx_body,.Lblocks_avx_epilogue        # HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue    # HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
    .byte    9,0,0,0
    .rva    avx_handler
    .rva    .Ldo_avx_body,.Ldo_avx_epilogue            # HandlerData[]

.LSEH_info_poly1305_emit_avx:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .Lblocks_avx2_body,.Lblocks_avx2_epilogue    # HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
    .byte    9,0,0,0
    .rva    se_handler
    .rva    .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue    # HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
    .byte    9,0,0,0
    .rva    avx_handler
    .rva    .Ldo_avx2_body,.Ldo_avx2_epilogue        # HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
    .byte    9,0,0,0
    .rva    avx_handler
    .rva    .Ldo_avx512_body,.Ldo_avx512_epilogue        # HandlerData[]
___
}
foreach (split('\n',$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;		# evaluate `...` constant expressions
    s/%r([a-z]+)#d/%e$1/g;		# 32-bit alias: %rax#d -> %eax, etc.
    s/%r([0-9]+)#d/%r$1d/g;		# 32-bit alias: %r8#d  -> %r8d, etc.
    s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;	# collapse size overrides like %x#%ymmN -> %xmmN

    print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";