www.tdfonline.com.ar

!C99Shell v. 2.1 [PHP 8 Update] [02.02.2022]!
Software: Apache/2.4.53 (Unix) OpenSSL/1.1.1o PHP/7.4.29 mod_perl/2.0.12 Perl/v5.34.1. PHP/7.4.29 uname -a: Linux vps-2738122-x 4.15.0-213-generic #224-Ubuntu SMP Mon Jun 19 13:30:12 UTC 2023 x86_64 uid=1(daemon) gid=1(daemon) grupos=1(daemon) Safe-mode: OFF (not secure) /opt/apex_tdfonline/proyectos/tdfonline/www/docs/openssl/crypto/chacha/asm/ drwxr-xr-x Free 11.74 GB of 61.93 GB (18.96%) Encoder Tools Proc. FTP brute Sec. SQL PHP-code Update Feedback Self remove Logout

#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3 faster than scalar code.
# But to harness overheads revert to transliteration of VSX code path
# from chacha-ppc module, which is also 4x"vertical", to handle inputs
# not longer than 256 bytes.

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
    $z=0;    # S/390 ABI
    $SIZE_T=4;
} else {
    $z=1;    # zSeries ABI
    $SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;

sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

    # Consider order in which variables are addressed by their
    # index:
    #
    #    a   b   c   d
    #
    #    0   4   8  12 < even round
    #    1   5   9  13
    #    2   6  10  14
    #    3   7  11  15
    #    0   5  10  15 < odd round
    #    1   6  11  12
    #    2   7   8  13
    #    3   4   9  14
    #
    # 'a', 'b' and 'd's are permanently allocated in registers,
    # @x[0..7,12..15], while 'c's are maintained in memory. If
    # you observe 'c' column, you'll notice that pair of 'c's is
    # invariant between rounds. This means that we have to reload
    # them once per round, in the middle. This is why you'll see
    # 'c' stores and loads in the middle, but none in the beginning
    # or end.

    alr    (@x[$a0],@x[$b0]);    # Q1
     alr    (@x[$a1],@x[$b1]);    # Q2
    xr    (@x[$d0],@x[$a0]);
     xr    (@x[$d1],@x[$a1]);
    rll    (@x[$d0],@x[$d0],16);
     rll    (@x[$d1],@x[$d1],16);

    alr    ($xc,@x[$d0]);
     alr    ($xc_,@x[$d1]);
    xr    (@x[$b0],$xc);
     xr    (@x[$b1],$xc_);
    rll    (@x[$b0],@x[$b0],12);
     rll    (@x[$b1],@x[$b1],12);

    alr    (@x[$a0],@x[$b0]);
     alr    (@x[$a1],@x[$b1]);
    xr    (@x[$d0],@x[$a0]);
     xr    (@x[$d1],@x[$a1]);
    rll    (@x[$d0],@x[$d0],8);
     rll    (@x[$d1],@x[$d1],8);

    alr    ($xc,@x[$d0]);
     alr    ($xc_,@x[$d1]);
    xr    (@x[$b0],$xc);
     xr    (@x[$b1],$xc_);
    rll    (@x[$b0],@x[$b0],7);
     rll    (@x[$b1],@x[$b1],7);

    stm    ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");    # reload pair of 'c's
    lm    ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");

    alr    (@x[$a2],@x[$b2]);    # Q3
     alr    (@x[$a3],@x[$b3]);    # Q4
    xr    (@x[$d2],@x[$a2]);
     xr    (@x[$d3],@x[$a3]);
    rll    (@x[$d2],@x[$d2],16);
     rll    (@x[$d3],@x[$d3],16);

    alr    ($xc,@x[$d2]);
     alr    ($xc_,@x[$d3]);
    xr    (@x[$b2],$xc);
     xr    (@x[$b3],$xc_);
    rll    (@x[$b2],@x[$b2],12);
     rll    (@x[$b3],@x[$b3],12);

    alr    (@x[$a2],@x[$b2]);
     alr    (@x[$a3],@x[$b3]);
    xr    (@x[$d2],@x[$a2]);
     xr    (@x[$d3],@x[$a3]);
    rll    (@x[$d2],@x[$d2],8);
     rll    (@x[$d3],@x[$d3],8);

    alr    ($xc,@x[$d2]);
     alr    ($xc_,@x[$d3]);
    xr    (@x[$b2],$xc);
     xr    (@x[$b3],$xc_);
    rll    (@x[$b2],@x[$b2],7);
     rll    (@x[$b3],@x[$b3],7);
}

sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

    vaf    (@x[$a0],@x[$a0],@x[$b0]);    # Q1
    vx    (@x[$d0],@x[$d0],@x[$a0]);
    verllf    (@x[$d0],@x[$d0],16);
    vaf    (@x[$a1],@x[$a1],@x[$b1]);    # Q2
    vx    (@x[$d1],@x[$d1],@x[$a1]);
    verllf    (@x[$d1],@x[$d1],16);
    vaf    (@x[$a2],@x[$a2],@x[$b2]);    # Q3
    vx    (@x[$d2],@x[$d2],@x[$a2]);
    verllf    (@x[$d2],@x[$d2],16);
    vaf    (@x[$a3],@x[$a3],@x[$b3]);    # Q4
    vx    (@x[$d3],@x[$d3],@x[$a3]);
    verllf    (@x[$d3],@x[$d3],16);

    vaf    (@x[$c0],@x[$c0],@x[$d0]);
    vx    (@x[$b0],@x[$b0],@x[$c0]);
    verllf    (@x[$b0],@x[$b0],12);
    vaf    (@x[$c1],@x[$c1],@x[$d1]);
    vx    (@x[$b1],@x[$b1],@x[$c1]);
    verllf    (@x[$b1],@x[$b1],12);
    vaf    (@x[$c2],@x[$c2],@x[$d2]);
    vx    (@x[$b2],@x[$b2],@x[$c2]);
    verllf    (@x[$b2],@x[$b2],12);
    vaf    (@x[$c3],@x[$c3],@x[$d3]);
    vx    (@x[$b3],@x[$b3],@x[$c3]);
    verllf    (@x[$b3],@x[$b3],12);

    vaf    (@x[$a0],@x[$a0],@x[$b0]);
    vx    (@x[$d0],@x[$d0],@x[$a0]);
    verllf    (@x[$d0],@x[$d0],8);
    vaf    (@x[$a1],@x[$a1],@x[$b1]);
    vx    (@x[$d1],@x[$d1],@x[$a1]);
    verllf    (@x[$d1],@x[$d1],8);
    vaf    (@x[$a2],@x[$a2],@x[$b2]);
    vx    (@x[$d2],@x[$d2],@x[$a2]);
    verllf    (@x[$d2],@x[$d2],8);
    vaf    (@x[$a3],@x[$a3],@x[$b3]);
    vx    (@x[$d3],@x[$d3],@x[$a3]);
    verllf    (@x[$d3],@x[$d3],8);

    vaf    (@x[$c0],@x[$c0],@x[$d0]);
    vx    (@x[$b0],@x[$b0],@x[$c0]);
    verllf    (@x[$b0],@x[$b0],7);
    vaf    (@x[$c1],@x[$c1],@x[$d1]);
    vx    (@x[$b1],@x[$b1],@x[$c1]);
    verllf    (@x[$b1],@x[$b1],7);
    vaf    (@x[$c2],@x[$c2],@x[$d2]);
    vx    (@x[$b2],@x[$b2],@x[$c2]);
    verllf    (@x[$b2],@x[$b2],7);
    vaf    (@x[$c3],@x[$c3],@x[$d3]);
    vx    (@x[$b3],@x[$b3],@x[$c3]);
    verllf    (@x[$b3],@x[$b3],7);
}

sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];

    vaf        (@a[$_],@a[$_],@b[$_]) for (0..5);
    vx        (@d[$_],@d[$_],@a[$_]) for (0..5);
    verllf        (@d[$_],@d[$_],16) for (0..5);

    vaf        (@c[$_],@c[$_],@d[$_]) for (0..5);
    vx        (@b[$_],@b[$_],@c[$_]) for (0..5);
    verllf        (@b[$_],@b[$_],12) for (0..5);

    vaf        (@a[$_],@a[$_],@b[$_]) for (0..5);
    vx        (@d[$_],@d[$_],@a[$_]) for (0..5);
    verllf        (@d[$_],@d[$_],8) for (0..5);

    vaf        (@c[$_],@c[$_],@d[$_]) for (0..5);
    vx        (@b[$_],@b[$_],@c[$_]) for (0..5);
    verllf        (@b[$_],@b[$_],7) for (0..5);

    vsldb        (@c[$_],@c[$_],@c[$_],8) for (0..5);
    vsldb        (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
    vsldb        (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE    ("s390x_arch.h");
TEXT    ();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL    ("ChaCha20_ctr32");
TYPE    ("ChaCha20_ctr32","\@function");
ALIGN    (32);
LABEL    ("ChaCha20_ctr32");
    larl    ("%r1","OPENSSL_s390xcap_P");

    lghi    ("%r0",64);
&{$z?    \&ltgr:\&ltr}    ($len,$len);        # len==0?
    bzr    ("%r14");
    lg    ("%r1","S390X_STFLE+16(%r1)");
&{$z?    \&clgr:\&clr}    ($len,"%r0");
    jle    (".Lshort");

    tmhh    ("%r1",0x4000);            # check for vx bit
    jnz    (".LChaCha20_ctr32_vx");

LABEL    (".Lshort");
&{$z?    \&aghi:\&ahi}    ($len,-64);
&{$z?    \&lghi:\&lhi}    ("%r1",-$frame);
&{$z?    \&stmg:\&stm}    ("%r6","%r15","6*$SIZE_T($sp)");
&{$z?    \&slgr:\&slr}    ($out,$inp);    # difference
    la    ($len,"0($inp,$len)");    # end of input minus 64
    larl    ("%r7",".Lsigma");
    lgr    ("%r0",$sp);
    la    ($sp,"0(%r1,$sp)");
&{$z?    \&stg:\&st}    ("%r0","0($sp)");

    lmg    ("%r8","%r11","0($key)");    # load key
    lmg    ("%r12","%r13","0($counter)");    # load counter
    lmg    ("%r6","%r7","0(%r7)");    # load sigma constant

    la    ("%r14","0($inp)");
&{$z?    \&stg:\&st}    ($out,"$frame+3*$SIZE_T($sp)");
&{$z?    \&stg:\&st}    ($len,"$frame+4*$SIZE_T($sp)");
    stmg    ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
    srlg    (@x[12],"%r12",32);    # 32-bit counter value
    j    (".Loop_outer");

ALIGN    (16);
LABEL    (".Loop_outer");
    lm    (@x[0],@x[7],"$stdframe+4*0($sp)");    # load x[0]-x[7]
    lm    (@t[0],@t[1],"$stdframe+4*10($sp)");    # load x[10]-x[11]
    lm    (@x[13],@x[15],"$stdframe+4*13($sp)");    # load x[13]-x[15]
    stm    (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
    lm    (@t[0],@t[1],"$stdframe+4*8($sp)");    # load x[8]-x[9]
    st    (@x[12],"$stdframe+4*12($sp)");    # save counter
&{$z?    \&stg:\&st}    ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
    lhi    ("%r14",10);
    j    (".Loop");

ALIGN    (4);
LABEL    (".Loop");
    ROUND    (0, 4, 8,12);
    ROUND    (0, 5,10,15);
    brct    ("%r14",".Loop");

&{$z?    \&lg:\&l}    ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
    stm    (@t[0],@t[1],"$stdframe+4*8+4*8($sp)");    # offload x[8]-x[9]
&{$z?    \&lmg:\&lm}    (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

    al    (@x[0],"$stdframe+4*0($sp)");    # accumulate key schedule
    al    (@x[1],"$stdframe+4*1($sp)");
    al    (@x[2],"$stdframe+4*2($sp)");
    al    (@x[3],"$stdframe+4*3($sp)");
    al    (@x[4],"$stdframe+4*4($sp)");
    al    (@x[5],"$stdframe+4*5($sp)");
    al    (@x[6],"$stdframe+4*6($sp)");
    al    (@x[7],"$stdframe+4*7($sp)");
    lrvr    (@x[0],@x[0]);
    lrvr    (@x[1],@x[1]);
    lrvr    (@x[2],@x[2]);
    lrvr    (@x[3],@x[3]);
    lrvr    (@x[4],@x[4]);
    lrvr    (@x[5],@x[5]);
    lrvr    (@x[6],@x[6]);
    lrvr    (@x[7],@x[7]);
    al    (@x[12],"$stdframe+4*12($sp)");
    al    (@x[13],"$stdframe+4*13($sp)");
    al    (@x[14],"$stdframe+4*14($sp)");
    al    (@x[15],"$stdframe+4*15($sp)");
    lrvr    (@x[12],@x[12]);
    lrvr    (@x[13],@x[13]);
    lrvr    (@x[14],@x[14]);
    lrvr    (@x[15],@x[15]);

    la    (@t[0],"0(@t[0],%r14)");    # reconstruct output pointer
&{$z?    \&clgr:\&clr}    ("%r14",@t[1]);
    jh    (".Ltail");

    x    (@x[0],"4*0(%r14)");    # xor with input
    x    (@x[1],"4*1(%r14)");
    st    (@x[0],"4*0(@t[0])");    # store output
    x    (@x[2],"4*2(%r14)");
    st    (@x[1],"4*1(@t[0])");
    x    (@x[3],"4*3(%r14)");
    st    (@x[2],"4*2(@t[0])");
    x    (@x[4],"4*4(%r14)");
    st    (@x[3],"4*3(@t[0])");
     lm    (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");    # load x[8]-x[11]
    x    (@x[5],"4*5(%r14)");
    st    (@x[4],"4*4(@t[0])");
    x    (@x[6],"4*6(%r14)");
     al    (@x[0],"$stdframe+4*8($sp)");
    st    (@x[5],"4*5(@t[0])");
    x    (@x[7],"4*7(%r14)");
     al    (@x[1],"$stdframe+4*9($sp)");
    st    (@x[6],"4*6(@t[0])");
    x    (@x[12],"4*12(%r14)");
     al    (@x[2],"$stdframe+4*10($sp)");
    st    (@x[7],"4*7(@t[0])");
    x    (@x[13],"4*13(%r14)");
     al    (@x[3],"$stdframe+4*11($sp)");
    st    (@x[12],"4*12(@t[0])");
    x    (@x[14],"4*14(%r14)");
    st    (@x[13],"4*13(@t[0])");
    x    (@x[15],"4*15(%r14)");
    st    (@x[14],"4*14(@t[0])");
     lrvr    (@x[0],@x[0]);
    st    (@x[15],"4*15(@t[0])");
     lrvr    (@x[1],@x[1]);
     lrvr    (@x[2],@x[2]);
     lrvr    (@x[3],@x[3]);
    lhi    (@x[12],1);
     x    (@x[0],"4*8(%r14)");
    al    (@x[12],"$stdframe+4*12($sp)");    # increment counter
     x    (@x[1],"4*9(%r14)");
     st    (@x[0],"4*8(@t[0])");
     x    (@x[2],"4*10(%r14)");
     st    (@x[1],"4*9(@t[0])");
     x    (@x[3],"4*11(%r14)");
     st    (@x[2],"4*10(@t[0])");
     st    (@x[3],"4*11(@t[0])");

&{$z?    \&clgr:\&clr}    ("%r14",@t[1]);    # done yet?
    la    ("%r14","64(%r14)");
    jl    (".Loop_outer");

LABEL    (".Ldone");
    xgr    ("%r0","%r0");
    xgr    ("%r1","%r1");
    xgr    ("%r2","%r2");
    xgr    ("%r3","%r3");
    stmg    ("%r0","%r3","$stdframe+4*4($sp)");    # wipe key copy
    stmg    ("%r0","%r3","$stdframe+4*12($sp)");

&{$z?    \&lmg:\&lm}    ("%r6","%r15","$frame+6*$SIZE_T($sp)");
    br    ("%r14");

ALIGN    (16);
LABEL    (".Ltail");
    la    (@t[1],"64($t[1])");
    stm    (@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z?    \&slgr:\&slr}    (@t[1],"%r14");
    lm    (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z?    \&lghi:\&lhi}    (@x[6],0);
    stm    (@x[12],@x[15],"$stdframe+4*12($sp)");
    al    (@x[0],"$stdframe+4*8($sp)");
    al    (@x[1],"$stdframe+4*9($sp)");
    al    (@x[2],"$stdframe+4*10($sp)");
    al    (@x[3],"$stdframe+4*11($sp)");
    lrvr    (@x[0],@x[0]);
    lrvr    (@x[1],@x[1]);
    lrvr    (@x[2],@x[2]);
    lrvr    (@x[3],@x[3]);
    stm    (@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL    (".Loop_tail");
    llgc    (@x[4],"0(@x[6],%r14)");
    llgc    (@x[5],"$stdframe(@x[6],$sp)");
    xr    (@x[5],@x[4]);
    stc    (@x[5],"0(@x[6],@t[0])");
    la    (@x[6],"1(@x[6])");
    brct    (@t[1],".Loop_tail");

    j    (".Ldone");
SIZE    ("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# 4x"vertical" layout minimizes amount of instructions, but pipeline
# runs underutilized [because of vector instructions' high latency].
# On the other hand minimum amount of data it takes to fully utilize
# the pipeline is higher, so that effectively, short inputs would be
# processed slower. Hence this code path targeting <=256 bytes lengths.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN    (32);
LABEL    ("ChaCha20_ctr32_4x");
LABEL    (".LChaCha20_ctr32_4x");
&{$z?    \&stmg:\&stm}    ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
    std    ("%f4","16*$SIZE_T+2*8($sp)");
    std    ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?    \&lghi:\&lhi}    ("%r1",-$FRAME);
    lgr    ("%r0",$sp);
    la    ($sp,"0(%r1,$sp)");
&{$z?    \&stg:\&st}    ("%r0","0($sp)");    # back-chain
if ($z) {
    std    ("%f8","$stdframe+8*0($sp)");
    std    ("%f9","$stdframe+8*1($sp)");
    std    ("%f10","$stdframe+8*2($sp)");
    std    ("%f11","$stdframe+8*3($sp)");
    std    ("%f12","$stdframe+8*4($sp)");
    std    ("%f13","$stdframe+8*5($sp)");
    std    ("%f14","$stdframe+8*6($sp)");
    std    ("%f15","$stdframe+8*7($sp)");
}
    larl    ("%r7",".Lsigma");
    lhi    ("%r0",10);
    lhi    ("%r1",0);

    vl    (@K[0],"0(%r7)");        # load sigma
    vl    (@K[1],"0($key)");        # load key
    vl    (@K[2],"16($key)");
    vl    (@K[3],"0($counter)");        # load counter

    vl    ($beperm,"0x40(%r7)");
    vl    ($xt1,"0x50(%r7)");
    vrepf    ($CTR,@K[3],0);
    vlvgf    (@K[3],"%r1",0);        # clear @K[3].word[0]
    vaf    ($CTR,$CTR,$xt1);

#LABEL    (".Loop_outer_4x");
    vlm    ($xa0,$xa3,"0x60(%r7)");    # load [smashed] sigma

    vrepf    ($xb0,@K[1],0);            # smash the key
    vrepf    ($xb1,@K[1],1);
    vrepf    ($xb2,@K[1],2);
    vrepf    ($xb3,@K[1],3);

    vrepf    ($xc0,@K[2],0);
    vrepf    ($xc1,@K[2],1);
    vrepf    ($xc2,@K[2],2);
    vrepf    ($xc3,@K[2],3);

    vlr    ($xd0,$CTR);
    vrepf    ($xd1,@K[3],1);
    vrepf    ($xd2,@K[3],2);
    vrepf    ($xd3,@K[3],3);

LABEL    (".Loop_4x");
    VX_lane_ROUND(0, 4, 8,12);
    VX_lane_ROUND(0, 5,10,15);
    brct    ("%r0",".Loop_4x");

    vaf    ($xd0,$xd0,$CTR);

    vmrhf    ($xt0,$xa0,$xa1);        # transpose data
    vmrhf    ($xt1,$xa2,$xa3);
    vmrlf    ($xt2,$xa0,$xa1);
    vmrlf    ($xt3,$xa2,$xa3);
    vpdi    ($xa0,$xt0,$xt1,0b0000);
    vpdi    ($xa1,$xt0,$xt1,0b0101);
    vpdi    ($xa2,$xt2,$xt3,0b0000);
    vpdi    ($xa3,$xt2,$xt3,0b0101);

    vmrhf    ($xt0,$xb0,$xb1);
    vmrhf    ($xt1,$xb2,$xb3);
    vmrlf    ($xt2,$xb0,$xb1);
    vmrlf    ($xt3,$xb2,$xb3);
    vpdi    ($xb0,$xt0,$xt1,0b0000);
    vpdi    ($xb1,$xt0,$xt1,0b0101);
    vpdi    ($xb2,$xt2,$xt3,0b0000);
    vpdi    ($xb3,$xt2,$xt3,0b0101);

    vmrhf    ($xt0,$xc0,$xc1);
    vmrhf    ($xt1,$xc2,$xc3);
    vmrlf    ($xt2,$xc0,$xc1);
    vmrlf    ($xt3,$xc2,$xc3);
    vpdi    ($xc0,$xt0,$xt1,0b0000);
    vpdi    ($xc1,$xt0,$xt1,0b0101);
    vpdi    ($xc2,$xt2,$xt3,0b0000);
    vpdi    ($xc3,$xt2,$xt3,0b0101);

    vmrhf    ($xt0,$xd0,$xd1);
    vmrhf    ($xt1,$xd2,$xd3);
    vmrlf    ($xt2,$xd0,$xd1);
    vmrlf    ($xt3,$xd2,$xd3);
    vpdi    ($xd0,$xt0,$xt1,0b0000);
    vpdi    ($xd1,$xt0,$xt1,0b0101);
    vpdi    ($xd2,$xt2,$xt3,0b0000);
    vpdi    ($xd3,$xt2,$xt3,0b0101);

    #vrepif    ($xt0,4);
    #vaf    ($CTR,$CTR,$xt0);        # next counter value

    vaf    ($xa0,$xa0,@K[0]);
    vaf    ($xb0,$xb0,@K[1]);
    vaf    ($xc0,$xc0,@K[2]);
    vaf    ($xd0,$xd0,@K[3]);

    vperm    ($xa0,$xa0,$xa0,$beperm);
    vperm    ($xb0,$xb0,$xb0,$beperm);
    vperm    ($xc0,$xc0,$xc0,$beperm);
    vperm    ($xd0,$xd0,$xd0,$beperm);

    #&{$z?    \&clgfi:\&clfi} ($len,0x40);
    #jl    (".Ltail_4x");

    vlm    ($xt0,$xt3,"0($inp)");

    vx    ($xt0,$xt0,$xa0);
    vx    ($xt1,$xt1,$xb0);
    vx    ($xt2,$xt2,$xc0);
    vx    ($xt3,$xt3,$xd0);

    vstm    ($xt0,$xt3,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    #je    (".Ldone_4x");

    vaf    ($xa0,$xa1,@K[0]);
    vaf    ($xb0,$xb1,@K[1]);
    vaf    ($xc0,$xc1,@K[2]);
    vaf    ($xd0,$xd1,@K[3]);

    vperm    ($xa0,$xa0,$xa0,$beperm);
    vperm    ($xb0,$xb0,$xb0,$beperm);
    vperm    ($xc0,$xc0,$xc0,$beperm);
    vperm    ($xd0,$xd0,$xd0,$beperm);

&{$z?    \&clgfi:\&clfi} ($len,0x40);
    jl    (".Ltail_4x");

    vlm    ($xt0,$xt3,"0($inp)");

    vx    ($xt0,$xt0,$xa0);
    vx    ($xt1,$xt1,$xb0);
    vx    ($xt2,$xt2,$xc0);
    vx    ($xt3,$xt3,$xd0);

    vstm    ($xt0,$xt3,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_4x");

    vaf    ($xa0,$xa2,@K[0]);
    vaf    ($xb0,$xb2,@K[1]);
    vaf    ($xc0,$xc2,@K[2]);
    vaf    ($xd0,$xd2,@K[3]);

    vperm    ($xa0,$xa0,$xa0,$beperm);
    vperm    ($xb0,$xb0,$xb0,$beperm);
    vperm    ($xc0,$xc0,$xc0,$beperm);
    vperm    ($xd0,$xd0,$xd0,$beperm);

&{$z?    \&clgfi:\&clfi} ($len,0x40);
    jl    (".Ltail_4x");

    vlm    ($xt0,$xt3,"0($inp)");

    vx    ($xt0,$xt0,$xa0);
    vx    ($xt1,$xt1,$xb0);
    vx    ($xt2,$xt2,$xc0);
    vx    ($xt3,$xt3,$xd0);

    vstm    ($xt0,$xt3,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_4x");

    vaf    ($xa0,$xa3,@K[0]);
    vaf    ($xb0,$xb3,@K[1]);
    vaf    ($xc0,$xc3,@K[2]);
    vaf    ($xd0,$xd3,@K[3]);

    vperm    ($xa0,$xa0,$xa0,$beperm);
    vperm    ($xb0,$xb0,$xb0,$beperm);
    vperm    ($xc0,$xc0,$xc0,$beperm);
    vperm    ($xd0,$xd0,$xd0,$beperm);

&{$z?    \&clgfi:\&clfi} ($len,0x40);
    jl    (".Ltail_4x");

    vlm    ($xt0,$xt3,"0($inp)");

    vx    ($xt0,$xt0,$xa0);
    vx    ($xt1,$xt1,$xb0);
    vx    ($xt2,$xt2,$xc0);
    vx    ($xt3,$xt3,$xd0);

    vstm    ($xt0,$xt3,"0($out)");

    #la    $inp,0x40($inp));
    #la    $out,0x40($out));
    #lhi    %r0,10);
    #&{$z?    \&aghi:\&ahi}    $len,-0x40);
    #jne    .Loop_outer_4x);

LABEL    (".Ldone_4x");
if (!$z) {
    ld    ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
    ld    ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
    ld    ("%f8","$stdframe+8*0($sp)");
    ld    ("%f9","$stdframe+8*1($sp)");
    ld    ("%f10","$stdframe+8*2($sp)");
    ld    ("%f11","$stdframe+8*3($sp)");
    ld    ("%f12","$stdframe+8*4($sp)");
    ld    ("%f13","$stdframe+8*5($sp)");
    ld    ("%f14","$stdframe+8*6($sp)");
    ld    ("%f15","$stdframe+8*7($sp)");
}
&{$z?    \&lmg:\&lm}    ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
    la    ($sp,"$FRAME($sp)");
    br    ("%r14");

ALIGN    (16);
LABEL    (".Ltail_4x");
if (!$z) {
    vlr    ($xt0,$xb0);
    ld    ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
    ld    ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

    vst    ($xa0,"$stdframe+0x00($sp)");
    vst    ($xt0,"$stdframe+0x10($sp)");
    vst    ($xc0,"$stdframe+0x20($sp)");
    vst    ($xd0,"$stdframe+0x30($sp)");
} else {
    vlr    ($xt0,$xc0);
    ld    ("%f8","$stdframe+8*0($sp)");
    ld    ("%f9","$stdframe+8*1($sp)");
    ld    ("%f10","$stdframe+8*2($sp)");
    ld    ("%f11","$stdframe+8*3($sp)");
    vlr    ($xt1,$xd0);
    ld    ("%f12","$stdframe+8*4($sp)");
    ld    ("%f13","$stdframe+8*5($sp)");
    ld    ("%f14","$stdframe+8*6($sp)");
    ld    ("%f15","$stdframe+8*7($sp)");

    vst    ($xa0,"$stdframe+0x00($sp)");
    vst    ($xb0,"$stdframe+0x10($sp)");
    vst    ($xt0,"$stdframe+0x20($sp)");
    vst    ($xt1,"$stdframe+0x30($sp)");
}
    lghi    ("%r1",0);

LABEL    (".Loop_tail_4x");
    llgc    ("%r5","0(%r1,$inp)");
    llgc    ("%r6","$stdframe(%r1,$sp)");
    xr    ("%r6","%r5");
    stc    ("%r6","0(%r1,$out)");
    la    ("%r1","1(%r1)");
    brct    ($len,".Loop_tail_4x");

&{$z?    \&lmg:\&lm}    ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
    la    ($sp,"$FRAME($sp)");
    br    ("%r14");
SIZE    ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}

########################################################################
# 6x"horizontal" layout is optimal fit for the platform in its current
# shape, more specifically for given vector instructions' latency. Well,
# computational part of 8x"vertical" would be faster, but it consumes
# all registers and dealing with that will diminish the return...
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe + 4*16;

GLOBL    ("ChaCha20_ctr32_vx");
ALIGN    (32);
LABEL    ("ChaCha20_ctr32_vx");
LABEL    (".LChaCha20_ctr32_vx");
&{$z?    \&clgfi:\&clfi}    ($len,256);
    jle    (".LChaCha20_ctr32_4x");
&{$z?    \&stmg:\&stm}    ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
    std    ("%f4","16*$SIZE_T+2*8($sp)");
    std    ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?    \&lghi:\&lhi}    ("%r1",-$FRAME);
    lgr    ("%r0",$sp);
    la    ($sp,"0(%r1,$sp)");
&{$z?    \&stg:\&st}    ("%r0","0($sp)");    # back-chain
if ($z) {
    std    ("%f8","$FRAME-8*8($sp)");
    std    ("%f9","$FRAME-8*7($sp)");
    std    ("%f10","$FRAME-8*6($sp)");
    std    ("%f11","$FRAME-8*5($sp)");
    std    ("%f12","$FRAME-8*4($sp)");
    std    ("%f13","$FRAME-8*3($sp)");
    std    ("%f14","$FRAME-8*2($sp)");
    std    ("%f15","$FRAME-8*1($sp)");
}
    larl    ("%r7",".Lsigma");
    lhi    ("%r0",10);

    vlm    (@K[1],@K[2],"0($key)");    # load key
    vl    (@K[3],"0($counter)");        # load counter

    vlm    (@K[0],"$beperm","0(%r7)");    # load sigma, increments, ...

LABEL    (".Loop_outer_vx");
    vlr    ($a0,@K[0]);
    vlr    ($b0,@K[1]);
    vlr    ($a1,@K[0]);
    vlr    ($b1,@K[1]);
    vlr    ($a2,@K[0]);
    vlr    ($b2,@K[1]);
    vlr    ($a3,@K[0]);
    vlr    ($b3,@K[1]);
    vlr    ($a4,@K[0]);
    vlr    ($b4,@K[1]);
    vlr    ($a5,@K[0]);
    vlr    ($b5,@K[1]);

    vlr    ($d0,@K[3]);
    vaf    ($d1,@K[3],$t1);        # K[3]+1
    vaf    ($d2,@K[3],$t2);        # K[3]+2
    vaf    ($d3,@K[3],$t3);        # K[3]+3
    vaf    ($d4,$d2,$t2);            # K[3]+4
    vaf    ($d5,$d2,$t3);            # K[3]+5

    vlr    ($c0,@K[2]);
    vlr    ($c1,@K[2]);
    vlr    ($c2,@K[2]);
    vlr    ($c3,@K[2]);
    vlr    ($c4,@K[2]);
    vlr    ($c5,@K[2]);

    vlr    ($t1,$d1);
    vlr    ($t2,$d2);
    vlr    ($t3,$d3);

ALIGN    (4);
LABEL    (".Loop_vx");

    VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
         $b0,$b1,$b2,$b3,$b4,$b5,
         $c0,$c1,$c2,$c3,$c4,$c5,
         $d0,$d1,$d2,$d3,$d4,$d5,
         0);

    VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
         $b0,$b1,$b2,$b3,$b4,$b5,
         $c0,$c1,$c2,$c3,$c4,$c5,
         $d0,$d1,$d2,$d3,$d4,$d5,
         1);

    brct    ("%r0",".Loop_vx");

    vaf    ($a0,$a0,@K[0]);
    vaf    ($b0,$b0,@K[1]);
    vaf    ($c0,$c0,@K[2]);
    vaf    ($d0,$d0,@K[3]);
    vaf    ($a1,$a1,@K[0]);
    vaf    ($d1,$d1,$t1);            # +K[3]+1

    vperm    ($a0,$a0,$a0,$beperm);
    vperm    ($b0,$b0,$b0,$beperm);
    vperm    ($c0,$c0,$c0,$beperm);
    vperm    ($d0,$d0,$d0,$beperm);

&{$z?    \&clgfi:\&clfi}    ($len,0x40);
    jl    (".Ltail_vx");

    vaf    ($d2,$d2,$t2);            # +K[3]+2
    vaf    ($d3,$d3,$t3);            # +K[3]+3
    vlm    ($t0,$t3,"0($inp)");

    vx    ($a0,$a0,$t0);
    vx    ($b0,$b0,$t1);
    vx    ($c0,$c0,$t2);
    vx    ($d0,$d0,$t3);

    vlm    (@K[0],$t3,"0(%r7)");        # re-load sigma and increments

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_vx");

    vaf    ($b1,$b1,@K[1]);
    vaf    ($c1,$c1,@K[2]);

    vperm    ($a0,$a1,$a1,$beperm);
    vperm    ($b0,$b1,$b1,$beperm);
    vperm    ($c0,$c1,$c1,$beperm);
    vperm    ($d0,$d1,$d1,$beperm);

&{$z?    \&clgfi:\&clfi} ($len,0x40);
    jl    (".Ltail_vx");

    vlm    ($a1,$d1,"0($inp)");

    vx    ($a0,$a0,$a1);
    vx    ($b0,$b0,$b1);
    vx    ($c0,$c0,$c1);
    vx    ($d0,$d0,$d1);

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_vx");

    vaf    ($a2,$a2,@K[0]);
    vaf    ($b2,$b2,@K[1]);
    vaf    ($c2,$c2,@K[2]);

    vperm    ($a0,$a2,$a2,$beperm);
    vperm    ($b0,$b2,$b2,$beperm);
    vperm    ($c0,$c2,$c2,$beperm);
    vperm    ($d0,$d2,$d2,$beperm);

&{$z?    \&clgfi:\&clfi}    ($len,0x40);
    jl    (".Ltail_vx");

    vlm    ($a1,$d1,"0($inp)");

    vx    ($a0,$a0,$a1);
    vx    ($b0,$b0,$b1);
    vx    ($c0,$c0,$c1);
    vx    ($d0,$d0,$d1);

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_vx");

    vaf    ($a3,$a3,@K[0]);
    vaf    ($b3,$b3,@K[1]);
    vaf    ($c3,$c3,@K[2]);
    vaf    ($d2,@K[3],$t3);        # K[3]+3

    vperm    ($a0,$a3,$a3,$beperm);
    vperm    ($b0,$b3,$b3,$beperm);
    vperm    ($c0,$c3,$c3,$beperm);
    vperm    ($d0,$d3,$d3,$beperm);

&{$z?    \&clgfi:\&clfi}    ($len,0x40);
    jl    (".Ltail_vx");

    vaf    ($d3,$d2,$t1);            # K[3]+4
    vlm    ($a1,$d1,"0($inp)");

    vx    ($a0,$a0,$a1);
    vx    ($b0,$b0,$b1);
    vx    ($c0,$c0,$c1);
    vx    ($d0,$d0,$d1);

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_vx");

    vaf    ($a4,$a4,@K[0]);
    vaf    ($b4,$b4,@K[1]);
    vaf    ($c4,$c4,@K[2]);
    vaf    ($d4,$d4,$d3);            # +K[3]+4
    vaf    ($d3,$d3,$t1);            # K[3]+5
    vaf    (@K[3],$d2,$t3);        # K[3]+=6

    vperm    ($a0,$a4,$a4,$beperm);
    vperm    ($b0,$b4,$b4,$beperm);
    vperm    ($c0,$c4,$c4,$beperm);
    vperm    ($d0,$d4,$d4,$beperm);

&{$z?    \&clgfi:\&clfi}    ($len,0x40);
    jl    (".Ltail_vx");

    vlm    ($a1,$d1,"0($inp)");

    vx    ($a0,$a0,$a1);
    vx    ($b0,$b0,$b1);
    vx    ($c0,$c0,$c1);
    vx    ($d0,$d0,$d1);

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    je    (".Ldone_vx");

    vaf    ($a5,$a5,@K[0]);
    vaf    ($b5,$b5,@K[1]);
    vaf    ($c5,$c5,@K[2]);
    vaf    ($d5,$d5,$d3);            # +K[3]+5

    vperm    ($a0,$a5,$a5,$beperm);
    vperm    ($b0,$b5,$b5,$beperm);
    vperm    ($c0,$c5,$c5,$beperm);
    vperm    ($d0,$d5,$d5,$beperm);

&{$z?    \&clgfi:\&clfi} ($len,0x40);
    jl    (".Ltail_vx");

    vlm    ($a1,$d1,"0($inp)");

    vx    ($a0,$a0,$a1);
    vx    ($b0,$b0,$b1);
    vx    ($c0,$c0,$c1);
    vx    ($d0,$d0,$d1);

    vstm    ($a0,$d0,"0($out)");

    la    ($inp,"0x40($inp)");
    la    ($out,"0x40($out)");
    lhi    ("%r0",10);
&{$z?    \&aghi:\&ahi}    ($len,-0x40);
    jne    (".Loop_outer_vx");

LABEL    (".Ldone_vx");
if (!$z) {
    ld    ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
    ld    ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
    ld    ("%f8","$FRAME-8*8($sp)");
    ld    ("%f9","$FRAME-8*7($sp)");
    ld    ("%f10","$FRAME-8*6($sp)");
    ld    ("%f11","$FRAME-8*5($sp)");
    ld    ("%f12","$FRAME-8*4($sp)");
    ld    ("%f13","$FRAME-8*3($sp)");
    ld    ("%f14","$FRAME-8*2($sp)");
    ld    ("%f15","$FRAME-8*1($sp)");
}
&{$z?    \&lmg:\&lm}    ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
    la    ($sp,"$FRAME($sp)");
    br    ("%r14");

ALIGN    (16);
LABEL    (".Ltail_vx");
if (!$z) {
    ld    ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
    ld    ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
    ld    ("%f8","$FRAME-8*8($sp)");
    ld    ("%f9","$FRAME-8*7($sp)");
    ld    ("%f10","$FRAME-8*6($sp)");
    ld    ("%f11","$FRAME-8*5($sp)");
    ld    ("%f12","$FRAME-8*4($sp)");
    ld    ("%f13","$FRAME-8*3($sp)");
    ld    ("%f14","$FRAME-8*2($sp)");
    ld    ("%f15","$FRAME-8*1($sp)");
}
    vstm    ($a0,$d0,"$stdframe($sp)");
    lghi    ("%r1",0);

LABEL    (".Loop_tail_vx");
    llgc    ("%r5","0(%r1,$inp)");
    llgc    ("%r6","$stdframe(%r1,$sp)");
    xr    ("%r6","%r5");
    stc    ("%r6","0(%r1,$out)");
    la    ("%r1","1(%r1)");
    brct    ($len,".Loop_tail_vx");

&{$z?    \&lmg:\&lm}    ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
    la    ($sp,"$FRAME($sp)");
    br    ("%r14");
SIZE    ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

ALIGN    (32);
LABEL    (".Lsigma");
LONG    (0x61707865,0x3320646e,0x79622d32,0x6b206574);    # endian-neutral sigma
LONG    (1,0,0,0);
LONG    (2,0,0,0);
LONG    (3,0,0,0);
LONG    (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);    # byte swap

LONG    (0,1,2,3);
LONG    (0x61707865,0x61707865,0x61707865,0x61707865);    # smashed sigma
LONG    (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG    (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG    (0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ    ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN    (4);

PERLASM_END();
:: Command execute ::
Enter:	Select: