...
Run Format

Text file src/crypto/internal/fips140/nistec/p256_asm_s390x.s

Documentation: crypto/internal/fips140/nistec

     1// Copyright 2016 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5//go:build !purego
     6
     7#include "textflag.h"
     8#include "go_asm.h"
     9// Read-only constant pools: the prime P256, byte-selection masks for VPERM, and 2^256 mod P256. Code addresses them through a CPOOL base register.
    10DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256, high 128 bits (pool bytes 0-15)
    11DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256, high 128 bits
    12DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256, low 128 bits (pool bytes 16-31)
    13DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256, low 128 bits
    14DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    15DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    16DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    17DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    18DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    19DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    20DATA p256<>+0x50(SB)/8, $0x0706050403020100 // LE2BE permute mask
    21DATA p256<>+0x58(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
    22DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256, high 128 bits (pool bytes 0-15)
    23DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256, high 128 bits
    24DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256, low 128 bits (pool bytes 16-31)
    25DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256, low 128 bits
    26DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    27DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    28DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    29DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    30DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    31DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    32DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    33DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    34DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    35DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    36DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    37DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    38DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    39DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    40DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    41DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    42GLOBL p256<>(SB), 8, $96 // flag 8 is RODATA in textflag.h; 96 = the 12 eight-byte entries above
    43GLOBL p256mul<>(SB), 8, $160 // flag 8 is RODATA in textflag.h; 160 = the 20 eight-byte entries above
    44
    45// ---------------------------------------
    46// p256NegCond: val <- P256 - val when cond != 0; val is rewritten unchanged when cond == 0.
    47// func p256NegCond(val *p256Element, cond int)
    48#define VALP    R1
    49#define POOL    R4
    50
    51#define VLO   V0
    52#define VHI   V1
    53#define NLO   V2
    54#define NHI   V3
    55
    56#define PLO   V30
    57#define PHI   V31
    58
    59#define ZERO  V4
    60#define KEEP  V5
    61#define BRW   V6
    62TEXT ·p256NegCond(SB), NOSPLIT, $0
    63	// KEEP = all ones if cond == 0, all zeros otherwise.
    64	VLREPG cond+8(FP), KEEP
    65	VZERO  ZERO
    66	VCEQG  KEEP, ZERO, KEEP
    67	// Load val; VPDI $0x4 swaps the two doublewords of each 16-byte half.
    68	MOVD val+0(FP), VALP
    69	VL   0(VALP), VLO
    70	VL   16(VALP), VHI
    71	VPDI $0x4, VLO, VLO, VLO
    72	VPDI $0x4, VHI, VHI, VHI
    73	// PHI||PLO = P256 from the constant pool.
    74	MOVD $p256mul<>+0x00(SB), POOL
    75	VL   0(POOL), PHI
    76	VL   16(POOL), PLO
    77	// NHI||NLO = P256 - val, with the borrow carried through BRW.
    78	VSCBIQ VLO, PLO, BRW
    79	VSQ    VLO, PLO, NLO
    80	VSBIQ  PHI, VHI, BRW, NHI
    81	// Keep val where KEEP is set, take the negation elsewhere.
    82	VSEL VLO, NLO, KEEP, VLO
    83	VSEL VHI, NHI, KEEP, VHI
    84	// Swap the doublewords back and store in place.
    85	VPDI $0x4, VLO, VLO, VLO
    86	VST  VLO, 0(VALP)
    87	VPDI $0x4, VHI, VHI, VHI
    88	VST  VHI, 16(VALP)
    89	RET
    90
    91#undef VALP
    92#undef POOL
    93#undef VLO
    94#undef VHI
    95#undef NLO
    96#undef NHI
    97#undef PLO
    98#undef PHI
    99#undef ZERO
   100#undef KEEP
   101#undef BRW
   102
   103// ---------------------------------------
   104// Branch-free conditional move of a 96-byte point: res <- b when cond == 0, res <- a otherwise.
   105// func p256MovCond(res, a, b *P256Point, cond int)
   106#define DSTP    R1
   107#define APTR    R2
   108#define BPTR    R3
   109
   110#define AXL    V0
   111#define AXH    V1
   112#define AYL    V2
   113#define AYH    V3
   114#define AZL    V4
   115#define AZH    V5
   116#define BXL    V6
   117#define BXH    V7
   118#define BYL    V8
   119#define BYH    V9
   120#define BZL    V10
   121#define BZH    V11
   122
   123#define ZERO  V18
   124#define USEB  V19
   125TEXT ·p256MovCond(SB), NOSPLIT, $0
   126	MOVD a+8(FP), APTR
   127	MOVD b+16(FP), BPTR
   128	// Read both points in full before anything is written to res.
   129	VL 0(APTR), AXH
   130	VL 0(BPTR), BXH
   131	VL 16(APTR), AXL
   132	VL 16(BPTR), BXL
   133	VL 32(APTR), AYH
   134	VL 32(BPTR), BYH
   135	VL 48(APTR), AYL
   136	VL 48(BPTR), BYL
   137	VL 64(APTR), AZH
   138	VL 64(BPTR), BZH
   139	VL 80(APTR), AZL
   140	VL 80(BPTR), BZL
   141	// USEB = all ones when cond == 0, all zeros otherwise.
   142	VLREPG cond+24(FP), USEB
   143	VZERO  ZERO
   144	VCEQG  USEB, ZERO, USEB
   145	// Take b where USEB is set, a elsewhere; the result lands in a's registers.
   146	VSEL BXL, AXL, USEB, AXL
   147	VSEL BXH, AXH, USEB, AXH
   148	VSEL BYL, AYL, USEB, AYL
   149	VSEL BYH, AYH, USEB, AYH
   150	VSEL BZL, AZL, USEB, AZL
   151	VSEL BZH, AZH, USEB, AZH
   152
   153	MOVD res+0(FP), DSTP
   154	VST AXH, 0(DSTP)
   155	VST AXL, 16(DSTP)
   156	VST AYH, 32(DSTP)
   157	VST AYL, 48(DSTP)
   158	VST AZH, 64(DSTP)
   159	VST AZL, 80(DSTP)
   160
   161	RET
   162
   163#undef DSTP
   164#undef APTR
   165#undef BPTR
   166#undef AXL
   167#undef AXH
   168#undef AYL
   169#undef AYH
   170#undef AZL
   171#undef AZH
   172#undef BXL
   173#undef BXH
   174#undef BYL
   175#undef BYH
   176#undef BZL
   177#undef BZH
   178#undef ZERO
   179#undef USEB
   180
   181// ---------------------------------------
   182// Constant time table access: every 96-byte entry is read on every call.
   183// Entry i (at table offset (i-1)*96) is kept when idx == i; i runs from 1 to 16.
   184// (idx 0 matches no entry, so res is all zero: implicitly the point at infinity)
   185// func p256Select(res *P256Point, table *p256Table, idx int)
   186#define OUTP    R1
   187#define TBLP    R2
   188#define ITER    R4
   189
   190#define RXL    V0
   191#define RXH    V1
   192#define RYL    V2
   193#define RYH    V3
   194#define RZL    V4
   195#define RZH    V5
   196#define EXL    V6
   197#define EXH    V7
   198#define EYL    V8
   199#define EYH    V9
   200#define EZL    V10
   201#define EZH    V11
   202
   203#define STEP  V18
   204#define WANT  V19
   205#define HIT   V20
   206#define POS   V21
   207TEXT ·p256Select(SB), NOSPLIT, $0
   208	VZERO RXH
   209	VZERO RXL
   210	VZERO RYH
   211	VZERO RYL
   212	VZERO RZH
   213	VZERO RZL
   214	// WANT = byte 7 of idx replicated into every byte; POS starts at 1 in every byte.
   215	MOVD   table+8(FP), TBLP
   216	VLREPB idx+(16+7)(FP), WANT
   217	VREPIB $1, STEP
   218	VREPIB $1, POS
   219	MOVD   $1, ITER
   220
   221scan_table:
   222	VCEQG POS, WANT, HIT
   223
   224	VL 0(TBLP), EXH
   225	VL 16(TBLP), EXL
   226	VL 32(TBLP), EYH
   227	VL 48(TBLP), EYL
   228	VL 64(TBLP), EZH
   229	VL 80(TBLP), EZL
   230
   231	VSEL EXL, RXL, HIT, RXL
   232	VSEL EXH, RXH, HIT, RXH
   233	VSEL EYL, RYL, HIT, RYL
   234	VSEL EYH, RYH, HIT, RYH
   235	VSEL EZL, RZL, HIT, RZL
   236	VSEL EZH, RZH, HIT, RZH
   237
   238	ADD  $96, TBLP
   239	VAB  POS, STEP, POS
   240	ADDW $1, ITER
   241	CMPW ITER, $17
   242	BLT  scan_table
   243
   244	MOVD res+0(FP), OUTP
   245	VST RXH, 0(OUTP)
   246	VST RXL, 16(OUTP)
   247	VST RYH, 32(OUTP)
   248	VST RYL, 48(OUTP)
   249	VST RZH, 64(OUTP)
   250	VST RZL, 80(OUTP)
   251	RET
   252
   253#undef OUTP
   254#undef TBLP
   255#undef ITER
   256#undef RXL
   257#undef RXH
   258#undef RYL
   259#undef RYH
   260#undef RZL
   261#undef RZH
   262#undef EXL
   263#undef EXH
   264#undef EYL
   265#undef EYH
   266#undef EZL
   267#undef EZH
   268#undef STEP
   269#undef WANT
   270#undef HIT
   271#undef POS
   272
   273// ---------------------------------------
   274// p256FromMont runs four identical reduction rounds over in, then conditionally subtracts P256 and stores the result to res. Presumably this is the conversion out of the Montgomery domain (in * 2^-256 mod P256) - confirm against the Go caller.
   275//  func p256FromMont(res, in *p256Element)
   276#define res_ptr R1
   277#define x_ptr   R2
   278#define CPOOL   R4
   279
   280#define T0   V0
   281#define T1   V1
   282#define T2   V2 // limb above T1||T0: accumulates the carry-outs of each round
   283#define TT0  V3
   284#define TT1  V4
   285
   286#define ZER   V6
   287#define SEL1  V7
   288#define SEL2  V8
   289#define CAR1  V9
   290#define CAR2  V10
   291#define RED1  V11
   292#define RED2  V12
   293#define PL    V13
   294#define PH    V14
   295
   296TEXT ·p256FromMont(SB), NOSPLIT, $0
   297	MOVD res+0(FP), res_ptr
   298	MOVD in+8(FP), x_ptr
   299	// Constants: PH||PL = P256, SEL1/SEL2 = VPERM masks from p256<>+0x40 and p256<>+0x30.
   300	VZERO T2
   301	VZERO ZER
   302	MOVD  $p256<>+0x00(SB), CPOOL
   303	VL    16(CPOOL), PL
   304	VL    0(CPOOL), PH
   305	VL    48(CPOOL), SEL2
   306	VL    64(CPOOL), SEL1
   307	// Load in; VPDI $0x4 swaps the two doublewords of each 16-byte half.
   308	VL   (0*16)(x_ptr), T0
   309	VPDI $0x4, T0, T0, T0
   310	VL   (1*16)(x_ptr), T1
   311	VPDI $0x4, T1, T1, T1
   312	// Each round: build RED2||RED1 with VPERM, shift T2||T1||T0 by 8 bytes (VSLDB $8), then add RED2||RED1 with carry into T2.
   313	// First round
   314	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   315	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   316	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   317
   318	VSLDB $8, T1, T0, T0
   319	VSLDB $8, T2, T1, T1
   320
   321	VACCQ  T0, RED1, CAR1
   322	VAQ    T0, RED1, T0
   323	VACCCQ T1, RED2, CAR1, CAR2
   324	VACQ   T1, RED2, CAR1, T1
   325	VAQ    T2, CAR2, T2
   326
   327	// Second round
   328	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   329	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   330	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   331
   332	VSLDB $8, T1, T0, T0
   333	VSLDB $8, T2, T1, T1
   334
   335	VACCQ  T0, RED1, CAR1
   336	VAQ    T0, RED1, T0
   337	VACCCQ T1, RED2, CAR1, CAR2
   338	VACQ   T1, RED2, CAR1, T1
   339	VAQ    T2, CAR2, T2
   340
   341	// Third round
   342	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   343	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   344	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   345
   346	VSLDB $8, T1, T0, T0
   347	VSLDB $8, T2, T1, T1
   348
   349	VACCQ  T0, RED1, CAR1
   350	VAQ    T0, RED1, T0
   351	VACCCQ T1, RED2, CAR1, CAR2
   352	VACQ   T1, RED2, CAR1, T1
   353	VAQ    T2, CAR2, T2
   354
   355	// Last round
   356	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   357	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   358	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   359
   360	VSLDB $8, T1, T0, T0
   361	VSLDB $8, T2, T1, T1
   362
   363	VACCQ  T0, RED1, CAR1
   364	VAQ    T0, RED1, T0
   365	VACCCQ T1, RED2, CAR1, CAR2
   366	VACQ   T1, RED2, CAR1, T1
   367	VAQ    T2, CAR2, T2
   368
   369	// ---------------------------------------------------
   370	// Final correction: TT1||TT0 = T1||T0 - P256; T2 absorbs the borrow and becomes the select mask.
   371	VSCBIQ  PL, T0, CAR1
   372	VSQ     PL, T0, TT0
   373	VSBCBIQ T1, PH, CAR1, CAR2
   374	VSBIQ   T1, PH, CAR1, TT1
   375	VSBIQ   T2, ZER, CAR2, T2
   376
   377	// what output to use, TT1||TT0 or T1||T0?
   378	VSEL T0, TT0, T2, T0
   379	VSEL T1, TT1, T2, T1
   380	// Swap the doublewords back (into TT0/TT1) and store to res.
   381	VPDI $0x4, T0, T0, TT0
   382	VST  TT0, (0*16)(res_ptr)
   383	VPDI $0x4, T1, T1, TT1
   384	VST  TT1, (1*16)(res_ptr)
   385	RET
   386
   387#undef res_ptr
   388#undef x_ptr
   389#undef CPOOL
   390#undef T0
   391#undef T1
   392#undef T2
   393#undef TT0
   394#undef TT1
   395#undef ZER
   396#undef SEL1
   397#undef SEL2
   398#undef CAR1
   399#undef CAR2
   400#undef RED1
   401#undef RED2
   402#undef PL
   403#undef PH
   404
   405// Constant time table access: every 64-byte entry is read on every call.
   406// Entry i (at table offset (i-1)*64) is kept when idx == i; i runs from 1 to 32.
   407// (idx 0 matches no entry, so res is all zero: implicitly the point at infinity)
   408// Formerly: func p256SelectBase(point *p256Point, table []p256Point, idx int)
   409// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
   410
   411#define P3ptr   R1
   412#define P1ptr   R2
   413#define COUNT   R4
   414#define CPOOL   R5 // not used by the body; kept so the #undef CPOOL below stays paired
   415
   416#define X1L    V0
   417#define X1H    V1
   418#define Y1L    V2
   419#define Y1H    V3
   420#define Z1L    V4
   421#define Z1H    V5
   422#define X2L    V6
   423#define X2H    V7
   424#define Y2L    V8
   425#define Y2H    V9
   426#define Z2L    V10
   427#define Z2H    V11
   428#define LE2BE  V12 // NOTE(review): not used in this function and never #undef'd; left defined in case later code relies on it
   429
   430#define ONE   V18
   431#define IDX   V19
   432#define SEL1  V20
   433#define SEL2  V21
   434
   435TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   436	MOVD   res+0(FP), P3ptr
   437	MOVD   table+8(FP), P1ptr
   438	// IDX = byte 7 of idx replicated into every byte; SEL2 counts 1, 2, ... in every byte.
   439	VLREPB idx+(16+7)(FP), IDX
   440	VREPIB $1, ONE
   441	VREPIB $1, SEL2
   442	MOVD   $1, COUNT
   443	// (The former load of the LE2BE mask was removed: V12 was never read afterwards.)
   444
   445	VZERO X1H
   446	VZERO X1L
   447	VZERO Y1H
   448	VZERO Y1L
   449
   450loop_select:
   451	VL 0(P1ptr), X2H
   452	VL 16(P1ptr), X2L
   453	VL 32(P1ptr), Y2H
   454	VL 48(P1ptr), Y2L
   455
   456	VCEQG SEL2, IDX, SEL1 // SEL1 = all ones only on the iteration whose position equals idx
   457
   458	VSEL X2L, X1L, SEL1, X1L
   459	VSEL X2H, X1H, SEL1, X1H
   460	VSEL Y2L, Y1L, SEL1, Y1L
   461	VSEL Y2H, Y1H, SEL1, Y1H
   462
   463	VAB  SEL2, ONE, SEL2
   464	ADDW $1, COUNT
   465	ADD  $64, P1ptr
   466	CMPW COUNT, $33 // len(p256AffineTable) + 1
   467	BLT  loop_select
   468	VST  X1H, 0(P3ptr)
   469	VST  X1L, 16(P3ptr)
   470	VST  Y1H, 32(P3ptr)
   471	VST  Y1L, 48(P3ptr)
   472
   473	RET
   474
   475#undef P3ptr
   476#undef P1ptr
   477#undef COUNT
   478#undef X1L
   479#undef X1H
   480#undef Y1L
   481#undef Y1H
   482#undef Z1L
   483#undef Z1H
   484#undef X2L
   485#undef X2H
   486#undef Y2L
   487#undef Y2H
   488#undef Z2L
   489#undef Z2H
   490#undef ONE
   491#undef IDX
   492#undef SEL1
   493#undef SEL2
   494#undef CPOOL
   495
   496// ---------------------------------------
   497// p256MulInternal: register map. Inputs X1||X0 and Y1||Y0, modulus P1||P0, result T1||T0.
   498// V0-V3,V30,V31 - Not Modified
   499// V4-V15 - Volatile
   500
   501#define CPOOL   R4
   502
   503// Parameters
   504#define X0    V0 // Not modified
   505#define X1    V1 // Not modified
   506#define Y0    V2 // Not modified
   507#define Y1    V3 // Not modified
   508#define T0    V4
   509#define T1    V5
   510#define P0    V30 // Not modified
   511#define P1    V31 // Not modified
   512
   513// Temporaries
   514#define YDIG  V6 // Overloaded with CAR1, ZER, SEL4
   515#define ADD1H V7 // Overloaded with ADD3H
   516#define ADD2H V8 // Overloaded with ADD4H
   517#define ADD3  V9 // Overloaded with SEL2,SEL5
   518#define ADD4  V10 // Overloaded with SEL3,SEL6
   519#define RED1  V11 // Overloaded with CAR2
   520#define RED2  V12
   521#define RED3  V13 // Overloaded with SEL1
   522#define T2    V14
   523// Overloaded temporaries
   524#define ADD1  V4 // Overloaded with T0
   525#define ADD2  V5 // Overloaded with T1
   526#define ADD3H V7 // Overloaded with ADD1H
   527#define ADD4H V8 // Overloaded with ADD2H
   528#define ZER   V6 // Overloaded with YDIG, CAR1, SEL4
   529#define CAR1  V6 // Overloaded with YDIG, ZER, SEL4
   530#define CAR2  V11 // Overloaded with RED1
   531// Constant Selects
   532#define SEL1  V13 // Overloaded with RED3
   533#define SEL2  V9 // Overloaded with ADD3,SEL5
   534#define SEL3  V10 // Overloaded with ADD4,SEL6
   535#define SEL4  V6 // Overloaded with YDIG,CAR1,ZER
   536#define SEL5  V9 // Overloaded with ADD3,SEL2
   537#define SEL6  V10 // Overloaded with ADD4,SEL3
   538
   539/* *
   540 * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   541 * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   542 * With you, SIMD be...
   543 *
   544 *                                           +--------+--------+
   545 *                                  +--------|  RED2  |  RED1  |
   546 *                                  |        +--------+--------+
   547 *                                  |       ---+--------+--------+
   548 *                                  |  +---- T2|   T1   |   T0   |--+
   549 *                                  |  |    ---+--------+--------+  |
   550 *                                  |  |                            |
   551 *                                  |  |    ======================= |
   552 *                                  |  |                            |
   553 *                                  |  |       +--------+--------+<-+
   554 *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   555 *                                  |  |       +--------+--------+  |     |
   556 *                                  |  |     +--------+--------+<---+     |
   557 *                                  |  |     | ADD2H  | ADD1H  |--+       |
   558 *                                  |  |     +--------+--------+  |       |
   559 *                                  |  |     +--------+--------+<-+       |
   560 *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   561 *                                  |  |     +--------+--------+  | |     |
   562 *                                  |  |   +--------+--------+<---+ |     |
   563 *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   564 *                                  |  |   +--------+--------+      | |   V
   565 *                                  |  | ------------------------   | | +--------+
   566 *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   567 *                                  |  |                            | | +--------+
   568 *                                  |  +---->+--------+--------+    | |   |
   569 *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   570 *                                  |        +--------+--------+    | |   |
   571 *                                  +---->---+--------+--------+    | |   |
   572 *                                         T2|   T1   |   T0   |----+ |   |
   573 *                                        ---+--------+--------+    | |   |
   574 *                                        ---+--------+--------+<---+ |   |
   575 *                                    +--- T2|   T1   |   T0   |----------+
   576 *                                    |   ---+--------+--------+      |   |
   577 *                                    |  +--------+--------+<-------------+
   578 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   579 *                                    |  +--------+--------+     |    |   |
   580 *                                    |  +--------+<----------------------+
   581 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   582 *                                    |  +--------+              |    |
   583 *                                    +--->+--------+--------+   |    |
   584 *                                         |   T1   |   T0   |--------+
   585 *                                         +--------+--------+   |    |
   586 *                                   --------------------------- |    |
   587 *                                                               |    |
   588 *                                       +--------+--------+<----+    |
   589 *                                       |  RED2  |  RED1  |          |
   590 *                                       +--------+--------+          |
   591 *                                      ---+--------+--------+<-------+
   592 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   593 *                                      ---+--------+--------+
   594 *
   595 *                                                                *Mi obra de arte de siglo XXI @vpaprots
   596 *
   597 *
   598 * First group is special, doesn't get the two inputs:
   599 *                                             +--------+--------+<-+
   600 *                                     +-------|  ADD2  |  ADD1  |--|-----+
   601 *                                     |       +--------+--------+  |     |
   602 *                                     |     +--------+--------+<---+     |
   603 *                                     |     | ADD2H  | ADD1H  |--+       |
   604 *                                     |     +--------+--------+  |       |
   605 *                                     |     +--------+--------+<-+       |
   606 *                                     |     |  ADD4  |  ADD3  |--|-+     |
   607 *                                     |     +--------+--------+  | |     |
   608 *                                     |   +--------+--------+<---+ |     |
   609 *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   610 *                                     |   +--------+--------+      | |   V
   611 *                                     | ------------------------   | | +--------+
   612 *                                     |                            | | |  RED3  |  [d0 0 0 d0]
   613 *                                     |                            | | +--------+
   614 *                                     +---->+--------+--------+    | |   |
   615 *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
   616 *                                           +--------+--------+    | |   |
   617 *                                        ---+--------+--------+<---+ |   |
   618 *                                    +--- T2|   T1   |   T0   |----------+
   619 *                                    |   ---+--------+--------+      |   |
   620 *                                    |  +--------+--------+<-------------+
   621 *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   622 *                                    |  +--------+--------+     |    |   |
   623 *                                    |  +--------+<----------------------+
   624 *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   625 *                                    |  +--------+              |    |
   626 *                                    +--->+--------+--------+   |    |
   627 *                                         |   T1   |   T0   |--------+
   628 *                                         +--------+--------+   |    |
   629 *                                   --------------------------- |    |
   630 *                                                               |    |
   631 *                                       +--------+--------+<----+    |
   632 *                                       |  RED2  |  RED1  |          |
   633 *                                       +--------+--------+          |
   634 *                                      ---+--------+--------+<-------+
   635 *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   636 *                                      ---+--------+--------+
   637 *
   638 * Last 'group' needs to RED2||RED1 shifted less
   639 */
   640TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
   641	// In:  X1||X0 and Y1||Y0 operands, P1||P0 = P256, CPOOL = base of the p256mul<> pool.
   642	// Out: T1||T0. V4-V14 are used as scratch.
   643	// SEL1-SEL4 share registers with temporaries (V13, V9, V10, V6), so each mask is
   644	// loaded from CPOOL right before the VPERM that consumes it, not here at entry.
   645
   646	// ---------------------------------------------------
   647	// Group 1: Y0 words 3 and 2 (VREPF broadcasts the word). No running sum yet, so the first digit uses plain multiplies.
   648	VREPF $3, Y0, YDIG
   649	VMLHF X0, YDIG, ADD1H
   650	VMLHF X1, YDIG, ADD2H
   651	VMLF  X0, YDIG, ADD1
   652	VMLF  X1, YDIG, ADD2
   653
   654	VREPF  $2, Y0, YDIG
   655	VMALF  X0, YDIG, ADD1H, ADD3
   656	VMALF  X1, YDIG, ADD2H, ADD4
   657	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   658	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   659
   660	VZERO ZER
   661	VL    32(CPOOL), SEL1
   662	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   663
   664	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   665	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
   666
   667	VACCQ  T0, ADD3, CAR1
   668	VAQ    T0, ADD3, T0       // ADD3 Free
   669	VACCCQ T1, ADD4, CAR1, T2
   670	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
   671
   672	VL    48(CPOOL), SEL2
   673	VL    64(CPOOL), SEL3
   674	VL    80(CPOOL), SEL4
   675	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   676	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   677	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   678	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   679
   680	VSLDB $12, T1, T0, T0
   681	VSLDB $12, T2, T1, T1
   682
   683	VACCQ  T0, ADD3H, CAR1
   684	VAQ    T0, ADD3H, T0
   685	VACCCQ T1, ADD4H, CAR1, T2
   686	VACQ   T1, ADD4H, CAR1, T1
   687
   688	// ---------------------------------------------------
   689	// Group 2: Y0 words 1 and 0; RED2||RED1 from group 1 is added in here.
   690	VREPF  $1, Y0, YDIG
   691	VMALHF X0, YDIG, T0, ADD1H
   692	VMALHF X1, YDIG, T1, ADD2H
   693	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   694	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   695
   696	VREPF  $0, Y0, YDIG
   697	VMALF  X0, YDIG, ADD1H, ADD3
   698	VMALF  X1, YDIG, ADD2H, ADD4
   699	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   700	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   701
   702	VZERO ZER
   703	VL    32(CPOOL), SEL1
   704	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   705
   706	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
   707	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
   708
   709	VACCQ  T0, RED1, CAR1
   710	VAQ    T0, RED1, T0
   711	VACCCQ T1, RED2, CAR1, T2
   712	VACQ   T1, RED2, CAR1, T1
   713
   714	VACCQ  T0, ADD3, CAR1
   715	VAQ    T0, ADD3, T0
   716	VACCCQ T1, ADD4, CAR1, CAR2
   717	VACQ   T1, ADD4, CAR1, T1
   718	VAQ    T2, CAR2, T2
   719
   720	VL    48(CPOOL), SEL2
   721	VL    64(CPOOL), SEL3
   722	VL    80(CPOOL), SEL4
   723	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   724	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   725	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   726	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   727
   728	VSLDB $12, T1, T0, T0
   729	VSLDB $12, T2, T1, T1
   730
   731	VACCQ  T0, ADD3H, CAR1
   732	VAQ    T0, ADD3H, T0
   733	VACCCQ T1, ADD4H, CAR1, T2
   734	VACQ   T1, ADD4H, CAR1, T1
   735
   736	// ---------------------------------------------------
   737	// Group 3: Y1 words 3 and 2; same shape as group 2.
   738	VREPF  $3, Y1, YDIG
   739	VMALHF X0, YDIG, T0, ADD1H
   740	VMALHF X1, YDIG, T1, ADD2H
   741	VMALF  X0, YDIG, T0, ADD1
   742	VMALF  X1, YDIG, T1, ADD2
   743
   744	VREPF  $2, Y1, YDIG
   745	VMALF  X0, YDIG, ADD1H, ADD3
   746	VMALF  X1, YDIG, ADD2H, ADD4
   747	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   748	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   749
   750	VZERO ZER
   751	VL    32(CPOOL), SEL1
   752	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   753
   754	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   755	VSLDB $12, T2, ADD2, T1   // ADD2 Free
   756
   757	VACCQ  T0, RED1, CAR1
   758	VAQ    T0, RED1, T0
   759	VACCCQ T1, RED2, CAR1, T2
   760	VACQ   T1, RED2, CAR1, T1
   761
   762	VACCQ  T0, ADD3, CAR1
   763	VAQ    T0, ADD3, T0
   764	VACCCQ T1, ADD4, CAR1, CAR2
   765	VACQ   T1, ADD4, CAR1, T1
   766	VAQ    T2, CAR2, T2
   767
   768	VL    48(CPOOL), SEL2
   769	VL    64(CPOOL), SEL3
   770	VL    80(CPOOL), SEL4
   771	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   772	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   773	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   774	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   775
   776	VSLDB $12, T1, T0, T0
   777	VSLDB $12, T2, T1, T1
   778
   779	VACCQ  T0, ADD3H, CAR1
   780	VAQ    T0, ADD3H, T0
   781	VACCCQ T1, ADD4H, CAR1, T2
   782	VACQ   T1, ADD4H, CAR1, T1
   783
   784	// ---------------------------------------------------
   785	// Group 4: Y1 words 1 and 0. The last group builds RED2||RED1 with SEL5/SEL6 and adds it before returning.
   786	VREPF  $1, Y1, YDIG
   787	VMALHF X0, YDIG, T0, ADD1H
   788	VMALHF X1, YDIG, T1, ADD2H
   789	VMALF  X0, YDIG, T0, ADD1
   790	VMALF  X1, YDIG, T1, ADD2
   791
   792	VREPF  $0, Y1, YDIG
   793	VMALF  X0, YDIG, ADD1H, ADD3
   794	VMALF  X1, YDIG, ADD2H, ADD4
   795	VMALHF X0, YDIG, ADD1H, ADD3H
   796	VMALHF X1, YDIG, ADD2H, ADD4H
   797
   798	VZERO ZER
   799	VL    32(CPOOL), SEL1
   800	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   801
   802	VSLDB $12, ADD2, ADD1, T0
   803	VSLDB $12, T2, ADD2, T1
   804
   805	VACCQ  T0, RED1, CAR1
   806	VAQ    T0, RED1, T0
   807	VACCCQ T1, RED2, CAR1, T2
   808	VACQ   T1, RED2, CAR1, T1
   809
   810	VACCQ  T0, ADD3, CAR1
   811	VAQ    T0, ADD3, T0
   812	VACCCQ T1, ADD4, CAR1, CAR2
   813	VACQ   T1, ADD4, CAR1, T1
   814	VAQ    T2, CAR2, T2
   815
   816	VL    96(CPOOL), SEL5
   817	VL    112(CPOOL), SEL6
   818	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   819	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   820	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
   821
   822	VSLDB $12, T1, T0, T0
   823	VSLDB $12, T2, T1, T1
   824
   825	VACCQ  T0, ADD3H, CAR1
   826	VAQ    T0, ADD3H, T0
   827	VACCCQ T1, ADD4H, CAR1, T2
   828	VACQ   T1, ADD4H, CAR1, T1
   829
   830	VACCQ  T0, RED1, CAR1
   831	VAQ    T0, RED1, T0
   832	VACCCQ T1, RED2, CAR1, CAR2
   833	VACQ   T1, RED2, CAR1, T1
   834	VAQ    T2, CAR2, T2
   835
   836	// ---------------------------------------------------
   837	// Final correction: ADD2H||ADD1H = T1||T0 - P1||P0; T2 absorbs the borrow and becomes the select mask.
   838	VZERO   RED3
   839	VSCBIQ  P0, T0, CAR1
   840	VSQ     P0, T0, ADD1H
   841	VSBCBIQ T1, P1, CAR1, CAR2
   842	VSBIQ   T1, P1, CAR1, ADD2H
   843	VSBIQ   T2, RED3, CAR2, T2
   844
   845	// what output to use, ADD2H||ADD1H or T1||T0?
   846	VSEL T0, ADD1H, T2, T0
   847	VSEL T1, ADD2H, T2, T1
   848	RET
   849
   850#undef CPOOL
   851
   852#undef X0
   853#undef X1
   854#undef Y0
   855#undef Y1
   856#undef T0
   857#undef T1
   858#undef P0
   859#undef P1
   860
   861#undef SEL1
   862#undef SEL2
   863#undef SEL3
   864#undef SEL4
   865#undef SEL5
   866#undef SEL6
   867
   868#undef YDIG
   869#undef ADD1H
   870#undef ADD2H
   871#undef ADD3
   872#undef ADD4
   873#undef RED1
   874#undef RED2
   875#undef RED3
   876#undef T2
   877#undef ADD1
   878#undef ADD2
   879#undef ADD3H
   880#undef ADD4H
   881#undef ZER
   882#undef CAR1
   883#undef CAR2
   884
   885// ---------------------------------------
   886// p256SqrInternal copies X1||X0 into Y1||Y0 and branches into p256MulInternal, so the product is the square.
   887// Parameters (register numbers fixed by p256MulInternal)
   888#define IN0   V0
   889#define IN1   V1
   890#define DUP0  V2
   891#define DUP1  V3
   892
   893TEXT p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
   894	VLR IN1, DUP1
   895	VLR IN0, DUP0
   896	BR  p256MulInternal<>(SB) // plain branch, no link: p256MulInternal's RET returns to our caller
   897
   898#undef IN0
   899#undef IN1
   900#undef DUP0
   901#undef DUP1
   902// Field helper macros. p256SubInternal: T = X - Y, with P added back when the subtraction borrowed. p256AddInternal: T = X + Y, with P subtracted unless that subtraction underflows. Both expect PH||PL = P256 and use ZER, CAR1, SEL1, TT0, TT1 (Add also T2, CAR2) as scratch; none of these names is defined here, so the expanding function must #define them.
   903#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
   904	VZERO   ZER                \
   905	VSCBIQ  Y0, X0, CAR1       \
   906	VSQ     Y0, X0, T0         \
   907	VSBCBIQ X1, Y1, CAR1, SEL1 \
   908	VSBIQ   X1, Y1, CAR1, T1   \
   909	VSQ     SEL1, ZER, SEL1    \
   910	                           \
   911	VACCQ   T0, PL, CAR1       \
   912	VAQ     T0, PL, TT0        \
   913	VACQ    T1, PH, CAR1, TT1  \
   914	                           \
   915	VSEL    T0, TT0, SEL1, T0  \
   916	VSEL    T1, TT1, SEL1, T1  \
   917
   918#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
   919	VACCQ   X0, Y0, CAR1        \
   920	VAQ     X0, Y0, T0          \
   921	VACCCQ  X1, Y1, CAR1, T2    \
   922	VACQ    X1, Y1, CAR1, T1    \
   923	                            \
   924	VZERO   ZER                 \
   925	VSCBIQ  PL, T0, CAR1        \
   926	VSQ     PL, T0, TT0         \
   927	VSBCBIQ T1, PH, CAR1, CAR2  \
   928	VSBIQ   T1, PH, CAR1, TT1   \
   929	VSBIQ   T2, ZER, CAR2, SEL1 \
   930	                            \
   931	VSEL    T0, TT0, SEL1, T0   \
   932	VSEL    T1, TT1, SEL1, T1
   933// p256HalfInternal: T = X / 2. SEL1 is derived from the low bit of X0 and chooses between X and X + P (T2 holds the carry); T2||T1||T0 is then shifted right by one bit. Uses ZER, SEL1, CAR1, T2, TT0, TT1 and PH||PL from the expanding function.
   934#define p256HalfInternal(T1, T0, X1, X0) \
   935	VZERO  ZER                \
   936	VSBIQ  ZER, ZER, X0, SEL1 \
   937	                          \
   938	VACCQ  X0, PL, CAR1       \
   939	VAQ    X0, PL, T0         \
   940	VACCCQ X1, PH, CAR1, T2   \
   941	VACQ   X1, PH, CAR1, T1   \
   942	                          \
   943	VSEL   X0, T0, SEL1, T0   \
   944	VSEL   X1, T1, SEL1, T1   \
   945	VSEL   ZER, T2, SEL1, T2  \
   946	                          \
   947	VSLDB  $15, T2, ZER, TT1  \
   948	VSLDB  $15, T1, ZER, TT0  \
   949	VREPIB $1, SEL1           \
   950	VSRL   SEL1, T0, T0       \
   951	VSRL   SEL1, T1, T1       \
   952	VREPIB $7, SEL1           \
   953	VSL    SEL1, TT0, TT0     \
   954	VSL    SEL1, TT1, TT1     \
   955	VO     T0, TT0, T0        \
   956	VO     T1, TT1, T1
   957
   958// ---------------------------------------
   959// func p256Mul(res, in1, in2 *p256Element): res <- p256MulInternal(in1, in2)
   960#define OUTP    R1
   961#define LHSP    R2
   962#define RHSP    R3
   963#define POOL    R4 // must be R4: p256MulInternal reads its masks through R4
   964
   965// Operands and product (register numbers fixed by p256MulInternal)
   966#define LHS0   V0
   967#define LHS1   V1
   968#define RHS0   V2
   969#define RHS1   V3
   970#define PROD0  V4
   971#define PROD1  V5
   972
   973// Prime
   974#define PRIME0 V30
   975#define PRIME1 V31
   976TEXT ·p256Mul(SB), NOSPLIT, $0
   977	// PRIME1||PRIME0 = P256; POOL stays live for p256MulInternal.
   978	MOVD $p256mul<>+0x00(SB), POOL
   979	VL   0(POOL), PRIME1
   980	VL   16(POOL), PRIME0
   981	// Load both operands, swapping the doublewords of every 16-byte half.
   982	MOVD in1+8(FP), LHSP
   983	VL   (0*16)(LHSP), LHS0
   984	VL   (1*16)(LHSP), LHS1
   985	VPDI $0x4, LHS0, LHS0, LHS0
   986	VPDI $0x4, LHS1, LHS1, LHS1
   987	MOVD in2+16(FP), RHSP
   988	VL   (0*16)(RHSP), RHS0
   989	VL   (1*16)(RHSP), RHS1
   990	VPDI $0x4, RHS0, RHS0, RHS0
   991	VPDI $0x4, RHS1, RHS1, RHS1
   992
   993	CALL p256MulInternal<>(SB)
   994
   995	MOVD res+0(FP), OUTP
   996	VPDI $0x4, PROD0, PROD0, PROD0
   997	VST  PROD0, (0*16)(OUTP)
   998	VPDI $0x4, PROD1, PROD1, PROD1
   999	VST  PROD1, (1*16)(OUTP)
  1000	RET
  1001
  1002#undef OUTP
  1003#undef LHSP
  1004#undef RHSP
  1005#undef POOL
  1006
  1007#undef LHS0
  1008#undef LHS1
  1009#undef RHS0
  1010#undef RHS1
  1011#undef PROD0
  1012#undef PROD1
  1013#undef PRIME0
  1014#undef PRIME1
  1015
  1016// ---------------------------------------
  1017//  func p256Sqr(res, in *p256Element, n int)
  //
  // Go-callable wrapper that calls p256SqrInternal repeatedly, feeding each
  // result (T1:T0) back in as the next input (X1:X0), and finally stores the
  // last result to res.
  //
  // The loop tests its counter at the bottom, so p256SqrInternal is always
  // called at least once, including when n <= 1. The counter is updated and
  // compared with 32-bit instructions (ADDW/CMPW), so only the low 32 bits of
  // n take part in the comparison.
  //
  // Each 16-byte half goes through VPDI $0x4 with identical operands (which
  // exchanges the two doublewords of the vector) after the load and before the
  // store.
  //
  // The aliases below name the registers through which this wrapper talks to
  // p256SqrInternal, which is defined elsewhere in the file. The y_ptr alias
  // that used to be declared here was never referenced and has been dropped
  // together with its #undef.
  1018#define res_ptr R1
  1019#define x_ptr   R2
  1021#define CPOOL   R4
  1022#define COUNT   R5
  1023#define N       R6
  1024
  1025// Parameters
  1026#define X0    V0
  1027#define X1    V1
  1028#define T0    V4
  1029#define T1    V5
  1030
  1031// Constants
  1032#define P0    V30
  1033#define P1    V31
  1034TEXT ·p256Sqr(SB), NOSPLIT, $0
  1035	MOVD res+0(FP), res_ptr
  1036	MOVD in+8(FP), x_ptr
  1037
  1038	VL   (0*16)(x_ptr), X0        // bytes 0..15 of in
  1039	VPDI $0x4, X0, X0, X0         // exchange the two doublewords
  1040	VL   (1*16)(x_ptr), X1        // bytes 16..31 of in
  1041	VPDI $0x4, X1, X1, X1
  1042
  1043	MOVD $p256mul<>+0x00(SB), CPOOL
  1044	MOVD $0, COUNT                // squarings done so far
  1045	MOVD n+16(FP), N
  1046	VL   16(CPOOL), P0            // p256mul<>+0x10: low 128 bits of P
  1047	VL   0(CPOOL), P1             // p256mul<>+0x00: high 128 bits of P
  1048
  1049loop:
  1050	CALL p256SqrInternal<>(SB)
  1051	VLR  T0, X0                   // result becomes the next input
  1052	VLR  T1, X1
  1053	ADDW $1, COUNT
  1054	CMPW COUNT, N
  1055	BLT  loop                     // repeat while COUNT < N
  1056
  1057	VPDI $0x4, T0, T0, T0         // exchange doublewords back before storing
  1058	VST  T0, (0*16)(res_ptr)
  1059	VPDI $0x4, T1, T1, T1
  1060	VST  T1, (1*16)(res_ptr)
  1061	RET
  1062
  1063#undef res_ptr
  1064#undef x_ptr
  1066#undef CPOOL
  1067#undef COUNT
  1068#undef N
  1069
  1070#undef X0
  1071#undef X1
  1072#undef T0
  1073#undef T1
  1074#undef P0
  1075#undef P1
  1076
  1077// Point add with P2 being affine point
  1078// If sign == 1 -> P2 = -P2
  1079// If sel == 0 -> P3 = P1
  1080// if zero == 0 -> P3 = P2
  1081// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  //
  // Memory layout as accessed by the function body: in1 and res hold X at byte
  // offsets 0..31, Y at 32..63 and Z at 64..95; in2 holds X at 0..31 and Y at
  // 32..63. Every coordinate is handled as two 16-byte halves (low half at the
  // lower offset).
  //
  // The aliases below are shared by the function body and by the
  // p256SubInternal / p256AddInternal macros it expands.
  1082#define P3ptr   R1
  1083#define P1ptr   R2
  1084#define P2ptr   R3
  1085#define CPOOL   R4
  1086
  1087// Temporaries in REGs
  1088#define Y2L    V15
  1089#define Y2H    V16
  1090#define T1L    V17
  1091#define T1H    V18
  1092#define T2L    V19
  1093#define T2H    V20
  1094#define T3L    V21
  1095#define T3H    V22
  1096#define T4L    V23
  1097#define T4H    V24
  1098
  1099// Temps for Sub and Add
  1100#define TT0  V11
  1101#define TT1  V12
  1102#define T2   V13
  1103
  1104// p256MulAsm Parameters
  // X1:X0 and Y1:Y0 are set up before each CALL to p256MulInternal /
  // p256SqrInternal and T1:T0 is read back afterwards.
  1105#define X0    V0
  1106#define X1    V1
  1107#define Y0    V2
  1108#define Y1    V3
  1109#define T0    V4
  1110#define T1    V5
  1111
  // P256 prime: PL = low 128 bits, PH = high 128 bits (loaded from p256mul<>).
  1112#define PL    V30
  1113#define PH    V31
  1114
  1115// Names for zero/sel selects
  // These overlap the multiply registers on purpose: X1*/X2* share V0/V1 with
  // X0/X1, Y1* share V2/V3 with Y0/Y1 and Z1*/Z2* share V4/V5 with T0/T1.
  // X1*, X2*, Z1* and Z2* are only referenced in the final select phase, after
  // the last multiplication; X1* and Y1* are also used as subtraction operands.
  1116#define X1L    V0
  1117#define X1H    V1
  1118#define Y1L    V2 // p256MulAsmParmY
  1119#define Y1H    V3 // p256MulAsmParmY
  1120#define Z1L    V4
  1121#define Z1H    V5
  1122#define X2L    V0
  1123#define X2H    V1
  1124#define Z2L    V4
  1125#define Z2H    V5
  1126#define X3L    V17 // T1L
  1127#define X3H    V18 // T1H
  1128#define Y3L    V21 // T3L
  1129#define Y3H    V22 // T3H
  1130#define Z3L    V28
  1131#define Z3H    V29
  1132
  // Scratch registers required by p256SubInternal / p256AddInternal.
  1133#define ZER   V6
  1134#define SEL1  V7
  1135#define CAR1  V8
  1136#define CAR2  V9
  1137/* *
  1138 * Three operand formula:
  1139 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1140 * T1 = Z1²
  1141 * T2 = T1*Z1
  1142 * T1 = T1*X2
  1143 * T2 = T2*Y2
  1144 * T1 = T1-X1
  1145 * T2 = T2-Y1
  1146 * Z3 = Z1*T1
  1147 * T3 = T1²
  1148 * T4 = T3*T1
  1149 * T3 = T3*X1
  1150 * T1 = 2*T3
  1151 * X3 = T2²
  1152 * X3 = X3-T1
  1153 * X3 = X3-T4
  1154 * T3 = T3-X3
  1155 * T3 = T3*T2
  1156 * T4 = T4*Y1
  1157 * Y3 = T3-T4
  1158
  1159 * Three operand formulas, but with MulInternal X,Y used to store temps
  1160X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1161X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1162X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1163X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1164SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1165SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1166X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1167X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1168X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1169X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1170ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1171X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1172SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1173SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1174SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1175X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1176X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1177SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1178
  1179	*/
  1180TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  	// Overview: (1) conditionally negate Y2 according to sign, (2) run the
  	// three-operand addition schedule step by step (each step is labelled
  	// with the matching line of the schedule), (3) override the result with
  	// P1 when sel == 0 and with P2 when zero == 0, (4) store X3, Y3, Z3.
  	// Every 16-byte half goes through VPDI $0x4 with identical operands
  	// (exchanging its two doublewords) after a load and before a store.
  1181	MOVD res+0(FP), P3ptr
  1182	MOVD in1+8(FP), P1ptr
  1183	MOVD in2+16(FP), P2ptr
  1184
  1185	MOVD $p256mul<>+0x00(SB), CPOOL
  1186	VL   16(CPOOL), PL       // p256mul<>+0x10: low 128 bits of P
  1187	VL   0(CPOOL), PH        // p256mul<>+0x00: high 128 bits of P
  1188
  1189	//	if (sign == 1) {
  1190	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1191	//	}
  1192
  1193	VL   48(P2ptr), Y2H
  1194	VPDI $0x4, Y2H, Y2H, Y2H
  1195	VL   32(P2ptr), Y2L
  1196	VPDI $0x4, Y2L, Y2L, Y2L
  1197
  1198	VLREPG sign+24(FP), SEL1   // sign replicated into both doublewords
  1199	VZERO  ZER
  1200	VCEQG  SEL1, ZER, SEL1     // SEL1 = mask for "sign == 0"
  1201
  1202	VSCBIQ Y2L, PL, CAR1       // T1H:T1L = P - Y2
  1203	VSQ    Y2L, PL, T1L
  1204	VSBIQ  PH, Y2H, CAR1, T1H
  1205
  1206	VSEL Y2L, T1L, SEL1, Y2L   // keep Y2 when sign == 0, otherwise take P-Y2
  1207	VSEL Y2H, T1H, SEL1, Y2H
  1208
  1209/* *
  1210 * Three operand formula:
  1211 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1212 */
  1213	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1214	VL   80(P1ptr), X1       // Z1H
  1215	VPDI $0x4, X1, X1, X1
  1216	VL   64(P1ptr), X0       // Z1L
  1217	VPDI $0x4, X0, X0, X0
  1218	VLR  X0, Y0
  1219	VLR  X1, Y1
  1220	CALL p256SqrInternal<>(SB)
  1221
  1222	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1223	VLR  T0, X0
  1224	VLR  T1, X1
  1225	CALL p256MulInternal<>(SB)
  1226	VLR  T0, T2L
  1227	VLR  T1, T2H
  1228
  1229	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1230	VL   16(P2ptr), Y1       // X2H
  1231	VPDI $0x4, Y1, Y1, Y1
  1232	VL   0(P2ptr), Y0        // X2L
  1233	VPDI $0x4, Y0, Y0, Y0
  1234	CALL p256MulInternal<>(SB)
  1235	VLR  T0, T1L
  1236	VLR  T1, T1H
  1237
  1238	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1239	VLR  T2L, X0
  1240	VLR  T2H, X1
  1241	VLR  Y2L, Y0             // Y2 as prepared by the sign step above
  1242	VLR  Y2H, Y1
  1243	CALL p256MulInternal<>(SB)
  1244
  1245	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1246	VL   48(P1ptr), Y1H
  1247	VPDI $0x4, Y1H, Y1H, Y1H
  1248	VL   32(P1ptr), Y1L
  1249	VPDI $0x4, Y1L, Y1L, Y1L
  1250	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1251
  1252	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1253	VL   16(P1ptr), X1H
  1254	VPDI $0x4, X1H, X1H, X1H
  1255	VL   0(P1ptr), X1L
  1256	VPDI $0x4, X1L, X1L, X1L
  1257	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1258
  1259	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1260	VL   80(P1ptr), X1       // Z1H
  1261	VPDI $0x4, X1, X1, X1
  1262	VL   64(P1ptr), X0       // Z1L
  1263	VPDI $0x4, X0, X0, X0
  1264	CALL p256MulInternal<>(SB)
  1265
  	// Z3 is kept in registers rather than stored here; it is written out
  	// with the other coordinates after the sel/zero selection below.
  1266	// VST T1, 64(P3ptr)
  1267	// VST T0, 80(P3ptr)
  1268	VLR T0, Z3L
  1269	VLR T1, Z3H
  1270
  1271	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1272	VLR  Y0, X0
  1273	VLR  Y1, X1
  1274	CALL p256SqrInternal<>(SB)
  1275	VLR  T0, X0
  1276	VLR  T1, X1
  1277
  1278	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1279	CALL p256MulInternal<>(SB)
  1280	VLR  T0, T4L
  1281	VLR  T1, T4H
  1282
  1283	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1284	VL   16(P1ptr), Y1       // X1H
  1285	VPDI $0x4, Y1, Y1, Y1
  1286	VL   0(P1ptr), Y0        // X1L
  1287	VPDI $0x4, Y0, Y0, Y0
  1288	CALL p256MulInternal<>(SB)
  1289	VLR  T0, T3L
  1290	VLR  T1, T3H
  1291
  1292	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1293	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1294
  1295	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1296	VLR  T2L, X0
  1297	VLR  T2H, X1
  1298	VLR  T2L, Y0
  1299	VLR  T2H, Y1
  1300	CALL p256SqrInternal<>(SB)
  1301
  1302	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1303	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1304
  1305	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1306	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1307	VLR T0, X3L
  1308	VLR T1, X3H
  1309
  1310	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1311	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1312
  1313	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1314	CALL p256MulInternal<>(SB)
  1315	VLR  T0, T3L
  1316	VLR  T1, T3H
  1317
  1318	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1319	VLR  T4L, X0
  1320	VLR  T4H, X1
  1321	VL   48(P1ptr), Y1       // Y1H
  1322	VPDI $0x4, Y1, Y1, Y1
  1323	VL   32(P1ptr), Y0       // Y1L
  1324	VPDI $0x4, Y0, Y0, Y0
  1325	CALL p256MulInternal<>(SB)
  1326
  1327	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1328	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1329
  1330	//	if (sel == 0) {
  1331	//		copy(P3.x[:], X1)
  1332	//		copy(P3.y[:], Y1)
  1333	//		copy(P3.z[:], Z1)
  1334	//	}
  1335
  1336	VL   16(P1ptr), X1H
  1337	VPDI $0x4, X1H, X1H, X1H
  1338	VL   0(P1ptr), X1L
  1339	VPDI $0x4, X1L, X1L, X1L
  1340
  1341	// Y1 already loaded, left over from addition
  	// (Y1L/Y1H are V2/V3, i.e. Y0/Y1, which were loaded with in1's Y for the
  	// last multiplication; this relies on p256MulInternal leaving them
  	// intact - NOTE(review): confirm against that routine.)
  1342	VL   80(P1ptr), Z1H
  1343	VPDI $0x4, Z1H, Z1H, Z1H
  1344	VL   64(P1ptr), Z1L
  1345	VPDI $0x4, Z1L, Z1L, Z1L
  1346
  1347	VLREPG sel+32(FP), SEL1
  1348	VZERO  ZER
  1349	VCEQG  SEL1, ZER, SEL1     // SEL1 = mask for "sel == 0"
  1350
  1351	VSEL X1L, X3L, SEL1, X3L   // P3 = P1 when sel == 0, otherwise unchanged
  1352	VSEL X1H, X3H, SEL1, X3H
  1353	VSEL Y1L, Y3L, SEL1, Y3L
  1354	VSEL Y1H, Y3H, SEL1, Y3H
  1355	VSEL Z1L, Z3L, SEL1, Z3L
  1356	VSEL Z1H, Z3H, SEL1, Z3H
  1357
  1358	//	if (zero == 0) {
  1359	//		copy(P3.x[:], X2)
  1360	//		copy(P3.y[:], Y2)
  1361	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  1362	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  1363	//	}
  1364	VL   16(P2ptr), X2H
  1365	VPDI $0x4, X2H, X2H, X2H
  1366	VL   0(P2ptr), X2L
  1367	VPDI $0x4, X2L, X2L, X2L
  1368
  1369	// Y2 already loaded
  	// (Y2L/Y2H still hold the sign-adjusted Y2 from the top of the function.)
  	// Z2 comes from the constant pool; presumably these two entries are the
  	// constant quoted in the comment above - the pool contents at offsets
  	// 128 and 144 are not visible from here, TODO confirm. Unlike loaded
  	// coordinates, these two vectors are not passed through VPDI.
  1370	VL 128(CPOOL), Z2H
  1371	VL 144(CPOOL), Z2L
  1372
  1373	VLREPG zero+40(FP), SEL1
  1374	VZERO  ZER
  1375	VCEQG  SEL1, ZER, SEL1     // SEL1 = mask for "zero == 0"
  1376
  1377	VSEL X2L, X3L, SEL1, X3L   // P3 = P2 when zero == 0, otherwise unchanged
  1378	VSEL X2H, X3H, SEL1, X3H
  1379	VSEL Y2L, Y3L, SEL1, Y3L
  1380	VSEL Y2H, Y3H, SEL1, Y3H
  1381	VSEL Z2L, Z3L, SEL1, Z3L
  1382	VSEL Z2H, Z3H, SEL1, Z3H
  1383
  1384	// All done, store out the result!!!
  1385	VPDI $0x4, X3H, X3H, X3H
  1386	VST  X3H, 16(P3ptr)
  1387	VPDI $0x4, X3L, X3L, X3L
  1388	VST  X3L, 0(P3ptr)
  1389	VPDI $0x4, Y3H, Y3H, Y3H
  1390	VST  Y3H, 48(P3ptr)
  1391	VPDI $0x4, Y3L, Y3L, Y3L
  1392	VST  Y3L, 32(P3ptr)
  1393	VPDI $0x4, Z3H, Z3H, Z3H
  1394	VST  Z3H, 80(P3ptr)
  1395	VPDI $0x4, Z3L, Z3L, Z3L
  1396	VST  Z3L, 64(P3ptr)
  1397
  1398	RET
  1399
  // Drop every alias defined for p256PointAddAffineAsm so that the following
  // functions can bind the same names to different registers.
  1400#undef P3ptr
  1401#undef P1ptr
  1402#undef P2ptr
  1403#undef CPOOL
  1404
  1405#undef Y2L
  1406#undef Y2H
  1407#undef T1L
  1408#undef T1H
  1409#undef T2L
  1410#undef T2H
  1411#undef T3L
  1412#undef T3H
  1413#undef T4L
  1414#undef T4H
  1415
  1416#undef TT0
  1417#undef TT1
  1418#undef T2
  1419
  1420#undef X0
  1421#undef X1
  1422#undef Y0
  1423#undef Y1
  1424#undef T0
  1425#undef T1
  1426
  1427#undef PL
  1428#undef PH
  1429
  1430#undef X1L
  1431#undef X1H
  1432#undef Y1L
  1433#undef Y1H
  1434#undef Z1L
  1435#undef Z1H
  1436#undef X2L
  1437#undef X2H
  1438#undef Z2L
  1439#undef Z2H
  1440#undef X3L
  1441#undef X3H
  1442#undef Y3L
  1443#undef Y3H
  1444#undef Z3L
  1445#undef Z3H
  1446
  1447#undef ZER
  1448#undef SEL1
  1449#undef CAR1
  1450#undef CAR2
  1451
  1452// func p256PointDoubleAsm(res, in *P256Point)
  //
  // Memory layout as accessed by the function body: in and res hold X at byte
  // offsets 0..31, Y at 32..63 and Z at 64..95, each coordinate as two 16-byte
  // halves (low half at the lower offset).
  //
  // The aliases below are shared by the function body and by the
  // p256SubInternal / p256AddInternal / p256HalfInternal macros it expands.
  1453// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1454// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1455// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1456#define P3ptr   R1
  1457#define P1ptr   R2
  1458#define CPOOL   R4
  1459
  1460// Temporaries in REGs
  1461#define X3L    V15
  1462#define X3H    V16
  1463#define Y3L    V17
  1464#define Y3H    V18
  1465#define T1L    V19
  1466#define T1H    V20
  1467#define T2L    V21
  1468#define T2H    V22
  1469#define T3L    V23
  1470#define T3H    V24
  1471
  // Input coordinates. X1* and Y1* are used by the body. Z1L/Z1H are not
  // referenced by it (Z1 is loaded straight into the multiply operand
  // registers), and Z1H shares V11 with TT0 below.
  1472#define X1L    V6
  1473#define X1H    V7
  1474#define Y1L    V8
  1475#define Y1H    V9
  1476#define Z1L    V10
  1477#define Z1H    V11
  1478
  1479// Temps for Sub and Add
  1480#define TT0  V11
  1481#define TT1  V12
  1482#define T2   V13
  1483
  1484// p256MulAsm Parameters
  // X1:X0 and Y1:Y0 are set up before each CALL to p256MulInternal /
  // p256SqrInternal and T1:T0 is read back afterwards.
  1485#define X0    V0
  1486#define X1    V1
  1487#define Y0    V2
  1488#define Y1    V3
  1489#define T0    V4
  1490#define T1    V5
  1491
  // P256 prime: PL = low 128 bits, PH = high 128 bits (loaded from p256mul<>).
  1492#define PL    V30
  1493#define PH    V31
  1494
  // Z3L/Z3H share V23/V24 with T3L/T3H and are not referenced by the body;
  // Z3 is stored to res directly from T1:T0.
  1495#define Z3L    V23
  1496#define Z3H    V24
  1497
  // Scratch registers required by the Sub/Add/Half macros.
  1498#define ZER   V26
  1499#define SEL1  V27
  1500#define CAR1  V28
  1501#define CAR2  V29
  1502/*
  1503 * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1504 * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1505 * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1506 * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1507 * 	B  = 2Y₁
  1508 * 	Z₃ = B×Z₁
  1509 * 	C  = B²
  1510 * 	D  = C×X₁
  1511 * 	X₃ = A²-2D
  1512 * 	Y₃ = (D-X₃)×A-C²/2
  1513 *
  1514 * Three-operand formula:
  1515 *       T1 = Z1²
  1516 *       T2 = X1-T1
  1517 *       T1 = X1+T1
  1518 *       T2 = T2*T1
  1519 *       T2 = 3*T2
  1520 *       Y3 = 2*Y1
  1521 *       Z3 = Y3*Z1
  1522 *       Y3 = Y3²
  1523 *       T3 = Y3*X1
  1524 *       Y3 = Y3²
  1525 *       Y3 = half*Y3
  1526 *       X3 = T2²
  1527 *       T1 = 2*T3
  1528 *       X3 = X3-T1
  1529 *       T1 = T3-X3
  1530 *       T1 = T1*T2
  1531 *       Y3 = T1-Y3
  1532 */
  1533
  1534TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
  	// Walks the three-operand doubling schedule step by step; each step is
  	// labelled with the matching line of the schedule. Z3 and X3 are stored
  	// to res as soon as they are final, Y3 at the very end. Every 16-byte
  	// half goes through VPDI $0x4 with identical source operands (exchanging
  	// its two doublewords) after a load and before a store.
  1535	MOVD res+0(FP), P3ptr
  1536	MOVD in+8(FP), P1ptr
  1537
  1538	MOVD $p256mul<>+0x00(SB), CPOOL
  1539	VL   16(CPOOL), PL        // p256mul<>+0x10: low 128 bits of P
  1540	VL   0(CPOOL), PH         // p256mul<>+0x00: high 128 bits of P
  1541
  1542	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1543	VL   80(P1ptr), X1        // Z1H
  1544	VPDI $0x4, X1, X1, X1
  1545	VL   64(P1ptr), X0        // Z1L
  1546	VPDI $0x4, X0, X0, X0
  1547	VLR  X0, Y0
  1548	VLR  X1, Y1
  1549	CALL p256SqrInternal<>(SB)
  1550
  1551	// SUB(X<X1-T)            // T2 = X1-T1
  1552	VL   16(P1ptr), X1H
  1553	VPDI $0x4, X1H, X1H, X1H
  1554	VL   0(P1ptr), X1L
  1555	VPDI $0x4, X1L, X1L, X1L
  1556	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  1557
  1558	// ADD(Y<X1+T)            // T1 = X1+T1
  1559	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  1560
  1561	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1562	CALL p256MulInternal<>(SB)
  1563
  1564	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1565	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  1566	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  1567
  1568	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1569	VL   48(P1ptr), Y1H
  1570	VPDI $0x4, Y1H, Y1H, Y1H
  1571	VL   32(P1ptr), Y1L
  1572	VPDI $0x4, Y1L, Y1L, Y1L
  1573	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  1574
  1575	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  1576	VL   80(P1ptr), Y1        // Z1H
  1577	VPDI $0x4, Y1, Y1, Y1
  1578	VL   64(P1ptr), Y0        // Z1L
  1579	VPDI $0x4, Y0, Y0, Y0
  1580	CALL p256MulInternal<>(SB)
  1581	VPDI $0x4, T1, T1, TT1    // store Z3 via TT1/TT0 so T1:T0 stay untouched
  1582	VST  TT1, 80(P3ptr)
  1583	VPDI $0x4, T0, T0, TT0
  1584	VST  TT0, 64(P3ptr)
  1585
  1586	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1587	VLR  X0, Y0
  1588	VLR  X1, Y1
  1589	CALL p256SqrInternal<>(SB)
  1590
  1591	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  1592	VLR  T0, X0
  1593	VLR  T1, X1
  1594	VL   16(P1ptr), Y1
  1595	VPDI $0x4, Y1, Y1, Y1
  1596	VL   0(P1ptr), Y0
  1597	VPDI $0x4, Y0, Y0, Y0
  1598	CALL p256MulInternal<>(SB)
  1599	VLR  T0, T3L
  1600	VLR  T1, T3H
  1601
  1602	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1603	VLR  X0, Y0
  1604	VLR  X1, Y1
  1605	CALL p256SqrInternal<>(SB)
  1606
  1607	// HAL(Y3<T)              // Y3 = half*Y3
  1608	p256HalfInternal(Y3H,Y3L, T1,T0)
  1609
  1610	// X=T2; Y=T2; MUL; T-    // X3 = T2²
  1611	VLR  T2L, X0
  1612	VLR  T2H, X1
  1613	VLR  T2L, Y0
  1614	VLR  T2H, Y1
  1615	CALL p256SqrInternal<>(SB)
  1616
  1617	// ADD(T1<T3+T3)          // T1 = 2*T3
  1618	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  1619
  1620	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  1621	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  1622	VPDI $0x4, X3H, X3H, TT1  // store X3 via TT1/TT0; X3H:X3L are reused below
  1623	VST  TT1, 16(P3ptr)
  1624	VPDI $0x4, X3L, X3L, TT0
  1625	VST  TT0, 0(P3ptr)
  1626
  1627	// SUB(X<T3-X3)           // T1 = T3-X3
  1628	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  1629
  1630	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  1631	CALL p256MulInternal<>(SB)
  1632
  1633	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  1634	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  1635
  1636	VPDI $0x4, Y3H, Y3H, Y3H
  1637	VST  Y3H, 48(P3ptr)
  1638	VPDI $0x4, Y3L, Y3L, Y3L
  1639	VST  Y3L, 32(P3ptr)
  1640	RET
  1641
  // Drop every alias defined for p256PointDoubleAsm so that the following
  // function can bind the same names to different registers.
  1642#undef P3ptr
  1643#undef P1ptr
  1644#undef CPOOL
  1645#undef X3L
  1646#undef X3H
  1647#undef Y3L
  1648#undef Y3H
  1649#undef T1L
  1650#undef T1H
  1651#undef T2L
  1652#undef T2H
  1653#undef T3L
  1654#undef T3H
  1655#undef X1L
  1656#undef X1H
  1657#undef Y1L
  1658#undef Y1H
  1659#undef Z1L
  1660#undef Z1H
  1661#undef TT0
  1662#undef TT1
  1663#undef T2
  1664#undef X0
  1665#undef X1
  1666#undef Y0
  1667#undef Y1
  1668#undef T0
  1669#undef T1
  1670#undef PL
  1671#undef PH
  1672#undef Z3L
  1673#undef Z3H
  1674#undef ZER
  1675#undef SEL1
  1676#undef CAR1
  1677#undef CAR2
  1678
  1679// func p256PointAddAsm(res, in1, in2 *P256Point) int
  //
  // Memory layout as accessed by the function body: in1, in2 and res hold X at
  // byte offsets 0..31, Y at 32..63 and Z at 64..95, each coordinate as two
  // 16-byte halves (low half at the lower offset). The int result is written
  // to ret+24(FP).
  //
  // The aliases below are shared by the function body and by the
  // p256SubInternal / p256AddInternal macros it expands.
  1680#define P3ptr  R1
  1681#define P1ptr  R2
  1682#define P2ptr  R3
  1683#define CPOOL  R4
  // ISZERO accumulates the zero-test result; TRUE holds the constant 1 that
  // MOVDEQ copies into it.
  1684#define ISZERO R5
  1685#define TRUE   R6
  1686
  1687// Temporaries in REGs
  1688#define T1L   V16
  1689#define T1H   V17
  1690#define T2L   V18
  1691#define T2H   V19
  1692#define U1L   V20
  1693#define U1H   V21
  1694#define S1L   V22
  1695#define S1H   V23
  1696#define HL    V24
  1697#define HH    V25
  1698#define RL    V26
  1699#define RH    V27
  1700
  1701// Temps for Sub and Add
  1702#define ZER   V6
  1703#define SEL1  V7
  1704#define CAR1  V8
  1705#define CAR2  V9
  1706#define TT0  V11
  1707#define TT1  V12
  1708#define T2   V13
  1709
  1710// p256MulAsm Parameters
  // X1:X0 and Y1:Y0 are set up before each CALL to p256MulInternal /
  // p256SqrInternal and T1:T0 is read back afterwards.
  1711#define X0    V0
  1712#define X1    V1
  1713#define Y0    V2
  1714#define Y1    V3
  1715#define T0    V4
  1716#define T1    V5
  1717
  // P256 prime: PL = low 128 bits, PH = high 128 bits (loaded from p256mul<>).
  1718#define PL    V30
  1719#define PH    V31
  1720/*
  1721 * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  1722 *
  1723 * A = X₁×Z₂²
  1724 * B = Y₁×Z₂³
  1725 * C = X₂×Z₁²-A
  1726 * D = Y₂×Z₁³-B
  1727 * X₃ = D² - 2A×C² - C³
  1728 * Y₃ = D×(A×C² - X₃) - B×C³
  1729 * Z₃ = Z₁×Z₂×C
  1730 *
  1731 * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  1732 * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  1733 *
  1734 * T1 = Z1*Z1
  1735 * T2 = Z2*Z2
  1736 * U1 = X1*T2
  1737 * H  = X2*T1
  1738 * H  = H-U1
  1739 * Z3 = Z1*Z2
  1740 * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1741 *
  1742 * S1 = Z2*T2
  1743 * S1 = Y1*S1
  1744 * R  = Z1*T1
  1745 * R  = Y2*R
  1746 * R  = R-S1
  1747 *
  1748 * T1 = H*H
  1749 * T2 = H*T1
  1750 * U1 = U1*T1
  1751 *
  1752 * X3 = R*R
  1753 * X3 = X3-T2
  1754 * T1 = 2*U1
  1755 * X3 = X3-T1 << store-out X3 result reg
  1756 *
  1757 * T2 = S1*T2
  1758 * Y3 = U1-X3
  1759 * Y3 = R*Y3
  1760 * Y3 = Y3-T2 << store-out Y3 result reg
  1761
  1762 	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  1763	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  1764	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  1765	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  1766	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  1767	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  1768	// SUB(H<H-T)            // H  = H-U1
  1769	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  1770	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  1771	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  1772	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  1773	// SUB(R<T-S1)           // R  = R-S1
  1774	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  1775	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  1776	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  1777	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  1778	// SUB(T<T-T2)           // X3 = X3-T2
  1779	// ADD(X<U1+U1)          // T1 = 2*U1
  1780	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  1781	// SUB(Y<U1-T)           // Y3 = U1-X3
  1782	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  1783	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  1784	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  1785	*/
  1786TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
  	// Walks the three-operand addition schedule step by step; each step is
  	// labelled with the matching line of the schedule. Z3, X3 and Y3 are
  	// stored to res as soon as each is final. Every 16-byte half goes
  	// through VPDI $0x4 with identical source operands (exchanging its two
  	// doublewords) after a load and before a store.
  	//
  	// Return value: 1 when both H and R test as zero (all-zero bits or
  	// bit-for-bit equal to P), otherwise 0. NOTE(review): presumably this
  	// tells the caller the inputs were the same point and the sum must be
  	// computed by doubling instead - confirm against the Go caller.
  1787	MOVD res+0(FP), P3ptr
  1788	MOVD in1+8(FP), P1ptr
  1789	MOVD in2+16(FP), P2ptr
  1790
  1791	MOVD $p256mul<>+0x00(SB), CPOOL
  1792	VL   16(CPOOL), PL       // p256mul<>+0x10: low 128 bits of P
  1793	VL   0(CPOOL), PH        // p256mul<>+0x00: high 128 bits of P
  1794
  1795	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  1796	VL   80(P1ptr), X1       // Z1H
  1797	VPDI $0x4, X1, X1, X1
  1798	VL   64(P1ptr), X0       // Z1L
  1799	VPDI $0x4, X0, X0, X0
  1800	VLR  X0, Y0
  1801	VLR  X1, Y1
  1802	CALL p256SqrInternal<>(SB)
  1803
  1804	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  1805	VLR  T0, Y0
  1806	VLR  T1, Y1
  1807	CALL p256MulInternal<>(SB)
  1808	VLR  T0, RL
  1809	VLR  T1, RH
  1810
  1811	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  1812	VL   16(P2ptr), X1       // X2H
  1813	VPDI $0x4, X1, X1, X1
  1814	VL   0(P2ptr), X0        // X2L
  1815	VPDI $0x4, X0, X0, X0
  1816	CALL p256MulInternal<>(SB)
  1817	VLR  T0, HL
  1818	VLR  T1, HH
  1819
  1820	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  1821	VL   80(P2ptr), X1       // Z2H
  1822	VPDI $0x4, X1, X1, X1
  1823	VL   64(P2ptr), X0       // Z2L
  1824	VPDI $0x4, X0, X0, X0
  1825	VLR  X0, Y0
  1826	VLR  X1, Y1
  1827	CALL p256SqrInternal<>(SB)
  1828
  1829	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  1830	VLR  T0, Y0
  1831	VLR  T1, Y1
  1832	CALL p256MulInternal<>(SB)
  1833	VLR  T0, S1L
  1834	VLR  T1, S1H
  1835
  1836	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  1837	VL   16(P1ptr), X1       // X1H
  1838	VPDI $0x4, X1, X1, X1
  1839	VL   0(P1ptr), X0        // X1L
  1840	VPDI $0x4, X0, X0, X0
  1841	CALL p256MulInternal<>(SB)
  1842	VLR  T0, U1L
  1843	VLR  T1, U1H
  1844
  1845	// SUB(H<H-T)            // H  = H-U1
  1846	p256SubInternal(HH,HL,HH,HL,T1,T0)
  1847
  1848	// if H == 0 or H^P == 0 then ret=1 else ret=0
  1849	// clobbers T1H and T1L
  1850	MOVD   $0, ISZERO
  1851	MOVD   $1, TRUE
  1852	VZERO  ZER
  1853	VO     HL, HH, T1H       // OR of both halves: all-zero only if H == 0
  1854	VCEQGS ZER, T1H, T1H     // compare with zero and set the condition code
  1855	MOVDEQ TRUE, ISZERO
  1856	VX     HL, PL, T1L       // XOR with P: all-zero only if H == P
  1857	VX     HH, PH, T1H
  1858	VO     T1L, T1H, T1H
  1859	VCEQGS ZER, T1H, T1H
  1860	MOVDEQ TRUE, ISZERO
  1861	MOVD   ISZERO, ret+24(FP) // provisional result; ANDed with the R test below
  1862
  1863	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  1864	VL   80(P1ptr), X1       // Z1H
  1865	VPDI $0x4, X1, X1, X1
  1866	VL   64(P1ptr), X0       // Z1L
  1867	VPDI $0x4, X0, X0, X0
  1868	VL   80(P2ptr), Y1       // Z2H
  1869	VPDI $0x4, Y1, Y1, Y1
  1870	VL   64(P2ptr), Y0       // Z2L
  1871	VPDI $0x4, Y0, Y0, Y0
  1872	CALL p256MulInternal<>(SB)
  1873
  1874	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  1875	VLR  T0, X0
  1876	VLR  T1, X1
  1877	VLR  HL, Y0
  1878	VLR  HH, Y1
  1879	CALL p256MulInternal<>(SB)
  1880	VPDI $0x4, T1, T1, TT1   // store Z3 to res via TT1/TT0
  1881	VST  TT1, 80(P3ptr)
  1882	VPDI $0x4, T0, T0, TT0
  1883	VST  TT0, 64(P3ptr)
  1884
  1885	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  1886	VL   48(P1ptr), X1       // Y1H
  1887	VPDI $0x4, X1, X1, X1
  1888	VL   32(P1ptr), X0       // Y1L
  1889	VPDI $0x4, X0, X0, X0
  1890	VLR  S1L, Y0
  1891	VLR  S1H, Y1
  1892	CALL p256MulInternal<>(SB)
  1893	VLR  T0, S1L
  1894	VLR  T1, S1H
  1895
  1896	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  1897	VL   48(P2ptr), X1       // Y2H
  1898	VPDI $0x4, X1, X1, X1
  1899	VL   32(P2ptr), X0       // Y2L
  1900	VPDI $0x4, X0, X0, X0
  1901	VLR  RL, Y0
  1902	VLR  RH, Y1
  1903	CALL p256MulInternal<>(SB)
  1904
  1905	// SUB(R<T-S1)           // R  = T-S1
  1906	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  1907
  1908	// if R == 0 or R^P == 0 then ret=ret else ret=0
  1909	// clobbers T1H and T1L
  1910	MOVD   $0, ISZERO
  1911	MOVD   $1, TRUE
  1912	VZERO  ZER
  1913	VO     RL, RH, T1H       // OR of both halves: all-zero only if R == 0
  1914	VCEQGS ZER, T1H, T1H
  1915	MOVDEQ TRUE, ISZERO
  1916	VX     RL, PL, T1L       // XOR with P: all-zero only if R == P
  1917	VX     RH, PH, T1H
  1918	VO     T1L, T1H, T1H
  1919	VCEQGS ZER, T1H, T1H
  1920	MOVDEQ TRUE, ISZERO
  1921	AND    ret+24(FP), ISZERO // combine with the H test stored earlier
  1922	MOVD   ISZERO, ret+24(FP)
  1923
  1924	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  1925	VLR  HL, X0
  1926	VLR  HH, X1
  1927	VLR  HL, Y0
  1928	VLR  HH, Y1
  1929	CALL p256SqrInternal<>(SB)
  1930
  1931	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  1932	VLR  T0, Y0
  1933	VLR  T1, Y1
  1934	CALL p256MulInternal<>(SB)
  1935	VLR  T0, T2L
  1936	VLR  T1, T2H
  1937
  1938	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  1939	VLR  U1L, X0
  1940	VLR  U1H, X1
  1941	CALL p256MulInternal<>(SB)
  1942	VLR  T0, U1L
  1943	VLR  T1, U1H
  1944
  1945	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  1946	VLR  RL, X0
  1947	VLR  RH, X1
  1948	VLR  RL, Y0
  1949	VLR  RH, Y1
  1950	CALL p256SqrInternal<>(SB)
  1951
  1952	// SUB(T<T-T2)           // X3 = X3-T2
  1953	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  1954
  1955	// ADD(X<U1+U1)          // T1 = 2*U1
  1956	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  1957
  1958	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  1959	p256SubInternal(T1,T0,T1,T0,X1,X0)
  1960	VPDI $0x4, T1, T1, TT1   // store X3 via TT1/TT0; T1:T0 are still needed
  1961	VST  TT1, 16(P3ptr)
  1962	VPDI $0x4, T0, T0, TT0
  1963	VST  TT0, 0(P3ptr)
  1964
  1965	// SUB(Y<U1-T)           // Y3 = U1-X3
  1966	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  1967
  1968	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  1969	VLR  RL, X0
  1970	VLR  RH, X1
  1971	CALL p256MulInternal<>(SB)
  1972	VLR  T0, U1L
  1973	VLR  T1, U1H
  1974
  1975	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  1976	VLR  S1L, X0
  1977	VLR  S1H, X1
  1978	VLR  T2L, Y0
  1979	VLR  T2H, Y1
  1980	CALL p256MulInternal<>(SB)
  1981
  1982	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  1983	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  1984	VPDI $0x4, T1, T1, T1
  1985	VST  T1, 48(P3ptr)
  1986	VPDI $0x4, T0, T0, T0
  1987	VST  T0, 32(P3ptr)
  1988
  1989	RET

View as plain text