...
Run Format

Text file src/crypto/internal/fips140/sha256/sha256block_amd64.s

Documentation: crypto/internal/fips140/sha256

     1// Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s. DO NOT EDIT.
     2
     3//go:build !purego
     4
     5#include "textflag.h"
     6
     7// func blockAVX2(dig *Digest, p []byte)
     8// Requires: AVX, AVX2, BMI2
	// Stack frame ($536 bytes):
	//   0(SP)..511(SP)  pre-added message schedule (W[t] + K[t]) for two
	//                   interleaved 64-byte blocks, written by avx2_loop1/2
	//   512(SP)         pointer to the last 64-byte block of p (end marker)
	//   520(SP)         saved input pointer (DI), restored after rounds
	// Working state registers (from the digest load order below):
	//   a=AX b=BX c=CX d=R8 e=DX f=R9 g=R10 h=R11
	// R12-R15 and DI are round-local scratch (RORX temporaries, Maj/Ch).
     9TEXT ·blockAVX2(SB), $536-32
    10	MOVQ dig+0(FP), SI
    11	MOVQ p_base+8(FP), DI
    12	MOVQ p_len+16(FP), DX
    13	LEAQ -64(DI)(DX*1), DX
    14	MOVQ DX, 512(SP)
    15	CMPQ DX, DI
    16	JE   avx2_only_one_block
    17
    18	// Load initial digest
    19	MOVL (SI), AX
    20	MOVL 4(SI), BX
    21	MOVL 8(SI), CX
    22	MOVL 12(SI), R8
    23	MOVL 16(SI), DX
    24	MOVL 20(SI), R9
    25	MOVL 24(SI), R10
    26	MOVL 28(SI), R11
    27
	// Main loop: loads 128 bytes (Y0-Y3), i.e. two 64-byte blocks whose
	// schedules are computed together in the two 128-bit ymm lanes; the
	// second block's rounds are replayed later in avx2_loop3 from the
	// saved schedule.  (Interleaving per the generated AVX2 layout.)
    28avx2_loop0:
    29	// at each iteration works with one block (512 bit)
    30	VMOVDQU (DI), Y0
    31	VMOVDQU 32(DI), Y1
    32	VMOVDQU 64(DI), Y2
    33	VMOVDQU 96(DI), Y3
    34	VMOVDQU flip_mask<>+0(SB), Y13
    35
    36	// Apply Byte Flip Mask: LE -> BE
    37	VPSHUFB Y13, Y0, Y0
    38	VPSHUFB Y13, Y1, Y1
    39	VPSHUFB Y13, Y2, Y2
    40	VPSHUFB Y13, Y3, Y3
    41
    42	// Transpose data into high/low parts
    43	VPERM2I128 $0x20, Y2, Y0, Y4
    44	VPERM2I128 $0x31, Y2, Y0, Y5
    45	VPERM2I128 $0x20, Y3, Y1, Y6
    46	VPERM2I128 $0x31, Y3, Y1, Y7
    47	LEAQ       K256<>+0(SB), BP
    48
    49avx2_last_block_enter:
    50	ADDQ $0x40, DI
    51	MOVQ DI, 520(SP)
    52	XORQ SI, SI
    53
	// Rounds 0-47: SI advances by 0x80 per iteration up to 0x180, so this
	// loop runs 3 times, each doing 16 rounds (4 groups of 4) while
	// computing the next 16 words of the message schedule (sigma0/sigma1
	// via the VPSRLD/VPSLLD/VPSRLQ sequences on Y0-Y3).
    54avx2_loop1:
    55	// Do 4 rounds and scheduling
    56	VPADDD   (BP)(SI*1), Y4, Y9
    57	VMOVDQU  Y9, (SP)(SI*1)
    58	MOVL     AX, DI
    59	RORXL    $0x19, DX, R13
    60	RORXL    $0x0b, DX, R14
    61	ADDL     (SP)(SI*1), R11
    62	ORL      CX, DI
    63	VPALIGNR $0x04, Y6, Y7, Y0
    64	MOVL     R9, R15
    65	RORXL    $0x0d, AX, R12
    66	XORL     R14, R13
    67	XORL     R10, R15
    68	VPADDD   Y4, Y0, Y0
    69	RORXL    $0x06, DX, R14
    70	ANDL     DX, R15
    71	XORL     R14, R13
    72	RORXL    $0x16, AX, R14
    73	ADDL     R11, R8
    74	ANDL     BX, DI
    75	VPALIGNR $0x04, Y4, Y5, Y1
    76	XORL     R12, R14
    77	RORXL    $0x02, AX, R12
    78	XORL     R10, R15
    79	VPSRLD   $0x07, Y1, Y2
    80	XORL     R12, R14
    81	MOVL     AX, R12
    82	ANDL     CX, R12
    83	ADDL     R13, R15
    84	VPSLLD   $0x19, Y1, Y3
    85	ORL      R12, DI
    86	ADDL     R14, R11
    87	ADDL     R15, R8
    88	VPOR     Y2, Y3, Y3
    89	VPSRLD   $0x12, Y1, Y2
    90	ADDL     R15, R11
    91	ADDL     DI, R11
    92	MOVL     R11, DI
    93	RORXL    $0x19, R8, R13
    94	RORXL    $0x0b, R8, R14
    95	ADDL     4(SP)(SI*1), R10
    96	ORL      BX, DI
    97	VPSRLD   $0x03, Y1, Y8
    98	MOVL     DX, R15
    99	RORXL    $0x0d, R11, R12
   100	XORL     R14, R13
   101	XORL     R9, R15
   102	RORXL    $0x06, R8, R14
   103	XORL     R14, R13
   104	RORXL    $0x16, R11, R14
   105	ANDL     R8, R15
   106	ADDL     R10, CX
   107	VPSLLD   $0x0e, Y1, Y1
   108	ANDL     AX, DI
   109	XORL     R12, R14
   110	VPXOR    Y1, Y3, Y3
   111	RORXL    $0x02, R11, R12
   112	XORL     R9, R15
   113	VPXOR    Y2, Y3, Y3
   114	XORL     R12, R14
   115	MOVL     R11, R12
   116	ANDL     BX, R12
   117	ADDL     R13, R15
   118	VPXOR    Y8, Y3, Y1
   119	VPSHUFD  $0xfa, Y7, Y2
   120	ORL      R12, DI
   121	ADDL     R14, R10
   122	VPADDD   Y1, Y0, Y0
   123	ADDL     R15, CX
   124	ADDL     R15, R10
   125	ADDL     DI, R10
   126	VPSRLD   $0x0a, Y2, Y8
   127	MOVL     R10, DI
   128	RORXL    $0x19, CX, R13
   129	ADDL     8(SP)(SI*1), R9
   130	VPSRLQ   $0x13, Y2, Y3
   131	RORXL    $0x0b, CX, R14
   132	ORL      AX, DI
   133	MOVL     R8, R15
   134	XORL     DX, R15
   135	RORXL    $0x0d, R10, R12
   136	XORL     R14, R13
   137	VPSRLQ   $0x11, Y2, Y2
   138	ANDL     CX, R15
   139	RORXL    $0x06, CX, R14
   140	VPXOR    Y3, Y2, Y2
   141	ADDL     R9, BX
   142	ANDL     R11, DI
   143	XORL     R14, R13
   144	RORXL    $0x16, R10, R14
   145	VPXOR    Y2, Y8, Y8
   146	XORL     DX, R15
   147	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   148	XORL     R12, R14
   149	RORXL    $0x02, R10, R12
   150	VPADDD   Y8, Y0, Y0
   151	XORL     R12, R14
   152	MOVL     R10, R12
   153	ANDL     AX, R12
   154	ADDL     R13, R15
   155	VPSHUFD  $0x50, Y0, Y2
   156	ORL      R12, DI
   157	ADDL     R14, R9
   158	ADDL     R15, BX
   159	ADDL     R15, R9
   160	ADDL     DI, R9
   161	MOVL     R9, DI
   162	RORXL    $0x19, BX, R13
   163	RORXL    $0x0b, BX, R14
   164	ADDL     12(SP)(SI*1), DX
   165	ORL      R11, DI
   166	VPSRLD   $0x0a, Y2, Y11
   167	MOVL     CX, R15
   168	RORXL    $0x0d, R9, R12
   169	XORL     R14, R13
   170	XORL     R8, R15
   171	VPSRLQ   $0x13, Y2, Y3
   172	RORXL    $0x06, BX, R14
   173	ANDL     BX, R15
   174	ADDL     DX, AX
   175	ANDL     R10, DI
   176	VPSRLQ   $0x11, Y2, Y2
   177	XORL     R14, R13
   178	XORL     R8, R15
   179	VPXOR    Y3, Y2, Y2
   180	RORXL    $0x16, R9, R14
   181	ADDL     R13, R15
   182	VPXOR    Y2, Y11, Y11
   183	XORL     R12, R14
   184	ADDL     R15, AX
   185	RORXL    $0x02, R9, R12
   186	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   187	VPADDD   Y0, Y11, Y4
   188	XORL     R12, R14
   189	MOVL     R9, R12
   190	ANDL     R11, R12
   191	ORL      R12, DI
   192	ADDL     R14, DX
   193	ADDL     R15, DX
   194	ADDL     DI, DX
   195
   196	// Do 4 rounds and scheduling
   197	VPADDD   32(BP)(SI*1), Y5, Y9
   198	VMOVDQU  Y9, 32(SP)(SI*1)
   199	MOVL     DX, DI
   200	RORXL    $0x19, AX, R13
   201	RORXL    $0x0b, AX, R14
   202	ADDL     32(SP)(SI*1), R8
   203	ORL      R10, DI
   204	VPALIGNR $0x04, Y7, Y4, Y0
   205	MOVL     BX, R15
   206	RORXL    $0x0d, DX, R12
   207	XORL     R14, R13
   208	XORL     CX, R15
   209	VPADDD   Y5, Y0, Y0
   210	RORXL    $0x06, AX, R14
   211	ANDL     AX, R15
   212	XORL     R14, R13
   213	RORXL    $0x16, DX, R14
   214	ADDL     R8, R11
   215	ANDL     R9, DI
   216	VPALIGNR $0x04, Y5, Y6, Y1
   217	XORL     R12, R14
   218	RORXL    $0x02, DX, R12
   219	XORL     CX, R15
   220	VPSRLD   $0x07, Y1, Y2
   221	XORL     R12, R14
   222	MOVL     DX, R12
   223	ANDL     R10, R12
   224	ADDL     R13, R15
   225	VPSLLD   $0x19, Y1, Y3
   226	ORL      R12, DI
   227	ADDL     R14, R8
   228	ADDL     R15, R11
   229	VPOR     Y2, Y3, Y3
   230	VPSRLD   $0x12, Y1, Y2
   231	ADDL     R15, R8
   232	ADDL     DI, R8
   233	MOVL     R8, DI
   234	RORXL    $0x19, R11, R13
   235	RORXL    $0x0b, R11, R14
   236	ADDL     36(SP)(SI*1), CX
   237	ORL      R9, DI
   238	VPSRLD   $0x03, Y1, Y8
   239	MOVL     AX, R15
   240	RORXL    $0x0d, R8, R12
   241	XORL     R14, R13
   242	XORL     BX, R15
   243	RORXL    $0x06, R11, R14
   244	XORL     R14, R13
   245	RORXL    $0x16, R8, R14
   246	ANDL     R11, R15
   247	ADDL     CX, R10
   248	VPSLLD   $0x0e, Y1, Y1
   249	ANDL     DX, DI
   250	XORL     R12, R14
   251	VPXOR    Y1, Y3, Y3
   252	RORXL    $0x02, R8, R12
   253	XORL     BX, R15
   254	VPXOR    Y2, Y3, Y3
   255	XORL     R12, R14
   256	MOVL     R8, R12
   257	ANDL     R9, R12
   258	ADDL     R13, R15
   259	VPXOR    Y8, Y3, Y1
   260	VPSHUFD  $0xfa, Y4, Y2
   261	ORL      R12, DI
   262	ADDL     R14, CX
   263	VPADDD   Y1, Y0, Y0
   264	ADDL     R15, R10
   265	ADDL     R15, CX
   266	ADDL     DI, CX
   267	VPSRLD   $0x0a, Y2, Y8
   268	MOVL     CX, DI
   269	RORXL    $0x19, R10, R13
   270	ADDL     40(SP)(SI*1), BX
   271	VPSRLQ   $0x13, Y2, Y3
   272	RORXL    $0x0b, R10, R14
   273	ORL      DX, DI
   274	MOVL     R11, R15
   275	XORL     AX, R15
   276	RORXL    $0x0d, CX, R12
   277	XORL     R14, R13
   278	VPSRLQ   $0x11, Y2, Y2
   279	ANDL     R10, R15
   280	RORXL    $0x06, R10, R14
   281	VPXOR    Y3, Y2, Y2
   282	ADDL     BX, R9
   283	ANDL     R8, DI
   284	XORL     R14, R13
   285	RORXL    $0x16, CX, R14
   286	VPXOR    Y2, Y8, Y8
   287	XORL     AX, R15
   288	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   289	XORL     R12, R14
   290	RORXL    $0x02, CX, R12
   291	VPADDD   Y8, Y0, Y0
   292	XORL     R12, R14
   293	MOVL     CX, R12
   294	ANDL     DX, R12
   295	ADDL     R13, R15
   296	VPSHUFD  $0x50, Y0, Y2
   297	ORL      R12, DI
   298	ADDL     R14, BX
   299	ADDL     R15, R9
   300	ADDL     R15, BX
   301	ADDL     DI, BX
   302	MOVL     BX, DI
   303	RORXL    $0x19, R9, R13
   304	RORXL    $0x0b, R9, R14
   305	ADDL     44(SP)(SI*1), AX
   306	ORL      R8, DI
   307	VPSRLD   $0x0a, Y2, Y11
   308	MOVL     R10, R15
   309	RORXL    $0x0d, BX, R12
   310	XORL     R14, R13
   311	XORL     R11, R15
   312	VPSRLQ   $0x13, Y2, Y3
   313	RORXL    $0x06, R9, R14
   314	ANDL     R9, R15
   315	ADDL     AX, DX
   316	ANDL     CX, DI
   317	VPSRLQ   $0x11, Y2, Y2
   318	XORL     R14, R13
   319	XORL     R11, R15
   320	VPXOR    Y3, Y2, Y2
   321	RORXL    $0x16, BX, R14
   322	ADDL     R13, R15
   323	VPXOR    Y2, Y11, Y11
   324	XORL     R12, R14
   325	ADDL     R15, DX
   326	RORXL    $0x02, BX, R12
   327	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   328	VPADDD   Y0, Y11, Y5
   329	XORL     R12, R14
   330	MOVL     BX, R12
   331	ANDL     R8, R12
   332	ORL      R12, DI
   333	ADDL     R14, AX
   334	ADDL     R15, AX
   335	ADDL     DI, AX
   336
   337	// Do 4 rounds and scheduling
   338	VPADDD   64(BP)(SI*1), Y6, Y9
   339	VMOVDQU  Y9, 64(SP)(SI*1)
   340	MOVL     AX, DI
   341	RORXL    $0x19, DX, R13
   342	RORXL    $0x0b, DX, R14
   343	ADDL     64(SP)(SI*1), R11
   344	ORL      CX, DI
   345	VPALIGNR $0x04, Y4, Y5, Y0
   346	MOVL     R9, R15
   347	RORXL    $0x0d, AX, R12
   348	XORL     R14, R13
   349	XORL     R10, R15
   350	VPADDD   Y6, Y0, Y0
   351	RORXL    $0x06, DX, R14
   352	ANDL     DX, R15
   353	XORL     R14, R13
   354	RORXL    $0x16, AX, R14
   355	ADDL     R11, R8
   356	ANDL     BX, DI
   357	VPALIGNR $0x04, Y6, Y7, Y1
   358	XORL     R12, R14
   359	RORXL    $0x02, AX, R12
   360	XORL     R10, R15
   361	VPSRLD   $0x07, Y1, Y2
   362	XORL     R12, R14
   363	MOVL     AX, R12
   364	ANDL     CX, R12
   365	ADDL     R13, R15
   366	VPSLLD   $0x19, Y1, Y3
   367	ORL      R12, DI
   368	ADDL     R14, R11
   369	ADDL     R15, R8
   370	VPOR     Y2, Y3, Y3
   371	VPSRLD   $0x12, Y1, Y2
   372	ADDL     R15, R11
   373	ADDL     DI, R11
   374	MOVL     R11, DI
   375	RORXL    $0x19, R8, R13
   376	RORXL    $0x0b, R8, R14
   377	ADDL     68(SP)(SI*1), R10
   378	ORL      BX, DI
   379	VPSRLD   $0x03, Y1, Y8
   380	MOVL     DX, R15
   381	RORXL    $0x0d, R11, R12
   382	XORL     R14, R13
   383	XORL     R9, R15
   384	RORXL    $0x06, R8, R14
   385	XORL     R14, R13
   386	RORXL    $0x16, R11, R14
   387	ANDL     R8, R15
   388	ADDL     R10, CX
   389	VPSLLD   $0x0e, Y1, Y1
   390	ANDL     AX, DI
   391	XORL     R12, R14
   392	VPXOR    Y1, Y3, Y3
   393	RORXL    $0x02, R11, R12
   394	XORL     R9, R15
   395	VPXOR    Y2, Y3, Y3
   396	XORL     R12, R14
   397	MOVL     R11, R12
   398	ANDL     BX, R12
   399	ADDL     R13, R15
   400	VPXOR    Y8, Y3, Y1
   401	VPSHUFD  $0xfa, Y5, Y2
   402	ORL      R12, DI
   403	ADDL     R14, R10
   404	VPADDD   Y1, Y0, Y0
   405	ADDL     R15, CX
   406	ADDL     R15, R10
   407	ADDL     DI, R10
   408	VPSRLD   $0x0a, Y2, Y8
   409	MOVL     R10, DI
   410	RORXL    $0x19, CX, R13
   411	ADDL     72(SP)(SI*1), R9
   412	VPSRLQ   $0x13, Y2, Y3
   413	RORXL    $0x0b, CX, R14
   414	ORL      AX, DI
   415	MOVL     R8, R15
   416	XORL     DX, R15
   417	RORXL    $0x0d, R10, R12
   418	XORL     R14, R13
   419	VPSRLQ   $0x11, Y2, Y2
   420	ANDL     CX, R15
   421	RORXL    $0x06, CX, R14
   422	VPXOR    Y3, Y2, Y2
   423	ADDL     R9, BX
   424	ANDL     R11, DI
   425	XORL     R14, R13
   426	RORXL    $0x16, R10, R14
   427	VPXOR    Y2, Y8, Y8
   428	XORL     DX, R15
   429	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   430	XORL     R12, R14
   431	RORXL    $0x02, R10, R12
   432	VPADDD   Y8, Y0, Y0
   433	XORL     R12, R14
   434	MOVL     R10, R12
   435	ANDL     AX, R12
   436	ADDL     R13, R15
   437	VPSHUFD  $0x50, Y0, Y2
   438	ORL      R12, DI
   439	ADDL     R14, R9
   440	ADDL     R15, BX
   441	ADDL     R15, R9
   442	ADDL     DI, R9
   443	MOVL     R9, DI
   444	RORXL    $0x19, BX, R13
   445	RORXL    $0x0b, BX, R14
   446	ADDL     76(SP)(SI*1), DX
   447	ORL      R11, DI
   448	VPSRLD   $0x0a, Y2, Y11
   449	MOVL     CX, R15
   450	RORXL    $0x0d, R9, R12
   451	XORL     R14, R13
   452	XORL     R8, R15
   453	VPSRLQ   $0x13, Y2, Y3
   454	RORXL    $0x06, BX, R14
   455	ANDL     BX, R15
   456	ADDL     DX, AX
   457	ANDL     R10, DI
   458	VPSRLQ   $0x11, Y2, Y2
   459	XORL     R14, R13
   460	XORL     R8, R15
   461	VPXOR    Y3, Y2, Y2
   462	RORXL    $0x16, R9, R14
   463	ADDL     R13, R15
   464	VPXOR    Y2, Y11, Y11
   465	XORL     R12, R14
   466	ADDL     R15, AX
   467	RORXL    $0x02, R9, R12
   468	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   469	VPADDD   Y0, Y11, Y6
   470	XORL     R12, R14
   471	MOVL     R9, R12
   472	ANDL     R11, R12
   473	ORL      R12, DI
   474	ADDL     R14, DX
   475	ADDL     R15, DX
   476	ADDL     DI, DX
   477
   478	// Do 4 rounds and scheduling
   479	VPADDD   96(BP)(SI*1), Y7, Y9
   480	VMOVDQU  Y9, 96(SP)(SI*1)
   481	MOVL     DX, DI
   482	RORXL    $0x19, AX, R13
   483	RORXL    $0x0b, AX, R14
   484	ADDL     96(SP)(SI*1), R8
   485	ORL      R10, DI
   486	VPALIGNR $0x04, Y5, Y6, Y0
   487	MOVL     BX, R15
   488	RORXL    $0x0d, DX, R12
   489	XORL     R14, R13
   490	XORL     CX, R15
   491	VPADDD   Y7, Y0, Y0
   492	RORXL    $0x06, AX, R14
   493	ANDL     AX, R15
   494	XORL     R14, R13
   495	RORXL    $0x16, DX, R14
   496	ADDL     R8, R11
   497	ANDL     R9, DI
   498	VPALIGNR $0x04, Y7, Y4, Y1
   499	XORL     R12, R14
   500	RORXL    $0x02, DX, R12
   501	XORL     CX, R15
   502	VPSRLD   $0x07, Y1, Y2
   503	XORL     R12, R14
   504	MOVL     DX, R12
   505	ANDL     R10, R12
   506	ADDL     R13, R15
   507	VPSLLD   $0x19, Y1, Y3
   508	ORL      R12, DI
   509	ADDL     R14, R8
   510	ADDL     R15, R11
   511	VPOR     Y2, Y3, Y3
   512	VPSRLD   $0x12, Y1, Y2
   513	ADDL     R15, R8
   514	ADDL     DI, R8
   515	MOVL     R8, DI
   516	RORXL    $0x19, R11, R13
   517	RORXL    $0x0b, R11, R14
   518	ADDL     100(SP)(SI*1), CX
   519	ORL      R9, DI
   520	VPSRLD   $0x03, Y1, Y8
   521	MOVL     AX, R15
   522	RORXL    $0x0d, R8, R12
   523	XORL     R14, R13
   524	XORL     BX, R15
   525	RORXL    $0x06, R11, R14
   526	XORL     R14, R13
   527	RORXL    $0x16, R8, R14
   528	ANDL     R11, R15
   529	ADDL     CX, R10
   530	VPSLLD   $0x0e, Y1, Y1
   531	ANDL     DX, DI
   532	XORL     R12, R14
   533	VPXOR    Y1, Y3, Y3
   534	RORXL    $0x02, R8, R12
   535	XORL     BX, R15
   536	VPXOR    Y2, Y3, Y3
   537	XORL     R12, R14
   538	MOVL     R8, R12
   539	ANDL     R9, R12
   540	ADDL     R13, R15
   541	VPXOR    Y8, Y3, Y1
   542	VPSHUFD  $0xfa, Y6, Y2
   543	ORL      R12, DI
   544	ADDL     R14, CX
   545	VPADDD   Y1, Y0, Y0
   546	ADDL     R15, R10
   547	ADDL     R15, CX
   548	ADDL     DI, CX
   549	VPSRLD   $0x0a, Y2, Y8
   550	MOVL     CX, DI
   551	RORXL    $0x19, R10, R13
   552	ADDL     104(SP)(SI*1), BX
   553	VPSRLQ   $0x13, Y2, Y3
   554	RORXL    $0x0b, R10, R14
   555	ORL      DX, DI
   556	MOVL     R11, R15
   557	XORL     AX, R15
   558	RORXL    $0x0d, CX, R12
   559	XORL     R14, R13
   560	VPSRLQ   $0x11, Y2, Y2
   561	ANDL     R10, R15
   562	RORXL    $0x06, R10, R14
   563	VPXOR    Y3, Y2, Y2
   564	ADDL     BX, R9
   565	ANDL     R8, DI
   566	XORL     R14, R13
   567	RORXL    $0x16, CX, R14
   568	VPXOR    Y2, Y8, Y8
   569	XORL     AX, R15
   570	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
   571	XORL     R12, R14
   572	RORXL    $0x02, CX, R12
   573	VPADDD   Y8, Y0, Y0
   574	XORL     R12, R14
   575	MOVL     CX, R12
   576	ANDL     DX, R12
   577	ADDL     R13, R15
   578	VPSHUFD  $0x50, Y0, Y2
   579	ORL      R12, DI
   580	ADDL     R14, BX
   581	ADDL     R15, R9
   582	ADDL     R15, BX
   583	ADDL     DI, BX
   584	MOVL     BX, DI
   585	RORXL    $0x19, R9, R13
   586	RORXL    $0x0b, R9, R14
   587	ADDL     108(SP)(SI*1), AX
   588	ORL      R8, DI
   589	VPSRLD   $0x0a, Y2, Y11
   590	MOVL     R10, R15
   591	RORXL    $0x0d, BX, R12
   592	XORL     R14, R13
   593	XORL     R11, R15
   594	VPSRLQ   $0x13, Y2, Y3
   595	RORXL    $0x06, R9, R14
   596	ANDL     R9, R15
   597	ADDL     AX, DX
   598	ANDL     CX, DI
   599	VPSRLQ   $0x11, Y2, Y2
   600	XORL     R14, R13
   601	XORL     R11, R15
   602	VPXOR    Y3, Y2, Y2
   603	RORXL    $0x16, BX, R14
   604	ADDL     R13, R15
   605	VPXOR    Y2, Y11, Y11
   606	XORL     R12, R14
   607	ADDL     R15, DX
   608	RORXL    $0x02, BX, R12
   609	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
   610	VPADDD   Y0, Y11, Y7
   611	XORL     R12, R14
   612	MOVL     BX, R12
   613	ANDL     R8, R12
   614	ORL      R12, DI
   615	ADDL     R14, AX
   616	ADDL     R15, AX
   617	ADDL     DI, AX
   618	ADDQ     $0x80, SI
   619	CMPQ     SI, $0x00000180
   620	JB       avx2_loop1
   621
	// Rounds 48-63: no more scheduling needed; each iteration does 8
	// rounds (two groups of 4), SI goes 0x180 -> 0x200 in 0x40 steps,
	// i.e. 2 iterations.
   622avx2_loop2:
   623	VPADDD  (BP)(SI*1), Y4, Y9
   624	VMOVDQU Y9, (SP)(SI*1)
   625	MOVL    R9, R15
   626	RORXL   $0x19, DX, R13
   627	RORXL   $0x0b, DX, R14
   628	XORL    R10, R15
   629	XORL    R14, R13
   630	RORXL   $0x06, DX, R14
   631	ANDL    DX, R15
   632	XORL    R14, R13
   633	RORXL   $0x0d, AX, R12
   634	XORL    R10, R15
   635	RORXL   $0x16, AX, R14
   636	MOVL    AX, DI
   637	XORL    R12, R14
   638	RORXL   $0x02, AX, R12
   639	ADDL    (SP)(SI*1), R11
   640	ORL     CX, DI
   641	XORL    R12, R14
   642	MOVL    AX, R12
   643	ANDL    BX, DI
   644	ANDL    CX, R12
   645	ADDL    R13, R15
   646	ADDL    R11, R8
   647	ORL     R12, DI
   648	ADDL    R14, R11
   649	ADDL    R15, R8
   650	ADDL    R15, R11
   651	MOVL    DX, R15
   652	RORXL   $0x19, R8, R13
   653	RORXL   $0x0b, R8, R14
   654	XORL    R9, R15
   655	XORL    R14, R13
   656	RORXL   $0x06, R8, R14
   657	ANDL    R8, R15
   658	ADDL    DI, R11
   659	XORL    R14, R13
   660	RORXL   $0x0d, R11, R12
   661	XORL    R9, R15
   662	RORXL   $0x16, R11, R14
   663	MOVL    R11, DI
   664	XORL    R12, R14
   665	RORXL   $0x02, R11, R12
   666	ADDL    4(SP)(SI*1), R10
   667	ORL     BX, DI
   668	XORL    R12, R14
   669	MOVL    R11, R12
   670	ANDL    AX, DI
   671	ANDL    BX, R12
   672	ADDL    R13, R15
   673	ADDL    R10, CX
   674	ORL     R12, DI
   675	ADDL    R14, R10
   676	ADDL    R15, CX
   677	ADDL    R15, R10
   678	MOVL    R8, R15
   679	RORXL   $0x19, CX, R13
   680	RORXL   $0x0b, CX, R14
   681	XORL    DX, R15
   682	XORL    R14, R13
   683	RORXL   $0x06, CX, R14
   684	ANDL    CX, R15
   685	ADDL    DI, R10
   686	XORL    R14, R13
   687	RORXL   $0x0d, R10, R12
   688	XORL    DX, R15
   689	RORXL   $0x16, R10, R14
   690	MOVL    R10, DI
   691	XORL    R12, R14
   692	RORXL   $0x02, R10, R12
   693	ADDL    8(SP)(SI*1), R9
   694	ORL     AX, DI
   695	XORL    R12, R14
   696	MOVL    R10, R12
   697	ANDL    R11, DI
   698	ANDL    AX, R12
   699	ADDL    R13, R15
   700	ADDL    R9, BX
   701	ORL     R12, DI
   702	ADDL    R14, R9
   703	ADDL    R15, BX
   704	ADDL    R15, R9
   705	MOVL    CX, R15
   706	RORXL   $0x19, BX, R13
   707	RORXL   $0x0b, BX, R14
   708	XORL    R8, R15
   709	XORL    R14, R13
   710	RORXL   $0x06, BX, R14
   711	ANDL    BX, R15
   712	ADDL    DI, R9
   713	XORL    R14, R13
   714	RORXL   $0x0d, R9, R12
   715	XORL    R8, R15
   716	RORXL   $0x16, R9, R14
   717	MOVL    R9, DI
   718	XORL    R12, R14
   719	RORXL   $0x02, R9, R12
   720	ADDL    12(SP)(SI*1), DX
   721	ORL     R11, DI
   722	XORL    R12, R14
   723	MOVL    R9, R12
   724	ANDL    R10, DI
   725	ANDL    R11, R12
   726	ADDL    R13, R15
   727	ADDL    DX, AX
   728	ORL     R12, DI
   729	ADDL    R14, DX
   730	ADDL    R15, AX
   731	ADDL    R15, DX
   732	ADDL    DI, DX
   733	VPADDD  32(BP)(SI*1), Y5, Y9
   734	VMOVDQU Y9, 32(SP)(SI*1)
   735	MOVL    BX, R15
   736	RORXL   $0x19, AX, R13
   737	RORXL   $0x0b, AX, R14
   738	XORL    CX, R15
   739	XORL    R14, R13
   740	RORXL   $0x06, AX, R14
   741	ANDL    AX, R15
   742	XORL    R14, R13
   743	RORXL   $0x0d, DX, R12
   744	XORL    CX, R15
   745	RORXL   $0x16, DX, R14
   746	MOVL    DX, DI
   747	XORL    R12, R14
   748	RORXL   $0x02, DX, R12
   749	ADDL    32(SP)(SI*1), R8
   750	ORL     R10, DI
   751	XORL    R12, R14
   752	MOVL    DX, R12
   753	ANDL    R9, DI
   754	ANDL    R10, R12
   755	ADDL    R13, R15
   756	ADDL    R8, R11
   757	ORL     R12, DI
   758	ADDL    R14, R8
   759	ADDL    R15, R11
   760	ADDL    R15, R8
   761	MOVL    AX, R15
   762	RORXL   $0x19, R11, R13
   763	RORXL   $0x0b, R11, R14
   764	XORL    BX, R15
   765	XORL    R14, R13
   766	RORXL   $0x06, R11, R14
   767	ANDL    R11, R15
   768	ADDL    DI, R8
   769	XORL    R14, R13
   770	RORXL   $0x0d, R8, R12
   771	XORL    BX, R15
   772	RORXL   $0x16, R8, R14
   773	MOVL    R8, DI
   774	XORL    R12, R14
   775	RORXL   $0x02, R8, R12
   776	ADDL    36(SP)(SI*1), CX
   777	ORL     R9, DI
   778	XORL    R12, R14
   779	MOVL    R8, R12
   780	ANDL    DX, DI
   781	ANDL    R9, R12
   782	ADDL    R13, R15
   783	ADDL    CX, R10
   784	ORL     R12, DI
   785	ADDL    R14, CX
   786	ADDL    R15, R10
   787	ADDL    R15, CX
   788	MOVL    R11, R15
   789	RORXL   $0x19, R10, R13
   790	RORXL   $0x0b, R10, R14
   791	XORL    AX, R15
   792	XORL    R14, R13
   793	RORXL   $0x06, R10, R14
   794	ANDL    R10, R15
   795	ADDL    DI, CX
   796	XORL    R14, R13
   797	RORXL   $0x0d, CX, R12
   798	XORL    AX, R15
   799	RORXL   $0x16, CX, R14
   800	MOVL    CX, DI
   801	XORL    R12, R14
   802	RORXL   $0x02, CX, R12
   803	ADDL    40(SP)(SI*1), BX
   804	ORL     DX, DI
   805	XORL    R12, R14
   806	MOVL    CX, R12
   807	ANDL    R8, DI
   808	ANDL    DX, R12
   809	ADDL    R13, R15
   810	ADDL    BX, R9
   811	ORL     R12, DI
   812	ADDL    R14, BX
   813	ADDL    R15, R9
   814	ADDL    R15, BX
   815	MOVL    R10, R15
   816	RORXL   $0x19, R9, R13
   817	RORXL   $0x0b, R9, R14
   818	XORL    R11, R15
   819	XORL    R14, R13
   820	RORXL   $0x06, R9, R14
   821	ANDL    R9, R15
   822	ADDL    DI, BX
   823	XORL    R14, R13
   824	RORXL   $0x0d, BX, R12
   825	XORL    R11, R15
   826	RORXL   $0x16, BX, R14
   827	MOVL    BX, DI
   828	XORL    R12, R14
   829	RORXL   $0x02, BX, R12
   830	ADDL    44(SP)(SI*1), AX
   831	ORL     R8, DI
   832	XORL    R12, R14
   833	MOVL    BX, R12
   834	ANDL    CX, DI
   835	ANDL    R8, R12
   836	ADDL    R13, R15
   837	ADDL    AX, DX
   838	ORL     R12, DI
   839	ADDL    R14, AX
   840	ADDL    R15, DX
   841	ADDL    R15, AX
   842	ADDL    DI, AX
   843	ADDQ    $0x40, SI
   844	VMOVDQU Y6, Y4
   845	VMOVDQU Y7, Y5
   846	CMPQ    SI, $0x00000200
   847	JB      avx2_loop2
	// First interleaved block done: fold working state back into the
	// digest and reload it, then decide whether a second saved schedule
	// remains to be replayed (avx2_loop3) or we are done.
   848	MOVQ    dig+0(FP), SI
   849	MOVQ    520(SP), DI
   850	ADDL    AX, (SI)
   851	MOVL    (SI), AX
   852	ADDL    BX, 4(SI)
   853	MOVL    4(SI), BX
   854	ADDL    CX, 8(SI)
   855	MOVL    8(SI), CX
   856	ADDL    R8, 12(SI)
   857	MOVL    12(SI), R8
   858	ADDL    DX, 16(SI)
   859	MOVL    16(SI), DX
   860	ADDL    R9, 20(SI)
   861	MOVL    20(SI), R9
   862	ADDL    R10, 24(SI)
   863	MOVL    24(SI), R10
   864	ADDL    R11, 28(SI)
   865	MOVL    28(SI), R11
   866	CMPQ    512(SP), DI
   867	JB      done_hash
   868	XORQ    SI, SI
   869
	// Replay all 64 rounds for the second interleaved block using the
	// schedule already stored on the stack: its words live in the other
	// ymm lane, at offsets +16 and +48 within each 64-byte group.
   870avx2_loop3:
   871	MOVL  R9, R15
   872	RORXL $0x19, DX, R13
   873	RORXL $0x0b, DX, R14
   874	XORL  R10, R15
   875	XORL  R14, R13
   876	RORXL $0x06, DX, R14
   877	ANDL  DX, R15
   878	XORL  R14, R13
   879	RORXL $0x0d, AX, R12
   880	XORL  R10, R15
   881	RORXL $0x16, AX, R14
   882	MOVL  AX, DI
   883	XORL  R12, R14
   884	RORXL $0x02, AX, R12
   885	ADDL  16(SP)(SI*1), R11
   886	ORL   CX, DI
   887	XORL  R12, R14
   888	MOVL  AX, R12
   889	ANDL  BX, DI
   890	ANDL  CX, R12
   891	ADDL  R13, R15
   892	ADDL  R11, R8
   893	ORL   R12, DI
   894	ADDL  R14, R11
   895	ADDL  R15, R8
   896	ADDL  R15, R11
   897	MOVL  DX, R15
   898	RORXL $0x19, R8, R13
   899	RORXL $0x0b, R8, R14
   900	XORL  R9, R15
   901	XORL  R14, R13
   902	RORXL $0x06, R8, R14
   903	ANDL  R8, R15
   904	ADDL  DI, R11
   905	XORL  R14, R13
   906	RORXL $0x0d, R11, R12
   907	XORL  R9, R15
   908	RORXL $0x16, R11, R14
   909	MOVL  R11, DI
   910	XORL  R12, R14
   911	RORXL $0x02, R11, R12
   912	ADDL  20(SP)(SI*1), R10
   913	ORL   BX, DI
   914	XORL  R12, R14
   915	MOVL  R11, R12
   916	ANDL  AX, DI
   917	ANDL  BX, R12
   918	ADDL  R13, R15
   919	ADDL  R10, CX
   920	ORL   R12, DI
   921	ADDL  R14, R10
   922	ADDL  R15, CX
   923	ADDL  R15, R10
   924	MOVL  R8, R15
   925	RORXL $0x19, CX, R13
   926	RORXL $0x0b, CX, R14
   927	XORL  DX, R15
   928	XORL  R14, R13
   929	RORXL $0x06, CX, R14
   930	ANDL  CX, R15
   931	ADDL  DI, R10
   932	XORL  R14, R13
   933	RORXL $0x0d, R10, R12
   934	XORL  DX, R15
   935	RORXL $0x16, R10, R14
   936	MOVL  R10, DI
   937	XORL  R12, R14
   938	RORXL $0x02, R10, R12
   939	ADDL  24(SP)(SI*1), R9
   940	ORL   AX, DI
   941	XORL  R12, R14
   942	MOVL  R10, R12
   943	ANDL  R11, DI
   944	ANDL  AX, R12
   945	ADDL  R13, R15
   946	ADDL  R9, BX
   947	ORL   R12, DI
   948	ADDL  R14, R9
   949	ADDL  R15, BX
   950	ADDL  R15, R9
   951	MOVL  CX, R15
   952	RORXL $0x19, BX, R13
   953	RORXL $0x0b, BX, R14
   954	XORL  R8, R15
   955	XORL  R14, R13
   956	RORXL $0x06, BX, R14
   957	ANDL  BX, R15
   958	ADDL  DI, R9
   959	XORL  R14, R13
   960	RORXL $0x0d, R9, R12
   961	XORL  R8, R15
   962	RORXL $0x16, R9, R14
   963	MOVL  R9, DI
   964	XORL  R12, R14
   965	RORXL $0x02, R9, R12
   966	ADDL  28(SP)(SI*1), DX
   967	ORL   R11, DI
   968	XORL  R12, R14
   969	MOVL  R9, R12
   970	ANDL  R10, DI
   971	ANDL  R11, R12
   972	ADDL  R13, R15
   973	ADDL  DX, AX
   974	ORL   R12, DI
   975	ADDL  R14, DX
   976	ADDL  R15, AX
   977	ADDL  R15, DX
   978	ADDL  DI, DX
   979	MOVL  BX, R15
   980	RORXL $0x19, AX, R13
   981	RORXL $0x0b, AX, R14
   982	XORL  CX, R15
   983	XORL  R14, R13
   984	RORXL $0x06, AX, R14
   985	ANDL  AX, R15
   986	XORL  R14, R13
   987	RORXL $0x0d, DX, R12
   988	XORL  CX, R15
   989	RORXL $0x16, DX, R14
   990	MOVL  DX, DI
   991	XORL  R12, R14
   992	RORXL $0x02, DX, R12
   993	ADDL  48(SP)(SI*1), R8
   994	ORL   R10, DI
   995	XORL  R12, R14
   996	MOVL  DX, R12
   997	ANDL  R9, DI
   998	ANDL  R10, R12
   999	ADDL  R13, R15
  1000	ADDL  R8, R11
  1001	ORL   R12, DI
  1002	ADDL  R14, R8
  1003	ADDL  R15, R11
  1004	ADDL  R15, R8
  1005	MOVL  AX, R15
  1006	RORXL $0x19, R11, R13
  1007	RORXL $0x0b, R11, R14
  1008	XORL  BX, R15
  1009	XORL  R14, R13
  1010	RORXL $0x06, R11, R14
  1011	ANDL  R11, R15
  1012	ADDL  DI, R8
  1013	XORL  R14, R13
  1014	RORXL $0x0d, R8, R12
  1015	XORL  BX, R15
  1016	RORXL $0x16, R8, R14
  1017	MOVL  R8, DI
  1018	XORL  R12, R14
  1019	RORXL $0x02, R8, R12
  1020	ADDL  52(SP)(SI*1), CX
  1021	ORL   R9, DI
  1022	XORL  R12, R14
  1023	MOVL  R8, R12
  1024	ANDL  DX, DI
  1025	ANDL  R9, R12
  1026	ADDL  R13, R15
  1027	ADDL  CX, R10
  1028	ORL   R12, DI
  1029	ADDL  R14, CX
  1030	ADDL  R15, R10
  1031	ADDL  R15, CX
  1032	MOVL  R11, R15
  1033	RORXL $0x19, R10, R13
  1034	RORXL $0x0b, R10, R14
  1035	XORL  AX, R15
  1036	XORL  R14, R13
  1037	RORXL $0x06, R10, R14
  1038	ANDL  R10, R15
  1039	ADDL  DI, CX
  1040	XORL  R14, R13
  1041	RORXL $0x0d, CX, R12
  1042	XORL  AX, R15
  1043	RORXL $0x16, CX, R14
  1044	MOVL  CX, DI
  1045	XORL  R12, R14
  1046	RORXL $0x02, CX, R12
  1047	ADDL  56(SP)(SI*1), BX
  1048	ORL   DX, DI
  1049	XORL  R12, R14
  1050	MOVL  CX, R12
  1051	ANDL  R8, DI
  1052	ANDL  DX, R12
  1053	ADDL  R13, R15
  1054	ADDL  BX, R9
  1055	ORL   R12, DI
  1056	ADDL  R14, BX
  1057	ADDL  R15, R9
  1058	ADDL  R15, BX
  1059	MOVL  R10, R15
  1060	RORXL $0x19, R9, R13
  1061	RORXL $0x0b, R9, R14
  1062	XORL  R11, R15
  1063	XORL  R14, R13
  1064	RORXL $0x06, R9, R14
  1065	ANDL  R9, R15
  1066	ADDL  DI, BX
  1067	XORL  R14, R13
  1068	RORXL $0x0d, BX, R12
  1069	XORL  R11, R15
  1070	RORXL $0x16, BX, R14
  1071	MOVL  BX, DI
  1072	XORL  R12, R14
  1073	RORXL $0x02, BX, R12
  1074	ADDL  60(SP)(SI*1), AX
  1075	ORL   R8, DI
  1076	XORL  R12, R14
  1077	MOVL  BX, R12
  1078	ANDL  CX, DI
  1079	ANDL  R8, R12
  1080	ADDL  R13, R15
  1081	ADDL  AX, DX
  1082	ORL   R12, DI
  1083	ADDL  R14, AX
  1084	ADDL  R15, DX
  1085	ADDL  R15, AX
  1086	ADDL  DI, AX
  1087	ADDQ  $0x40, SI
  1088	CMPQ  SI, $0x00000200
  1089	JB    avx2_loop3
	// Second block done: fold state into the digest, advance the input
	// pointer past it, then either loop for more input, exit, or fall
	// through to handle exactly one remaining (last) block.
  1090	MOVQ  dig+0(FP), SI
  1091	MOVQ  520(SP), DI
  1092	ADDQ  $0x40, DI
  1093	ADDL  AX, (SI)
  1094	MOVL  (SI), AX
  1095	ADDL  BX, 4(SI)
  1096	MOVL  4(SI), BX
  1097	ADDL  CX, 8(SI)
  1098	MOVL  8(SI), CX
  1099	ADDL  R8, 12(SI)
  1100	MOVL  12(SI), R8
  1101	ADDL  DX, 16(SI)
  1102	MOVL  16(SI), DX
  1103	ADDL  R9, 20(SI)
  1104	MOVL  20(SI), R9
  1105	ADDL  R10, 24(SI)
  1106	MOVL  24(SI), R10
  1107	ADDL  R11, 28(SI)
  1108	MOVL  28(SI), R11
  1109	CMPQ  512(SP), DI
  1110	JA    avx2_loop0
  1111	JB    done_hash
  1112
	// Exactly one 64-byte block remains: load it into the xmm halves
	// (Y4-Y7 low lanes) and re-enter the round loops.
  1113avx2_do_last_block:
  1114	VMOVDQU (DI), X4
  1115	VMOVDQU 16(DI), X5
  1116	VMOVDQU 32(DI), X6
  1117	VMOVDQU 48(DI), X7
  1118	VMOVDQU flip_mask<>+0(SB), Y13
  1119	VPSHUFB X13, X4, X4
  1120	VPSHUFB X13, X5, X5
  1121	VPSHUFB X13, X6, X6
  1122	VPSHUFB X13, X7, X7
  1123	LEAQ    K256<>+0(SB), BP
  1124	JMP     avx2_last_block_enter
  1125
	// Input was a single block: load the digest here (the main path's
	// load was skipped by the JE above) and process it as the last block.
  1126avx2_only_one_block:
  1127	MOVL (SI), AX
  1128	MOVL 4(SI), BX
  1129	MOVL 8(SI), CX
  1130	MOVL 12(SI), R8
  1131	MOVL 16(SI), DX
  1132	MOVL 20(SI), R9
  1133	MOVL 24(SI), R10
  1134	MOVL 28(SI), R11
  1135	JMP  avx2_do_last_block
  1136
	// Clear upper ymm state before returning to non-AVX code to avoid
	// AVX/SSE transition penalties.
  1137done_hash:
  1138	VZEROUPPER
  1139	RET
  1140
// flip_mask is the VPSHUFB control used above to byte-swap each 32-bit
// word of the message (little-endian input -> big-endian words); the
// same 16-byte pattern is repeated in both lanes of the 256-bit value.
  1141DATA flip_mask<>+0(SB)/8, $0x0405060700010203
  1142DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
  1143DATA flip_mask<>+16(SB)/8, $0x0405060700010203
  1144DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
  1145GLOBL flip_mask<>(SB), RODATA, $32
  1146
  1147DATA K256<>+0(SB)/4, $0x428a2f98
  1148DATA K256<>+4(SB)/4, $0x71374491
  1149DATA K256<>+8(SB)/4, $0xb5c0fbcf
  1150DATA K256<>+12(SB)/4, $0xe9b5dba5
  1151DATA K256<>+16(SB)/4, $0x428a2f98
  1152DATA K256<>+20(SB)/4, $0x71374491
  1153DATA K256<>+24(SB)/4, $0xb5c0fbcf
  1154DATA K256<>+28(SB)/4, $0xe9b5dba5
  1155DATA K256<>+32(SB)/4, $0x3956c25b
  1156DATA K256<>+36(SB)/4, $0x59f111f1
  1157DATA K256<>+40(SB)/4, $0x923f82a4
  1158DATA K256<>+44(SB)/4, $0xab1c5ed5
  1159DATA K256<>+48(SB)/4, $0x3956c25b
  1160DATA K256<>+52(SB)/4, $0x59f111f1
  1161DATA K256<>+56(SB)/4, $0x923f82a4
  1162DATA K256<>+60(SB)/4, $0xab1c5ed5
  1163DATA K256<>+64(SB)/4, $0xd807aa98
  1164DATA K256<>+68(SB)/4, $0x12835b01
  1165DATA K256<>+72(SB)/4, $0x243185be
  1166DATA K256<>+76(SB)/4, $0x550c7dc3
  1167DATA K256<>+80(SB)/4, $0xd807aa98
  1168DATA K256<>+84(SB)/4, $0x12835b01
  1169DATA K256<>+88(SB)/4, $0x243185be
  1170DATA K256<>+92(SB)/4, $0x550c7dc3
  1171DATA K256<>+96(SB)/4, $0x72be5d74
  1172DATA K256<>+100(SB)/4, $0x80deb1fe
  1173DATA K256<>+104(SB)/4, $0x9bdc06a7
  1174DATA K256<>+108(SB)/4, $0xc19bf174
  1175DATA K256<>+112(SB)/4, $0x72be5d74
  1176DATA K256<>+116(SB)/4, $0x80deb1fe
  1177DATA K256<>+120(SB)/4, $0x9bdc06a7
  1178DATA K256<>+124(SB)/4, $0xc19bf174
  1179DATA K256<>+128(SB)/4, $0xe49b69c1
  1180DATA K256<>+132(SB)/4, $0xefbe4786
  1181DATA K256<>+136(SB)/4, $0x0fc19dc6
  1182DATA K256<>+140(SB)/4, $0x240ca1cc
  1183DATA K256<>+144(SB)/4, $0xe49b69c1
  1184DATA K256<>+148(SB)/4, $0xefbe4786
  1185DATA K256<>+152(SB)/4, $0x0fc19dc6
  1186DATA K256<>+156(SB)/4, $0x240ca1cc
  1187DATA K256<>+160(SB)/4, $0x2de92c6f
  1188DATA K256<>+164(SB)/4, $0x4a7484aa
  1189DATA K256<>+168(SB)/4, $0x5cb0a9dc
  1190DATA K256<>+172(SB)/4, $0x76f988da
  1191DATA K256<>+176(SB)/4, $0x2de92c6f
  1192DATA K256<>+180(SB)/4, $0x4a7484aa
  1193DATA K256<>+184(SB)/4, $0x5cb0a9dc
  1194DATA K256<>+188(SB)/4, $0x76f988da
  1195DATA K256<>+192(SB)/4, $0x983e5152
  1196DATA K256<>+196(SB)/4, $0xa831c66d
  1197DATA K256<>+200(SB)/4, $0xb00327c8
  1198DATA K256<>+204(SB)/4, $0xbf597fc7
  1199DATA K256<>+208(SB)/4, $0x983e5152
  1200DATA K256<>+212(SB)/4, $0xa831c66d
  1201DATA K256<>+216(SB)/4, $0xb00327c8
  1202DATA K256<>+220(SB)/4, $0xbf597fc7
  1203DATA K256<>+224(SB)/4, $0xc6e00bf3
  1204DATA K256<>+228(SB)/4, $0xd5a79147
  1205DATA K256<>+232(SB)/4, $0x06ca6351
  1206DATA K256<>+236(SB)/4, $0x14292967
  1207DATA K256<>+240(SB)/4, $0xc6e00bf3
  1208DATA K256<>+244(SB)/4, $0xd5a79147
  1209DATA K256<>+248(SB)/4, $0x06ca6351
  1210DATA K256<>+252(SB)/4, $0x14292967
  1211DATA K256<>+256(SB)/4, $0x27b70a85
  1212DATA K256<>+260(SB)/4, $0x2e1b2138
  1213DATA K256<>+264(SB)/4, $0x4d2c6dfc
  1214DATA K256<>+268(SB)/4, $0x53380d13
  1215DATA K256<>+272(SB)/4, $0x27b70a85
  1216DATA K256<>+276(SB)/4, $0x2e1b2138
  1217DATA K256<>+280(SB)/4, $0x4d2c6dfc
  1218DATA K256<>+284(SB)/4, $0x53380d13
  1219DATA K256<>+288(SB)/4, $0x650a7354
  1220DATA K256<>+292(SB)/4, $0x766a0abb
  1221DATA K256<>+296(SB)/4, $0x81c2c92e
  1222DATA K256<>+300(SB)/4, $0x92722c85
  1223DATA K256<>+304(SB)/4, $0x650a7354
  1224DATA K256<>+308(SB)/4, $0x766a0abb
  1225DATA K256<>+312(SB)/4, $0x81c2c92e
  1226DATA K256<>+316(SB)/4, $0x92722c85
  1227DATA K256<>+320(SB)/4, $0xa2bfe8a1
  1228DATA K256<>+324(SB)/4, $0xa81a664b
  1229DATA K256<>+328(SB)/4, $0xc24b8b70
  1230DATA K256<>+332(SB)/4, $0xc76c51a3
  1231DATA K256<>+336(SB)/4, $0xa2bfe8a1
  1232DATA K256<>+340(SB)/4, $0xa81a664b
  1233DATA K256<>+344(SB)/4, $0xc24b8b70
  1234DATA K256<>+348(SB)/4, $0xc76c51a3
  1235DATA K256<>+352(SB)/4, $0xd192e819
  1236DATA K256<>+356(SB)/4, $0xd6990624
  1237DATA K256<>+360(SB)/4, $0xf40e3585
  1238DATA K256<>+364(SB)/4, $0x106aa070
  1239DATA K256<>+368(SB)/4, $0xd192e819
  1240DATA K256<>+372(SB)/4, $0xd6990624
  1241DATA K256<>+376(SB)/4, $0xf40e3585
  1242DATA K256<>+380(SB)/4, $0x106aa070
  1243DATA K256<>+384(SB)/4, $0x19a4c116
  1244DATA K256<>+388(SB)/4, $0x1e376c08
  1245DATA K256<>+392(SB)/4, $0x2748774c
  1246DATA K256<>+396(SB)/4, $0x34b0bcb5
  1247DATA K256<>+400(SB)/4, $0x19a4c116
  1248DATA K256<>+404(SB)/4, $0x1e376c08
  1249DATA K256<>+408(SB)/4, $0x2748774c
  1250DATA K256<>+412(SB)/4, $0x34b0bcb5
  1251DATA K256<>+416(SB)/4, $0x391c0cb3
  1252DATA K256<>+420(SB)/4, $0x4ed8aa4a
  1253DATA K256<>+424(SB)/4, $0x5b9cca4f
  1254DATA K256<>+428(SB)/4, $0x682e6ff3
  1255DATA K256<>+432(SB)/4, $0x391c0cb3
  1256DATA K256<>+436(SB)/4, $0x4ed8aa4a
  1257DATA K256<>+440(SB)/4, $0x5b9cca4f
  1258DATA K256<>+444(SB)/4, $0x682e6ff3
  1259DATA K256<>+448(SB)/4, $0x748f82ee
  1260DATA K256<>+452(SB)/4, $0x78a5636f
  1261DATA K256<>+456(SB)/4, $0x84c87814
  1262DATA K256<>+460(SB)/4, $0x8cc70208
  1263DATA K256<>+464(SB)/4, $0x748f82ee
  1264DATA K256<>+468(SB)/4, $0x78a5636f
  1265DATA K256<>+472(SB)/4, $0x84c87814
  1266DATA K256<>+476(SB)/4, $0x8cc70208
  1267DATA K256<>+480(SB)/4, $0x90befffa
  1268DATA K256<>+484(SB)/4, $0xa4506ceb
  1269DATA K256<>+488(SB)/4, $0xbef9a3f7
  1270DATA K256<>+492(SB)/4, $0xc67178f2
  1271DATA K256<>+496(SB)/4, $0x90befffa
  1272DATA K256<>+500(SB)/4, $0xa4506ceb
  1273DATA K256<>+504(SB)/4, $0xbef9a3f7
  1274DATA K256<>+508(SB)/4, $0xc67178f2
  1275GLOBL K256<>(SB), RODATA|NOPTR, $512
  1276
  // Byte-shuffle mask: within each 128-bit lane, the low qword selects
  // source bytes 0-3 and 8-11 and the high qword is all 0xff selector
  // bytes, which a (V)PSHUFB turns into zero bytes. Presumably used by
  // the AVX2 message schedule to compact words into the "00BA" layout —
  // use site is outside this chunk; confirm against blockAVX2.
  1277DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
  1278DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
  1279DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
  1280DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
  1281GLOBL shuff_00BA<>(SB), RODATA, $32
  1282
  // Mirror image of shuff_00BA: within each 128-bit lane the low qword
  // is all 0xff selectors (zeroed by (V)PSHUFB) and the high qword
  // selects source bytes 0-3 and 8-11, placing data in the "DC00"
  // position. Presumably the AVX2 schedule's counterpart mask — use
  // site is outside this chunk; confirm against blockAVX2.
  1283DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
  1284DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
  1285DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
  1286DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
  1287GLOBL shuff_DC00<>(SB), RODATA, $32
  1288
  1289// func blockSHANI(dig *Digest, p []byte)
  1290// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
	//
	// SHA-256 block function using the x86 SHA extensions (SHA-NI).
	// Register roles for the whole function:
	//   DI = digest state pointer (8 x uint32, H0..H7)
	//   SI = current position in p; DX = end of the last full block
	//   AX = base of the K256 round-constant table
	//   X1, X2 = working state in the ABEF / CDGH dword layout that
	//            SHA256RNDS2 expects (see shuffle sequence below)
	//   X8 = flip_mask (little- to big-endian byte swap for loads)
	//   X3..X6 = the rolling 16-word message schedule of the block
  1291TEXT ·blockSHANI(SB), $0-32
  1292	MOVQ    dig+0(FP), DI
  1293	MOVQ    p_base+8(FP), SI
  1294	MOVQ    p_len+16(FP), DX
	// Round the length down to a whole number of 64-byte blocks;
	// return immediately if there is not at least one full block.
  1295	SHRQ    $0x06, DX
  1296	SHLQ    $0x06, DX
  1297	CMPQ    DX, $0x00
  1298	JEQ     done
  1299	ADDQ    SI, DX
	// Load H0..H7 and permute the linear a..h order into the
	// X1 = {A,B,E,F}, X2 = {C,D,G,H} register layout required by
	// SHA256RNDS2: $0xb1 swaps adjacent dwords, $0x1b reverses all
	// four, then PALIGNR/PBLENDW interleave the two halves.
  1300	VMOVDQU (DI), X1
  1301	VMOVDQU 16(DI), X2
  1302	PSHUFD  $0xb1, X1, X1
  1303	PSHUFD  $0x1b, X2, X2
  1304	VMOVDQA X1, X7
  1305	PALIGNR $0x08, X2, X1
  1306	PBLENDW $0xf0, X7, X2
  1307	VMOVDQA flip_mask<>+0(SB), X8
	// K256 stores each group of four constants twice (for the
	// 256-bit AVX2 path), hence the 32-byte stride in the PADDD
	// offsets below ((AX), 32(AX), ..., 480(AX)).
  1308	LEAQ    K256<>+0(SB), AX
  1309
  1310roundLoop:
  1311	// save hash values for addition after rounds
  1312	VMOVDQA X1, X9
  1313	VMOVDQA X2, X10
  1314
  1315	// do rounds 0-59
	// Repeating pattern: each four-round group adds the round
	// constants to four message words (PADDD n(AX)) and feeds the
	// result to SHA256RNDS2 twice — low qword first, then PSHUFD
	// $0x0e moves the high W+K pair down for the second issue.
	// SHA256MSG1/SHA256MSG2 together with the PALIGNR $4 / PADDD
	// pair compute the next four schedule words from the previous
	// sixteen, rotating through X3..X6.
  1316	VMOVDQU     (SI), X0
  1317	PSHUFB      X8, X0
  1318	VMOVDQA     X0, X3
  1319	PADDD       (AX), X0
  1320	SHA256RNDS2 X0, X1, X2
  1321	PSHUFD      $0x0e, X0, X0
  1322	SHA256RNDS2 X0, X2, X1
  1323	VMOVDQU     16(SI), X0
  1324	PSHUFB      X8, X0
  1325	VMOVDQA     X0, X4
  1326	PADDD       32(AX), X0
  1327	SHA256RNDS2 X0, X1, X2
  1328	PSHUFD      $0x0e, X0, X0
  1329	SHA256RNDS2 X0, X2, X1
  1330	SHA256MSG1  X4, X3
  1331	VMOVDQU     32(SI), X0
  1332	PSHUFB      X8, X0
  1333	VMOVDQA     X0, X5
  1334	PADDD       64(AX), X0
  1335	SHA256RNDS2 X0, X1, X2
  1336	PSHUFD      $0x0e, X0, X0
  1337	SHA256RNDS2 X0, X2, X1
  1338	SHA256MSG1  X5, X4
  1339	VMOVDQU     48(SI), X0
  1340	PSHUFB      X8, X0
  1341	VMOVDQA     X0, X6
  1342	PADDD       96(AX), X0
  1343	SHA256RNDS2 X0, X1, X2
  1344	VMOVDQA     X6, X7
  1345	PALIGNR     $0x04, X5, X7
  1346	PADDD       X7, X3
  1347	SHA256MSG2  X6, X3
  1348	PSHUFD      $0x0e, X0, X0
  1349	SHA256RNDS2 X0, X2, X1
  1350	SHA256MSG1  X6, X5
  1351	VMOVDQA     X3, X0
  1352	PADDD       128(AX), X0
  1353	SHA256RNDS2 X0, X1, X2
  1354	VMOVDQA     X3, X7
  1355	PALIGNR     $0x04, X6, X7
  1356	PADDD       X7, X4
  1357	SHA256MSG2  X3, X4
  1358	PSHUFD      $0x0e, X0, X0
  1359	SHA256RNDS2 X0, X2, X1
  1360	SHA256MSG1  X3, X6
  1361	VMOVDQA     X4, X0
  1362	PADDD       160(AX), X0
  1363	SHA256RNDS2 X0, X1, X2
  1364	VMOVDQA     X4, X7
  1365	PALIGNR     $0x04, X3, X7
  1366	PADDD       X7, X5
  1367	SHA256MSG2  X4, X5
  1368	PSHUFD      $0x0e, X0, X0
  1369	SHA256RNDS2 X0, X2, X1
  1370	SHA256MSG1  X4, X3
  1371	VMOVDQA     X5, X0
  1372	PADDD       192(AX), X0
  1373	SHA256RNDS2 X0, X1, X2
  1374	VMOVDQA     X5, X7
  1375	PALIGNR     $0x04, X4, X7
  1376	PADDD       X7, X6
  1377	SHA256MSG2  X5, X6
  1378	PSHUFD      $0x0e, X0, X0
  1379	SHA256RNDS2 X0, X2, X1
  1380	SHA256MSG1  X5, X4
  1381	VMOVDQA     X6, X0
  1382	PADDD       224(AX), X0
  1383	SHA256RNDS2 X0, X1, X2
  1384	VMOVDQA     X6, X7
  1385	PALIGNR     $0x04, X5, X7
  1386	PADDD       X7, X3
  1387	SHA256MSG2  X6, X3
  1388	PSHUFD      $0x0e, X0, X0
  1389	SHA256RNDS2 X0, X2, X1
  1390	SHA256MSG1  X6, X5
  1391	VMOVDQA     X3, X0
  1392	PADDD       256(AX), X0
  1393	SHA256RNDS2 X0, X1, X2
  1394	VMOVDQA     X3, X7
  1395	PALIGNR     $0x04, X6, X7
  1396	PADDD       X7, X4
  1397	SHA256MSG2  X3, X4
  1398	PSHUFD      $0x0e, X0, X0
  1399	SHA256RNDS2 X0, X2, X1
  1400	SHA256MSG1  X3, X6
  1401	VMOVDQA     X4, X0
  1402	PADDD       288(AX), X0
  1403	SHA256RNDS2 X0, X1, X2
  1404	VMOVDQA     X4, X7
  1405	PALIGNR     $0x04, X3, X7
  1406	PADDD       X7, X5
  1407	SHA256MSG2  X4, X5
  1408	PSHUFD      $0x0e, X0, X0
  1409	SHA256RNDS2 X0, X2, X1
  1410	SHA256MSG1  X4, X3
  1411	VMOVDQA     X5, X0
  1412	PADDD       320(AX), X0
  1413	SHA256RNDS2 X0, X1, X2
  1414	VMOVDQA     X5, X7
  1415	PALIGNR     $0x04, X4, X7
  1416	PADDD       X7, X6
  1417	SHA256MSG2  X5, X6
  1418	PSHUFD      $0x0e, X0, X0
  1419	SHA256RNDS2 X0, X2, X1
  1420	SHA256MSG1  X5, X4
  1421	VMOVDQA     X6, X0
  1422	PADDD       352(AX), X0
  1423	SHA256RNDS2 X0, X1, X2
  1424	VMOVDQA     X6, X7
  1425	PALIGNR     $0x04, X5, X7
  1426	PADDD       X7, X3
  1427	SHA256MSG2  X6, X3
  1428	PSHUFD      $0x0e, X0, X0
  1429	SHA256RNDS2 X0, X2, X1
  1430	SHA256MSG1  X6, X5
  1431	VMOVDQA     X3, X0
  1432	PADDD       384(AX), X0
  1433	SHA256RNDS2 X0, X1, X2
  1434	VMOVDQA     X3, X7
  1435	PALIGNR     $0x04, X6, X7
  1436	PADDD       X7, X4
  1437	SHA256MSG2  X3, X4
  1438	PSHUFD      $0x0e, X0, X0
  1439	SHA256RNDS2 X0, X2, X1
  1440	SHA256MSG1  X3, X6
  1441	VMOVDQA     X4, X0
  1442	PADDD       416(AX), X0
  1443	SHA256RNDS2 X0, X1, X2
  1444	VMOVDQA     X4, X7
  1445	PALIGNR     $0x04, X3, X7
  1446	PADDD       X7, X5
  1447	SHA256MSG2  X4, X5
  1448	PSHUFD      $0x0e, X0, X0
  1449	SHA256RNDS2 X0, X2, X1
	// No SHA256MSG1 here: the schedule beyond W[63] is not needed.
  1450	VMOVDQA     X5, X0
  1451	PADDD       448(AX), X0
  1452	SHA256RNDS2 X0, X1, X2
  1453	VMOVDQA     X5, X7
  1454	PALIGNR     $0x04, X4, X7
  1455	PADDD       X7, X6
  1456	SHA256MSG2  X5, X6
  1457	PSHUFD      $0x0e, X0, X0
  1458	SHA256RNDS2 X0, X2, X1
  1459
  1460	// do rounds 60-63
  1461	VMOVDQA     X6, X0
  1462	PADDD       480(AX), X0
  1463	SHA256RNDS2 X0, X1, X2
  1464	PSHUFD      $0x0e, X0, X0
  1465	SHA256RNDS2 X0, X2, X1
  1466
  1467	// add current hash values with previously saved
  1468	PADDD X9, X1
  1469	PADDD X10, X2
  1470
  1471	// advance data pointer; loop until buffer empty
  1472	ADDQ $0x40, SI
  1473	CMPQ DX, SI
  1474	JNE  roundLoop
  1475
  1476	// write hash values back in the correct order
	// Exact inverse of the entry shuffle: undo the ABEF/CDGH layout
	// back to linear H0..H7 before storing to the digest.
  1477	PSHUFD  $0x1b, X1, X1
  1478	PSHUFD  $0xb1, X2, X2
  1479	VMOVDQA X1, X7
  1480	PBLENDW $0xf0, X2, X1
  1481	PALIGNR $0x08, X7, X2
  1482	VMOVDQU X1, (DI)
  1483	VMOVDQU X2, 16(DI)
  1484
  1485done:
  1486	RET

View as plain text