...
Run Format

Text file src/runtime/memclr_loong64.s

Documentation: runtime

     1// Copyright 2022 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8// Register map
     9//
    10// R4: ptr
    11// R5: n
    12// R6: ptrend
    13// R7: tmp
    14
    15// Algorithm:
    16//
    17// 1. if lasx is enabled:
    18//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
    19//    else if lsx is enabled:
    20//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
    21//    else
    22//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
    23//
    24// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
    25// The handling is divided into distinct cases based on the size of count:
    26//   a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
    27//      clr_9through16, clr_17through32, clr_33through64,
    28//   b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
    29//   c. lasx_clr_17through32, lasx_clr_33through64,
    30//      lasx_clr_65through128, lasx_clr_129through256
    31//
    32// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
    33// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
    34// a LOOPBLOCKS-byte loop is executed to zero out memory.
    35// When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
    36// processing is performed, invoking the corresponding case based on the size of n.
    37//
    38// example:
    39//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
    40//
    41//    ptr           newptr                           ptrend
    42//     |               |<----count after correction---->|
    43//     |<-------------count before correction---------->|
    44//     |<--8-(ptr&7)-->|               |<---64 bytes--->|
    45//     +------------------------------------------------+
    46//     |   Head        |      Body     |      Tail      |
    47//     +---------------+---------------+----------------+
    48//    newptr = ptr - (ptr & 7) + 8
    49//    count = count - 8 + (ptr & 7)
    50
    51// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
    52TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
    53	BEQ	R5, clr_0
    54	ADDV	R4, R5, R6	// R6 = ptrend = ptr + n
    55tail:
    56	// n <= 64 bytes: clear directly, no alignment check needed.
    56	// Dispatch pattern: SGTU $k, R5, R7 sets R7 = 1 iff n < k (unsigned).
    57	SGTU	$2, R5, R7
    58	BNE	R7, clr_1
    59	SGTU	$3, R5, R7
    60	BNE	R7, clr_2
    61	SGTU	$4, R5, R7
    62	BNE	R7, clr_3
    63	SGTU	$5, R5, R7
    64	BNE	R7, clr_4
    65	SGTU	$8, R5, R7
    66	BNE	R7, clr_5through7
    67	SGTU	$9, R5, R7
    68	BNE	R7, clr_8
    69	SGTU	$17, R5, R7
    70	BNE	R7, clr_9through16
    71
    72	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7	// LASX supported?
    73	BNE	R7, lasx_tail
    74	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7	// LSX supported?
    75	BNE	R7, lsx_tail
    76
    77	SGTU	$33, R5, R7
    78	BNE	R7, clr_17through32
    79	SGTU	$65, R5, R7
    80	BNE	R7, clr_33through64
    81	JMP	clr_large
    82
    83lasx_tail:
    84	// X0 = 0 (the 128-bit V0 view of X0 is zeroed as well)
    85	XVXORV	X0, X0, X0
    86
    87	SGTU	$33, R5, R7
    88	BNE	R7, lasx_clr_17through32
    89	SGTU	$65, R5, R7
    90	BNE	R7, lasx_clr_33through64
    91	SGTU	$129, R5, R7
    92	BNE	R7, lasx_clr_65through128
    93	SGTU	$257, R5, R7
    94	BNE	R7, lasx_clr_129through256
    95	JMP	lasx_clr_large
    96
    97lsx_tail:
    98	// V0 = 0
    99	VXORV	V0, V0, V0
   100
   101	SGTU	$33, R5, R7
   102	BNE	R7, lsx_clr_17through32
   103	SGTU	$65, R5, R7
   104	BNE	R7, lsx_clr_33through64
   105	SGTU	$129, R5, R7
   106	BNE	R7, lsx_clr_65through128
   107	JMP	lsx_clr_large
   108
   109	// use simd 256 instructions to implement memclr
   110	// n > 256 bytes, check 32-byte alignment
   111lasx_clr_large:
   112	AND	$31, R4, R7
   113	BEQ	R7, lasx_clr_256loop
   114	XVMOVQ	X0, (R4)	// clear 32 bytes at the unaligned start (overlaps the aligned body)
   115	SUBV	R7, R4	// align ptr down to a 32-byte boundary
   116	ADDV	R7, R5	// n += ptr & 31, compensating for the align-down
   117	SUBV	$32, R5 // newn = n - (32 - (ptr & 31))
   118	ADDV	$32, R4 // newptr = ptr + (32 - (ptr & 31))
   119	SGTU	$257, R5, R7
   120	BNE	R7, lasx_clr_129through256
   121lasx_clr_256loop:
   122	SUBV	$256, R5	// n -= 256
   123	SGTU	$256, R5, R7	// R7 = 1 iff fewer than 256 bytes remain after this block
   124	XVMOVQ	X0, 0(R4)
   125	XVMOVQ	X0, 32(R4)
   126	XVMOVQ	X0, 64(R4)
   127	XVMOVQ	X0, 96(R4)
   128	XVMOVQ	X0, 128(R4)
   129	XVMOVQ	X0, 160(R4)
   130	XVMOVQ	X0, 192(R4)
   131	XVMOVQ	X0, 224(R4)
   132	ADDV	$256, R4
   133	BEQ	R7, lasx_clr_256loop
   134
   135	// remaining_length is 0
   136	BEQ	R5, clr_0
   137
   138	// 128 < remaining_length < 256
   139	SGTU	$129, R5, R7
   140	BEQ	R7, lasx_clr_129through256
   141
   142	// 64 < remaining_length <= 128
   143	SGTU	$65, R5, R7
   144	BEQ	R7, lasx_clr_65through128
   145
   146	// 32 < remaining_length <= 64
   147	SGTU	$33, R5, R7
   148	BEQ	R7, lasx_clr_33through64
   149
   150	// 16 < remaining_length <= 32
   151	SGTU	$17, R5, R7
   152	BEQ	R7, lasx_clr_17through32
   153
   154	// 0 < remaining_length <= 16
   155	JMP	tail
   156
   157	// use simd 128 instructions to implement memclr
   158	// n > 128 bytes, check 16-byte alignment
   159lsx_clr_large:
   160	// check 16-byte alignment
   161	AND	$15, R4, R7
   162	BEQ	R7, lsx_clr_128loop
   163	VMOVQ	V0, (R4)	// clear 16 bytes at the unaligned start (overlaps the aligned body)
   164	SUBV	R7, R4	// align ptr down to a 16-byte boundary
   165	ADDV	R7, R5	// n += ptr & 15, compensating for the align-down
   166	SUBV	$16, R5 // newn = n - (16 - (ptr & 15))
   167	ADDV	$16, R4 // newptr = ptr + (16 - (ptr & 15))
   168	SGTU	$129, R5, R7
   169	BNE	R7, lsx_clr_65through128
   170lsx_clr_128loop:
   171	SUBV	$128, R5	// n -= 128
   172	SGTU	$128, R5, R7	// R7 = 1 iff fewer than 128 bytes remain after this block
   173	VMOVQ	V0, 0(R4)
   174	VMOVQ	V0, 16(R4)
   175	VMOVQ	V0, 32(R4)
   176	VMOVQ	V0, 48(R4)
   177	VMOVQ	V0, 64(R4)
   178	VMOVQ	V0, 80(R4)
   179	VMOVQ	V0, 96(R4)
   180	VMOVQ	V0, 112(R4)
   181	ADDV	$128, R4
   182	BEQ	R7, lsx_clr_128loop
   183
   184	// remaining_length is 0
   185	BEQ	R5, clr_0
   186
   187	// 64 < remaining_length <= 128
   188	SGTU	$65, R5, R7
   189	BEQ	R7, lsx_clr_65through128
   190
   191	// 32 < remaining_length <= 64
   192	SGTU	$33, R5, R7
   193	BEQ	R7, lsx_clr_33through64
   194
   195	// 16 < remaining_length <= 32
   196	SGTU	$17, R5, R7
   197	BEQ	R7, lsx_clr_17through32
   198
   199	// 0 < remaining_length <= 16
   200	JMP	tail
   201
   202	// use general instructions to implement memclr
   203	// n > 64 bytes, check 8-byte alignment
   204clr_large:
   205	AND	$7, R4, R7
   206	BEQ	R7, clr_64loop
   207	MOVV	R0, (R4)	// clear 8 bytes at the unaligned start (overlaps the aligned body)
   208	SUBV	R7, R4	// align ptr down to an 8-byte boundary
   209	ADDV	R7, R5	// n += ptr & 7, compensating for the align-down
   210	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
   211	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
   212	MOVV	$64, R7
   213	BLT	R5, R7, clr_33through64
   214clr_64loop:
   215	SUBV	$64, R5	// n -= 64
   216	SGTU    $64, R5, R7	// R7 = 1 iff fewer than 64 bytes remain after this block
   217	MOVV	R0, (R4)
   218	MOVV	R0, 8(R4)
   219	MOVV	R0, 16(R4)
   220	MOVV	R0, 24(R4)
   221	MOVV	R0, 32(R4)
   222	MOVV	R0, 40(R4)
   223	MOVV	R0, 48(R4)
   224	MOVV	R0, 56(R4)
   225	ADDV	$64, R4
   226	BEQ     R7, clr_64loop
   227
   228	// remaining_length is 0
   229	BEQ	R5, clr_0
   230
   231	// 32 < remaining_length < 64
   232	SGTU	$33, R5, R7
   233	BEQ	R7, clr_33through64
   234
   235	// 16 < remaining_length <= 32
   236	SGTU	$17, R5, R7
   237	BEQ	R7, clr_17through32
   238
   239	// 0 < remaining_length <= 16
   240	JMP	tail
   241
   242clr_0:
   243	RET
   244clr_1:
   245	MOVB	R0, (R4)
   246	RET
   247clr_2:
   248	MOVH	R0, (R4)
   249	RET
   250clr_3:
   251	MOVH	R0, (R4)
   252	MOVB	R0, 2(R4)
   253	RET
   254clr_4:
   255	MOVW	R0, (R4)
   256	RET
   257clr_5through7:
   258	MOVW	R0, (R4)	// overlapping head/tail stores cover any 5..7-byte span
   259	MOVW	R0, -4(R6)
   260	RET
   261clr_8:
   262	MOVV	R0, (R4)
   263	RET
   264clr_9through16:
   265	MOVV	R0, (R4)	// overlapping head/tail stores cover any 9..16-byte span
   266	MOVV	R0, -8(R6)
   267	RET
   268clr_17through32:
   269	MOVV	R0, (R4)
   270	MOVV	R0, 8(R4)
   271	MOVV	R0, -16(R6)
   272	MOVV	R0, -8(R6)
   273	RET
   274clr_33through64:
   275	MOVV	R0, (R4)
   276	MOVV	R0, 8(R4)
   277	MOVV	R0, 16(R4)
   278	MOVV	R0, 24(R4)
   279	MOVV	R0, -32(R6)
   280	MOVV	R0, -24(R6)
   281	MOVV	R0, -16(R6)
   282	MOVV	R0, -8(R6)
   283	RET
   284
   285lasx_clr_17through32:
   286	// V0 is the low 128 bits of X0, zeroed by XVXORV above (LASX spec:
   286	// VR registers alias the low half of the XR registers).
   286	VMOVQ	V0, 0(R4)
   287	VMOVQ	V0, -16(R6)
   288	RET
   289lasx_clr_33through64:
   290	XVMOVQ	X0, 0(R4)
   291	XVMOVQ	X0, -32(R6)
   292	RET
   293lasx_clr_65through128:
   294	XVMOVQ	X0, 0(R4)
   295	XVMOVQ	X0, 32(R4)
   296	XVMOVQ	X0, -64(R6)
   297	XVMOVQ	X0, -32(R6)
   298	RET
   299lasx_clr_129through256:
   300	XVMOVQ	X0, 0(R4)
   301	XVMOVQ	X0, 32(R4)
   302	XVMOVQ	X0, 64(R4)
   303	XVMOVQ	X0, 96(R4)
   304	XVMOVQ	X0, -128(R6)
   305	XVMOVQ	X0, -96(R6)
   306	XVMOVQ	X0, -64(R6)
   307	XVMOVQ	X0, -32(R6)
   308	RET
   309
   310lsx_clr_17through32:
   311	VMOVQ	V0, 0(R4)
   312	VMOVQ	V0, -16(R6)
   313	RET
   314lsx_clr_33through64:
   315	VMOVQ	V0, 0(R4)
   316	VMOVQ	V0, 16(R4)
   317	VMOVQ	V0, -32(R6)
   318	VMOVQ	V0, -16(R6)
   319	RET
   320lsx_clr_65through128:
   321	VMOVQ	V0, 0(R4)
   322	VMOVQ	V0, 16(R4)
   323	VMOVQ	V0, 32(R4)
   324	VMOVQ	V0, 48(R4)
   325	VMOVQ	V0, -64(R6)
   326	VMOVQ	V0, -48(R6)
   327	VMOVQ	V0, -32(R6)
   328	VMOVQ	V0, -16(R6)
   329	RET

View as plain text