scan_amd64.s

Documentation: internal/runtime/gc/scan

     1// Copyright 2025 The Go Authors. All rights reserved.
     2// Use of this source code is governed by a BSD-style
     3// license that can be found in the LICENSE file.
     4
     5#include "go_asm.h"
     6#include "textflag.h"
     7
     8// Test-only.
      //
      // ExpandAVX512 calls the entry of gcExpandersAVX512 selected by sizeClass
      // and stores the contents of Z1 and Z2 after that call (128 bytes in total)
      // to the memory at unpacked.
      //
      // Arguments, per the FP offsets used below:
      //   sizeClass+0(FP)  index into gcExpandersAVX512 (entries are 8 bytes apart)
      //   packed+8(FP)     loaded into AX before the call
      //   unpacked+16(FP)  destination for the 128 bytes of output
      //
      // NOTE(review): the expander itself is not visible here; presumably it
      // reads its input through AX and returns its result in Z1+Z2 -- confirm
      // against the gcExpandersAVX512 implementations.
     9TEXT ·ExpandAVX512(SB), NOSPLIT, $0-24
    10	MOVQ sizeClass+0(FP), CX
    11	MOVQ packed+8(FP), AX
    12
    13	// Call the expander for this size class
    14	LEAQ ·gcExpandersAVX512(SB), BX
    15	CALL (BX)(CX*8)
    16
      	// Write Z1 to the first 64 bytes of the output and Z2 to the next 64.
    17	MOVQ unpacked+16(FP), DI // Expanded output bitmap pointer
    18	VMOVDQU64 Z1, 0(DI)
    19	VMOVDQU64 Z2, 64(DI)
    20	VZEROUPPER
    21	RET
    22
    23TEXT ·scanSpanPackedAVX512(SB), NOSPLIT, $256-44
      	// Arguments, per the FP offsets used below:
      	//   mem+0(FP)        address of the first 64 byte frame to scan
      	//   bufp+8(FP)       base of the scan buffer that receives the selected words
      	//   objMarks+16(FP)  loaded into AX before calling the expander
      	//   sizeClass+24(FP) index into gcExpandersAVX512 (entries are 8 bytes apart)
      	//   ptrMask+32(FP)   address of the 128 byte pointer mask
      	//   count+40(FP)     result: number of words written to the scan buffer,
      	//                    stored as 32 bits
      	//
      	// The 256 byte stack frame holds the 128 byte scan mask at 0(SP) and the
      	// 128 per-frame word counts at 128(SP).
      	//
      	// NOTE(review): the expander is not visible here; presumably it reads its
      	// input through AX and returns the expanded mask in Z1+Z2 -- confirm
      	// against the gcExpandersAVX512 implementations.
      	//
    24	// Z1+Z2 = Expand the grey object mask into a grey word mask
    25	MOVQ objMarks+16(FP), AX
    26	MOVQ sizeClass+24(FP), CX
    27	LEAQ ·gcExpandersAVX512(SB), BX
    28	CALL (BX)(CX*8)
    29
    30	// Z3+Z4 = Load the pointer mask
    31	MOVQ ptrMask+32(FP), AX
    32	VMOVDQU64 0(AX), Z3
    33	VMOVDQU64 64(AX), Z4
    34
    35	// Z1+Z2 = Combine the grey word mask with the pointer mask to get the scan mask
    36	VPANDQ Z1, Z3, Z1
    37	VPANDQ Z2, Z4, Z2
    38
    39	// Now each bit of Z1+Z2 represents one word of the span.
    40	// Thus, each byte covers 64 bytes of memory, which is also how
    41	// much we can fit in a Z register.
    42	//
    43	// We do a load/compress for each 64 byte frame.
    44	//
    45	// Z3+Z4 [128]uint8 = Number of memory words to scan in each 64 byte frame
    46	VPOPCNTB Z1, Z3 // Requires BITALG
    47	VPOPCNTB Z2, Z4
    48
    49	// Store the scan mask and word counts at 0(SP) and 128(SP).
    50	//
    51	// TODO: Is it better to read directly from the registers?
    52	VMOVDQU64 Z1, 0(SP)
    53	VMOVDQU64 Z2, 64(SP)
    54	VMOVDQU64 Z3, 128(SP)
    55	VMOVDQU64 Z4, 192(SP)
    56
    57	// SI = Current address in span
    58	MOVQ mem+0(FP), SI
    59	// DI = Scan buffer base
    60	MOVQ bufp+8(FP), DI
    61	// DX = Index in scan buffer, (DI)(DX*8) = Current position in scan buffer
    62	MOVQ $0, DX
    63
    64	// AX = address in scan mask, 128(AX) = address in popcount
    65	LEAQ 0(SP), AX
    66
    67	// Loop over the 64 byte frames in this span.
    68	// BX = 1 past the end of the scan mask
      	//
      	// AX advances one mask byte per iteration from 0(SP) up to 128(SP) while
      	// SI advances 64 bytes, so the loop visits 128 frames, i.e. 8 KiB
      	// starting at mem.
    69	LEAQ 128(SP), BX
    70
    71	// Align loop to a cache line so that performance is less sensitive
    72	// to how this function ends up laid out in memory. This is a hot
    73	// function in the GC, and this is a tight loop. We don't want
    74	// performance to waver wildly due to unrelated changes.
    75	PCALIGN $64
    76loop:
    77	// CX = Fetch the mask of words to load from this frame.
    78	MOVBQZX 0(AX), CX
    79	// Skip empty frames.
    80	TESTQ CX, CX
    81	JZ skip
    82
    83	// Load the 64 byte frame.
      	//
      	// K1 = the 8 bit word mask for this frame.
      	//
      	// VMOVDQA64 is the aligned load form.
      	// NOTE(review): this relies on mem (and hence every frame address in
      	// SI) being 64 byte aligned -- confirm the caller guarantees it.
    84	KMOVB CX, K1
    85	VMOVDQA64 0(SI), Z1
    86
    87	// Collect just the pointers from the greyed objects into the scan buffer,
    88	// i.e., copy the word indices in the mask from Z1 into contiguous memory.
    89	//
    90	// N.B. VPCOMPRESSQ supports a memory destination. Unfortunately, on
    91	// AMD Genoa / Zen 4, using VPCOMPRESSQ with a memory destination
    92	// imposes a severe performance penalty of around an order of magnitude
    93	// compared to a register destination.
    94	//
    95	// This workaround is unfortunate on other microarchitectures, where a
    96	// memory destination is slightly faster than adding an additional move
    97	// instruction, but nowhere near an order of magnitude. It would be
    98	// nice to have a Genoa-only variant here.
    99	//
   100	// AMD Turin / Zen 5 fixes this issue.
   101	//
   102	// See
   103	// https://lemire.me/blog/2025/02/14/avx-512-gotcha-avoid-compressing-words-to-memory-with-amd-zen-4-processors/.
      	//
      	// The store below always writes a full 64 bytes, even when fewer than
      	// 8 words were selected. DX is then advanced only by this frame's word
      	// count, so the next store begins immediately after the selected words.
      	// NOTE(review): presumably the scan buffer has room for a full 64 byte
      	// store at its final position -- confirm with the caller.
   104	VPCOMPRESSQ Z1, K1, Z2
   105	VMOVDQU64 Z2, (DI)(DX*8)
   106
   107	// Advance the scan buffer position by the number of pointers.
      	// 128(AX) is this frame's entry in the word counts stored at 128(SP).
   108	MOVBQZX 128(AX), CX
   109	ADDQ CX, DX
   110
   111skip:
      	// Move to the next 64 byte frame and the next scan mask byte; continue
      	// while AX is still below BX (unsigned compare).
   112	ADDQ $64, SI
   113	ADDQ $1, AX
   114	CMPQ AX, BX
   115	JB loop
   116
      	// No jump in this function targets the end label; it is reached by
      	// falling out of the loop.
   117end:
      	// Return the low 32 bits of the scan buffer index as count.
   118	MOVL DX, count+40(FP)
   119	VZEROUPPER
   120	RET
View as plain text