The December 2017 Optimization Reference Manual has two sections describing how to use the new AVX-512 conflict detection instructions for histogram calculation.
There are numerous issues with Section 15.16.1 and its Example 15-18. In particular, the code in the conflict_loop segment does not work as described.
; Example 15-18 as quoted from the manual, reproduced verbatim. The comments
; describe what each step does as written; NOTE(review) marks the places that
; look wrong.
;
; Constant vectors: zmm4 = all_1, zmm5 = all_negative_1, zmm6 = all_31,
; zmm7 = all_bins_minus_1.
vmovaps zmm4, all_1 // {1, 1, …, 1}
vmovaps zmm5, all_negative_1
vmovaps zmm6, all_31
vmovaps zmm7, all_bins_minus_1
; ebx = number of inputs, r10 = input pointer, r15 = histogram base.
; NOTE(review): rcx is used as the element index below but is never
; initialised anywhere in the listing.
mov ebx, num_inputs
mov r10, pInput
mov r15, pHistogram
histogram_loop:
; zmm3 = 16 inputs ANDed with all_bins_minus_1; these are the bin indices.
vpandd zmm3, [r10+rcx*4], zmm7
; zmm0[i] = bit mask of the lower-numbered lanes whose bin index equals zmm3[i].
vpconflictd zmm0, zmm3
; k1 = all ones, so the gather below reads every lane.
kxnorw k1, k1, k1
; zmm2 = per-lane increment, starting at 1.
vmovaps zmm2, zmm4
; zmm1 = current counts histogram[zmm3[i]], gathered into a zeroed register.
vpxord zmm1, zmm1, zmm1
vpgatherdd zmm1{k1}, [r15+zmm3*4]
; k1 = lanes with a non-zero conflict mask; go straight to update if none.
vptestmd k1, zmm0, zmm0
kortestw k1, k1
je update
; zmm0 = 31 - lzcnt(conflict mask): the lane number of the nearest earlier
; duplicate, or -1 (31 - 32) for lanes that have no earlier duplicate.
vplzcntd zmm0, zmm0
vpsubd zmm0, zmm6, zmm0
conflict_loop:
; NOTE(review): in Intel operand order vpermd is (dest, index, source), so as
; written this permutes zmm0 using zmm2 as the indices. The surrounding steps
; only make sense the other way round (fetch the predecessor's increment from
; zmm2 using the lane numbers in zmm0) -- presumably the operands are swapped.
vpermd zmm8{k1}{z}, zmm2, zmm0
; Replace each predecessor lane number with that predecessor's predecessor.
vpermd zmm0{k1}, zmm0, zmm0
; Add the fetched increment to the lanes that are still unresolved.
vpaddd zmm2{k1}, zmm2, zmm8
; NOTE(review): non-standard operand order; presumably predicate 4 (not-equal)
; comparing zmm0 with -1, so lanes whose predecessor became -1 leave k1.
vpcmpd k1, 4, zmm5, zmm0
kortestw k1, k1
jne conflict_loop
update:
; zmm0 = gathered counts + per-lane increments; scatter them back with an
; all-ones mask and advance the index by 16 elements.
vpaddd zmm0, zmm2, zmm1
kxnorw k1, k1, k1
; NOTE(review): addq/cmpl are AT&T-style suffixed mnemonics in an otherwise
; Intel-syntax listing.
addq rcx, 16
vpscatterdd [r15+zmm3*4]{k1}, zmm0
; NOTE(review): cmp ebx, ecx followed by jb branches when num_inputs < index
; (unsigned), which looks inverted for a "more input remains" test.
cmpl ebx, ecx
jb histogram_loop
Section 17.2.3 has another example that uses the AVX-512 CD instructions. This one is also incorrect, but less so. The two consecutive `vpsubd zmm1, zmm1, zmm5` instructions are a mistake -- only one `vpsubd zmm1, zmm1, zmm5` should be necessary. Additionally, no indication is given of the values stored at the following addresses: [rip+0x185c], [rip+0x1884] and [rip+0x18ba]. The loop termination logic in Resolve_conflicts seems redundant, too.
; Listing from section 17.2.3, reproduced verbatim. The initial values of k1
; and zmm0, and the contents of the three rip-relative constants, are not
; shown anywhere in the listing.
Top:
; zmm4 = next 16 dword bin indices, read from the stack buffer indexed by rdx.
vmovups zmm4, [rsp+rdx*4+0x40]
; zmm1 = current counts for those bins, gathered into a zeroed register using
; a copy of k1 as the mask.
vpxord zmm1, zmm1, zmm1
kmovw k2, k1
; zmm2[i] = bit mask of the lower-numbered lanes holding the same index.
vpconflictd zmm2, zmm4
vpgatherdd zmm1{k2}, [rax+zmm4*4]
; ecx = lanes whose conflict mask ANDed with the constant at [rip+0x185c] is
; non-zero (constant not shown).
vptestmd k0, zmm2, [rip+0x185c]
kmovw ecx, k0
; zmm3 = gathered counts + zmm0 (zmm0 is presumably the per-lane increment).
vpaddd zmm3, zmm1, zmm0
test ecx, ecx
jz <No_conflicts>
; Conflict path: zmm1 = constant at [rip+0x1884] (not shown).
vmovups zmm1, [rip+0x1884]
; ecx is recomputed against a second constant that is not shown either.
vptestmd k0, zmm2, [rip+0x18ba]
; zmm5 = leading-zero count of each conflict mask.
vplzcntd zmm5, zmm2
; bl = iteration counter for the resolve loop.
xor bl, bl
kmovw ecx, k0
; As written, zmm1 = constant - 2 * lzcnt because the subtraction appears twice.
vpsubd zmm1, zmm1, zmm5
vpsubd zmm1, zmm1, zmm5
Resolve_conflicts:
; zmm5 = mask of still-unresolved lanes broadcast to every lane; k2 = the
; same mask as a write mask.
vpbroadcastd zmm5, ecx
kmovw k2, ecx
; For the unresolved lanes: zmm3[i] = zmm3[zmm1[i]], then add zmm0.
vpermd zmm3{k2}, zmm1, zmm3
vpaddd zmm3{k2}, zmm3, zmm0
; A lane stays unresolved only while its conflict mask still intersects the
; set of unresolved lanes.
vptestmd k0{k2}, zmm5, zmm2
kmovw esi, k0
and ecx, esi
jz <No_conflicts>
; At most 16 passes.
add bl, 0x1
cmp bl, 0x10
jb <Resolve_conflicts>
No_conflicts:
; Scatter the updated counts back and advance by 16 elements; the loop runs
; while edx < 0x400.
kmovw k2, k1
vpscatterdd [rax+zmm4*4]{k2}, zmm3
add edx, 0x10
cmp edx, 0x400
jb <Top>
I've managed to massage the second example into something that appears to work, as follows:
include ksamd64.inc
;
; Comparison predicate values. 4 is the immediate passed to vpcmpd in the
; first quoted listing; presumably 0 is the matching "equal" predicate.
; Neither symbol is referenced by the code shown in this post.
;
OP_EQ equ 0
OP_NEQ equ 4
;
; Alignment, in bytes, for data accessed through ZMM (64), YMM (32) and
; XMM (16) registers. Only ZMM_ALIGN is used by the data segment below.
;
ZMM_ALIGN equ 64
YMM_ALIGN equ 32
XMM_ALIGN equ 16
_DATA$00 SEGMENT PAGE 'DATA'

;
; Broadcast constants: 16 identical dwords each, aligned so they can be read
; with a single aligned 64-byte load.
;

        align   ZMM_ALIGN
        public  AllOnes
AllOnes dd 16 dup (1)

        align   ZMM_ALIGN
        public  AllNegativeOnes
AllNegativeOnes dd 16 dup (-1)

;
; NOTE(review): the name suggests "number of bins - 1" used as an AND mask;
; for 256 bins that would be 255, not 254 -- confirm the intended value.
;

        align   ZMM_ALIGN
        public  AllBinsMinusOne
AllBinsMinusOne dd 16 dup (254)

        align   ZMM_ALIGN
        public  AllThirtyOne
AllThirtyOne dd 16 dup (31)

;
; One 16-lane sample vector of bin indices plus the values expected for it:
;
;   Conflict1544 - bit mask of the lower-numbered lanes holding the same
;                  index (lane 15 = 8198 = bits 1, 2 and 13).
;   Permute1544  - 31 - lzcnt(Conflict1544): lane number of the nearest
;                  earlier duplicate, or -1 when there is none.
;   Counts1544   - running occurrence count of each lane's index.
;
; Each table is 64 bytes, so all four start on a 64-byte boundary.
;

        align   ZMM_ALIGN
Input1544    dd 5, 3, 3, 1, 8, 2, 50, 1, 0, 7, 6, 4, 9, 3, 10, 3
Permute1544  dd -1, -1, 1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, 2, -1, 13
Conflict1544 dd 0, 0, 2, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 6, 0, 8198
Counts1544   dd 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 4

        public  Input1544
        public  Permute1544
        public  Conflict1544
        public  Counts1544

;
; Two back-to-back copies of the sample vector (32 dwords); this is the input
; Histo1710 walks. It is read with vmovntdqa, which requires 64-byte
; alignment, so align it explicitly instead of relying on the sizes of the
; tables above. The directive emits no padding with the current layout.
;

        align   ZMM_ALIGN
Input1544v2  dd 5, 3, 3, 1, 8, 2, 50, 1, 0, 7, 6, 4, 9, 3, 10, 3
             dd 5, 3, 3, 1, 8, 2, 50, 1, 0, 7, 6, 4, 9, 3, 10, 3

_DATA$00 ends
;
; Histo1710
;
;   Working adaptation of the section 17.2.3 histogram listing. Walks the 32
;   dword bin indices in Input1544v2, 16 at a time, and adds the number of
;   occurrences of each index to a table of 32-bit counters.
;
; Arguments:
;
;   rcx - Homed but otherwise unused.
;   rdx - Base address of the 32-bit counter table that is gathered from and
;         scattered to.
;   r8  - Homed but otherwise unused.
;   r9  - Homed but otherwise unused.
;
; Return value:
;
;   rax - Always 1.
;
; Register use inside the loop:
;
;   rax   - Counter table base.
;   r10   - Address of the next 16 inputs.
;   edx   - Number of inputs consumed so far.
;   ecx   - Mask of lanes whose count is still being corrected.
;   bl    - Pass counter for Resolve_conflicts.
;   esi   - Scratch copy of k0.
;   k1    - All ones (gather/scatter mask template); k2 is the working copy.
;   zmm0  - Per-lane increment (all ones).
;   zmm2  - vpconflictd result for the current inputs.
;   zmm3  - Counts to scatter back.
;   zmm4  - Current 16 bin indices.
;   zmm28 - AllOnes, zmm29 - AllNegativeOnes, zmm30 - AllBinsMinusOne (loaded
;           but not used), zmm31 - AllThirtyOne.
;

        NESTED_ENTRY Histo1710, _TEXT$00

;
; Begin prologue.  Allocate stack space and save non-volatile registers.
;

        alloc_stack LOCALS_SIZE

        save_reg    rbp, Locals.SavedRbp        ; Save non-volatile rbp.
        save_reg    rbx, Locals.SavedRbx        ; Save non-volatile rbx.
        save_reg    rdi, Locals.SavedRdi        ; Save non-volatile rdi.
        save_reg    rsi, Locals.SavedRsi        ; Save non-volatile rsi.
        save_reg    r12, Locals.SavedR12        ; Save non-volatile r12.
        save_reg    r13, Locals.SavedR13        ; Save non-volatile r13.
        save_reg    r14, Locals.SavedR14        ; Save non-volatile r14.
        save_reg    r15, Locals.SavedR15        ; Save non-volatile r15.

        save_xmm128 xmm6, Locals.SavedXmm6      ; Save non-volatile xmm6.
        save_xmm128 xmm7, Locals.SavedXmm7      ; Save non-volatile xmm7.
        save_xmm128 xmm8, Locals.SavedXmm8      ; Save non-volatile xmm8.
        save_xmm128 xmm9, Locals.SavedXmm9      ; Save non-volatile xmm9.
        save_xmm128 xmm10, Locals.SavedXmm10    ; Save non-volatile xmm10.
        save_xmm128 xmm11, Locals.SavedXmm11    ; Save non-volatile xmm11.
        save_xmm128 xmm12, Locals.SavedXmm12    ; Save non-volatile xmm12.
        save_xmm128 xmm13, Locals.SavedXmm13    ; Save non-volatile xmm13.
        save_xmm128 xmm14, Locals.SavedXmm14    ; Save non-volatile xmm14.
        save_xmm128 xmm15, Locals.SavedXmm15    ; Save non-volatile xmm15.

        END_PROLOGUE

        mov     Locals.HomeRcx[rsp], rcx        ; Home rcx.
        mov     Locals.HomeRdx[rsp], rdx        ; Home rdx.
        mov     Locals.HomeR8[rsp], r8          ; Home r8.
        mov     Locals.HomeR9[rsp], r9          ; Home r9.

;
; Load the constant vectors.
;

        vmovntdqa zmm28, zmmword ptr [AllOnes]
        vmovntdqa zmm29, zmmword ptr [AllNegativeOnes]
        vmovntdqa zmm30, zmmword ptr [AllBinsMinusOne]
        vmovntdqa zmm31, zmmword ptr [AllThirtyOne]

        mov     rax, rdx                        ; rax = counter table base.
        xor     edx, edx                        ; edx = inputs consumed = 0.
        lea     r10, Input1544v2                ; r10 = first input vector.

;
; The original listing never shows how k1 and zmm0 are set up.  k1 is copied
; into the gather/scatter mask, so assume all ones; zmm0 is added to every
; gathered count, so assume a vector of ones.  Neither is modified inside the
; loop (the gather and scatter consume the k2 copy), so set them up once.
;

        kxnorw  k1, k1, k1
        vmovaps zmm0, zmm28

Top:

;
; Original: vmovups zmm4, [rsp+rdx*4+0x40].  Read the next 16 indices from
; the static input instead.
;

        vmovntdqa zmm4, zmmword ptr [r10]
        add     r10, 40h

        vpxord  zmm1, zmm1, zmm1                ; Zero the gather destination.
        kmovw   k2, k1                          ; Gather clears its mask.
        vpconflictd zmm2, zmm4                  ; Earlier-duplicate bit masks.
        vpgatherdd zmm1{k2}, [rax+zmm4*4]       ; zmm1 = current counts.

;
; Original: vptestmd k0, zmm2, [rip+0x185c].  The constant is not shown;
; testing against all -1 makes k0 the set of lanes that have a conflict.
;

        vptestmd k0, zmm2, zmm29
        kmovw   ecx, k0

        vpaddd  zmm3, zmm1, zmm0                ; Every lane counts itself.
        test    ecx, ecx
        jz      No_conflicts

;
; Original: vmovups zmm1, [rip+0x1884] followed by two identical
; vpsubd zmm1, zmm1, zmm5.  With 31 as the constant and a single subtract,
; zmm1 = 31 - lzcnt(conflict), the lane number of the nearest earlier
; duplicate.  zmm1 only feeds the vpermd that is disabled below, so this
; computation is currently unused; it is kept so the vpermd can be re-enabled.
;
; Original: vptestmd k0, zmm2, [rip+0x18ba].  With the same all -1 guess this
; recomputes the k0/ecx values already obtained above.
;

        vmovaps zmm1, zmm31
        vptestmd k0, zmm2, zmm29
        vplzcntd zmm5, zmm2
        xor     bl, bl                          ; Pass counter = 0.
        kmovw   ecx, k0
        vpsubd  zmm1, zmm1, zmm5

;
; Each pass adds one to every lane still in ecx, then keeps only the lanes
; whose conflict mask still intersects ecx.  A lane with m earlier duplicates
; therefore ends up with m extra increments.
;
; The original vpermd copied the nearest earlier duplicate's running count
; over each lane in the mask before the add.  All lanes for one bin gathered
; the same starting count, and a lane still in the mask has received exactly
; as many increments as its nearest earlier duplicate, so the copied value
; equals the one already there.  That is why disabling it changes nothing.
;

Resolve_conflicts:
        vpbroadcastd zmm5, ecx                  ; Unresolved mask, every lane.
        kmovw   k2, ecx
        ;vpermd zmm3{k2}, zmm1, zmm3
        vpaddd  zmm3{k2}, zmm3, zmm0
        vptestmd k0{k2}, zmm5, zmm2
        kmovw   esi, k0
        and     ecx, esi
        jz      No_conflicts
        add     bl, 1h                          ; At most 16 passes.
        cmp     bl, 10h
        jb      Resolve_conflicts

No_conflicts:
        kmovw   k2, k1                          ; Scatter clears its mask.
        vpscatterdd [rax+zmm4*4]{k2}, zmm3
        add     edx, 10h
        cmp     edx, 20h                        ; 32 inputs in Input1544v2.
        jb      Top

;
; Indicate success.
;

        mov     eax, 1

;
; Restore non-volatile registers.
;

Th199:

;
; zmm0-zmm5 were written above.  Clear the upper bits of zmm0-zmm15 before
; the legacy-SSE movdqa restores so they do not run with dirty upper state.
; Only bits 128 and up are cleared; the low 128 bits of xmm6-xmm15 that the
; caller relies on are untouched.
;

        vzeroupper

        mov     rbp, Locals.SavedRbp[rsp]
        mov     rbx, Locals.SavedRbx[rsp]
        mov     rdi, Locals.SavedRdi[rsp]
        mov     rsi, Locals.SavedRsi[rsp]
        mov     r12, Locals.SavedR12[rsp]
        mov     r13, Locals.SavedR13[rsp]
        mov     r14, Locals.SavedR14[rsp]
        mov     r15, Locals.SavedR15[rsp]

        movdqa  xmm6, Locals.SavedXmm6[rsp]
        movdqa  xmm7, Locals.SavedXmm7[rsp]
        movdqa  xmm8, Locals.SavedXmm8[rsp]
        movdqa  xmm9, Locals.SavedXmm9[rsp]
        movdqa  xmm10, Locals.SavedXmm10[rsp]
        movdqa  xmm11, Locals.SavedXmm11[rsp]
        movdqa  xmm12, Locals.SavedXmm12[rsp]
        movdqa  xmm13, Locals.SavedXmm13[rsp]
        movdqa  xmm14, Locals.SavedXmm14[rsp]
        movdqa  xmm15, Locals.SavedXmm15[rsp]

;
; Begin epilogue.  Deallocate stack space and return.
;

        add     rsp, LOCALS_SIZE
        ret

        NESTED_END Histo1710, _TEXT$00
I'm confused as to the purpose of the vpermd instructions in both examples. Even in the latter example, which actually works, I can remove vpermd and it has no effect on the histogram calculation. I believe the mask update logic takes care of the "conflict permutation" referred to in section 15.
Can someone review both sections and provide some insight?