x86 amd64 SIMD instruction list: SSE to AVX512
MMX register (64-bit) instructions are omitted.

Instruction-set abbreviations: S1 = SSE, S2 = SSE2, S3 = SSE3, SS3 = SSSE3, S4.1 = SSE4.1, S4.2 = SSE4.2, V1 = AVX, V2 = AVX2, V5 = AVX512. A trailing # marks instructions available in 64-bit mode only. Instructions marked with * become scalar instructions (only the lowest element is calculated) when PS/PD/DQ is changed to SS/SD/SI.
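To illustrate the packed-versus-scalar note above, here is a minimal sketch using the standard SSE add intrinsics as a familiar example (ADDPS/ADDSS are not part of the rows excerpted below; they are used only to show the PS-to-SS naming pattern):

    #include <xmmintrin.h>

    __m128 packed_add(__m128 a, __m128 b) {
        return _mm_add_ps(a, b);   /* ADDPS: all four single-precision lanes are added */
    }

    __m128 scalar_add(__m128 a, __m128 b) {
        return _mm_add_ss(a, b);   /* ADDSS: only the lowest lane is added; upper lanes are copied from a */
    }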
Table columns in the original layout: Integer (QWORD / DWORD / WORD / BYTE) and Floating-Point (Double / Single / Half), plus a "YMM lane (128-bit)" column. The rows are grouped by operation below.

?MM whole ← ?MM/mem (load/store):
MOVDQA (S2): _mm_load_si128, _mm_store_si128; MOVDQU (S2): _mm_loadu_si128, _mm_storeu_si128; MOVAPD (S2): _mm_load_pd, _mm_loadr_pd, _mm_store_pd, _mm_storer_pd; MOVUPD (S2): _mm_loadu_pd, _mm_storeu_pd; MOVAPS (S1): _mm_load_ps, _mm_loadr_ps, _mm_store_ps, _mm_storer_ps; MOVUPS (S1): _mm_loadu_ps, _mm_storeu_ps; VMOVDQA64 (V5...): _mm_mask_load_epi64, _mm_mask_store_epi64, etc.; VMOVDQU64 (V5...): _mm_mask_loadu_epi64, _mm_mask_storeu_epi64, etc.; VMOVDQA32 (V5...): _mm_mask_load_epi32, _mm_mask_store_epi32, etc.; VMOVDQU32 (V5...): _mm_mask_loadu_epi32, _mm_mask_storeu_epi32, etc.; VMOVDQU16 (V5+BW...): _mm_mask_loadu_epi16, _mm_mask_storeu_epi16, etc.; VMOVDQU8 (V5+BW...): _mm_mask_loadu_epi8, _mm_mask_storeu_epi8, etc.

XMM upper half ← mem:
MOVHPD (S2): _mm_loadh_pd, _mm_storeh_pd; MOVHPS (S1): _mm_loadh_pi, _mm_storeh_pi

XMM upper half ↔ XMM lower half:
MOVHLPS (S1): _mm_movehl_ps; MOVLHPS (S1): _mm_movelh_ps

XMM lower half ← mem:
MOVQ (S2): _mm_loadl_epi64, _mm_storel_epi64; MOVLPD (S2): _mm_loadl_pd, _mm_storel_pd; MOVLPS (S1): _mm_loadl_pi, _mm_storel_pi

XMM lowest element ← r/m:
MOVQ (S2#): _mm_cvtsi64_si128, _mm_cvtsi128_si64; MOVD (S2): _mm_cvtsi32_si128, _mm_cvtsi128_si32; VMOVW (V5+FP16): _mm_cvtsi16_si128, _mm_cvtsi128_si16

XMM lowest element ← XMM/mem:
MOVQ (S2): _mm_move_epi64; MOVSD (S2): _mm_load_sd, _mm_store_sd, _mm_move_sd; MOVSS (S1): _mm_load_ss, _mm_store_ss, _mm_move_ss; VMOVSH (V5+FP16): _mm_load_sh, _mm_store_sh, _mm_move_sh

XMM whole ← 1 element (broadcast):
_mm_set1_epi64x; VPBROADCASTQ (V2): _mm_broadcastq_epi64; _mm_set1_epi32; VPBROADCASTD (V2): _mm_broadcastd_epi32; _mm_set1_epi16; VPBROADCASTW (V2): _mm_broadcastw_epi16; _mm_set1_epi8; VPBROADCASTB (V2): _mm_broadcastb_epi8; _mm_set1_pd, _mm_load1_pd; MOVDDUP (S3): _mm_movedup_pd, _mm_loaddup_pd; _mm_set1_ps, _mm_load1_ps; VBROADCASTSS (from mem: V1, from XMM: V2): _mm_broadcast_ss

YMM/ZMM whole ← 1 element (broadcast):
VPBROADCASTQ (V2): _mm256_broadcastq_epi64; VPBROADCASTD (V2): _mm256_broadcastd_epi32; VPBROADCASTW (V2): _mm256_broadcastw_epi16; VPBROADCASTB (V2): _mm256_broadcastb_epi8; VBROADCASTSD (from mem: V1, from XMM: V2): _mm256_broadcast_sd; VBROADCASTSS (from mem: V1, from XMM: V2): _mm256_broadcast_ss; VBROADCASTF128 (V1): _mm256_broadcast_ps, _mm256_broadcast_pd; VBROADCASTI128 (V2): _mm256_broadcastsi128_si256

YMM/ZMM whole ← 2/4/8 elements:
VBROADCASTI64X2 (V5+DQ...): _mm512_broadcast_i64x2; VBROADCASTI64X4 (V5): _mm512_broadcast_i64x4; VBROADCASTI32X2 (V5+DQ...): _mm512_broadcast_i32x2; VBROADCASTI32X4 (V5...): _mm512_broadcast_i32x4; VBROADCASTI32X8 (V5+DQ): _mm512_broadcast_i32x8; VBROADCASTF64X2 (V5+DQ...): _mm512_broadcast_f64x2; VBROADCASTF64X4 (V5): _mm512_broadcast_f64x4; VBROADCASTF32X2 (V5+DQ...): _mm512_broadcast_f32x2; VBROADCASTF32X4 (V5...): _mm512_broadcast_f32x4; VBROADCASTF32X8 (V5+DQ): _mm512_broadcast_f32x8

?MM ← multiple elements:
_mm_set_epi64x, _mm_setr_epi64x; _mm_set_epi32, _mm_setr_epi32; _mm_set_epi16, _mm_setr_epi16; _mm_set_epi8, _mm_setr_epi8; _mm_set_pd, _mm_setr_pd; _mm_set_ps, _mm_setr_ps

?MM whole ← zero:
_mm_setzero_si128; _mm_setzero_pd; _mm_setzero_ps
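A minimal sketch of how the load/store and broadcast rows above are typically used together (SSE support assumed; the function and array names are illustrative only, and _mm_mul_ps is shown just to have something to do with the data):

    #include <immintrin.h>

    void scale_floats(float *dst, const float *src, float factor)
    {
        /* MOVUPS: unaligned 128-bit load; MOVAPS/_mm_load_ps would require 16-byte alignment */
        __m128 v = _mm_loadu_ps(src);
        /* _mm_set1_ps broadcasts one scalar to all four lanes (see the broadcast rows) */
        __m128 f = _mm_set1_ps(factor);
        __m128 r = _mm_mul_ps(v, f);
        _mm_storeu_ps(dst, r);          /* MOVUPS store back to memory */
    }

With AVX2, VPBROADCASTD/_mm_broadcastd_epi32 and friends perform the same kind of broadcast directly from an XMM register rather than through _mm_set1_*.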
extract:
PEXTRQ (S4.1#): _mm_extract_epi64; PEXTRD (S4.1): _mm_extract_epi32; PEXTRW (to r: S2, to r/m: S4.1): _mm_extract_epi16; PEXTRB (S4.1): _mm_extract_epi8; → MOVHPD (S2): _mm_loadh_pd, _mm_storeh_pd; → MOVLPD (S2): _mm_loadl_pd, _mm_storel_pd; EXTRACTPS (S4.1): _mm_extract_ps; VEXTRACTF128 (V1): _mm256_extractf128_ps, _mm256_extractf128_pd, _mm256_extractf128_si256; VEXTRACTI128 (V2): _mm256_extracti128_si256; VEXTRACTI64X2 (V5+DQ...): _mm512_extracti64x2_epi64; VEXTRACTI64X4 (V5): _mm512_extracti64x4_epi64; VEXTRACTI32X4 (V5...): _mm512_extracti32x4_epi32; VEXTRACTI32X8 (V5+DQ): _mm512_extracti32x8_epi32; VEXTRACTF64X2 (V5+DQ...): _mm512_extractf64x2_pd; VEXTRACTF64X4 (V5): _mm512_extractf64x4_pd; VEXTRACTF32X4 (V5...): _mm512_extractf32x4_ps; VEXTRACTF32X8 (V5+DQ): _mm512_extractf32x8_ps

insert:
PINSRQ (S4.1#): _mm_insert_epi64; PINSRD (S4.1): _mm_insert_epi32; PINSRW (S2): _mm_insert_epi16; PINSRB (S4.1): _mm_insert_epi8; → MOVHPD (S2): _mm_loadh_pd, _mm_storeh_pd; → MOVLPD (S2): _mm_loadl_pd, _mm_storel_pd; INSERTPS (S4.1): _mm_insert_ps; VINSERTF128 (V1): _mm256_insertf128_ps, _mm256_insertf128_pd, _mm256_insertf128_si256; VINSERTI128 (V2): _mm256_inserti128_si256; VINSERTI64X2 (V5+DQ...): _mm512_inserti64x2; VINSERTI64X4 (V5...): _mm512_inserti64x4; VINSERTI32X4 (V5...): _mm512_inserti32x4; VINSERTI32X8 (V5+DQ): _mm512_inserti32x8; VINSERTF64X2 (V5+DQ...): _mm512_insertf64x2; VINSERTF64X4 (V5): _mm512_insertf64x4; VINSERTF32X4 (V5...): _mm512_insertf32x4; VINSERTF32X8 (V5+DQ): _mm512_insertf32x8

unpack:
PUNPCKHQDQ (S2): _mm_unpackhi_epi64; PUNPCKLQDQ (S2): _mm_unpacklo_epi64; PUNPCKHDQ (S2): _mm_unpackhi_epi32; PUNPCKLDQ (S2): _mm_unpacklo_epi32; PUNPCKHWD (S2): _mm_unpackhi_epi16; PUNPCKLWD (S2): _mm_unpacklo_epi16; PUNPCKHBW (S2): _mm_unpackhi_epi8; PUNPCKLBW (S2): _mm_unpacklo_epi8; UNPCKHPD (S2): _mm_unpackhi_pd; UNPCKLPD (S2): _mm_unpacklo_pd; UNPCKHPS (S1): _mm_unpackhi_ps; UNPCKLPS (S1): _mm_unpacklo_ps

shuffle/permute:
VPERMQ (V2): _mm256_permute4x64_epi64; VPERMI2Q (V5...): _mm_permutex2var_epi64; VPERMT2Q (V5...): _mm_permutex2var_epi64; PSHUFD (S2): _mm_shuffle_epi32; VPERMD (V2): _mm256_permutevar8x32_epi32, _mm256_permutexvar_epi32; VPERMI2D (V5...): _mm_permutex2var_epi32; VPERMT2D (V5...): _mm_permutex2var_epi32; PSHUFHW (S2): _mm_shufflehi_epi16; PSHUFLW (S2): _mm_shufflelo_epi16; VPERMW (V5+BW...): _mm_permutexvar_epi16; VPERMI2W (V5+BW...): _mm_permutex2var_epi16; VPERMT2W (V5+BW...): _mm_permutex2var_epi16; PSHUFB (SS3): _mm_shuffle_epi8; VPERMB (V5+VBMI...): _mm_permutexvar_epi8; VPERMI2B (V5+VBMI...): _mm_permutex2var_epi8; VPERMT2B (V5+VBMI...): _mm_permutex2var_epi8; SHUFPD (S2): _mm_shuffle_pd; VPERMILPD (V1): _mm_permute_pd, _mm_permutevar_pd; VPERMPD (V2): _mm256_permute4x64_pd; VPERMI2PD (V5...): _mm_permutex2var_pd; VPERMT2PD (V5...): _mm_permutex2var_pd; SHUFPS (S1): _mm_shuffle_ps; VPERMILPS (V1): _mm_permute_ps, _mm_permutevar_ps; VPERMPS (V2): _mm256_permutevar8x32_ps; VPERMI2PS (V5...): _mm_permutex2var_ps; VPERMT2PS (V5...): _mm_permutex2var_ps; VPERM2F128 (V1): _mm256_permute2f128_ps, _mm256_permute2f128_pd, _mm256_permute2f128_si256; VPERM2I128 (V2): _mm256_permute2x128_si256; VSHUFI64X2 (V5...): _mm512_shuffle_i64x2; VSHUFI32X4 (V5...): _mm512_shuffle_i32x4; VSHUFF64X2 (V5...): _mm512_shuffle_f64x2; VSHUFF32X4 (V5...): _mm512_shuffle_f32x4

blend:
VPBLENDMQ (V5...): _mm_mask_blend_epi64; VPBLENDD (V2): _mm_blend_epi32; VPBLENDMD (V5...): _mm_mask_blend_epi32; PBLENDW (S4.1): _mm_blend_epi16; VPBLENDMW (V5+BW...): _mm_mask_blend_epi16; PBLENDVB (S4.1): _mm_blendv_epi8; VPBLENDMB (V5+BW...): _mm_mask_blend_epi8; BLENDPD (S4.1): _mm_blend_pd; BLENDVPD (S4.1): _mm_blendv_pd; VBLENDMPD (V5...): _mm_mask_blend_pd; BLENDPS (S4.1): _mm_blend_ps; BLENDVPS (S4.1): _mm_blendv_ps; VBLENDMPS (V5...): _mm_mask_blend_ps

move and duplicate:
MOVDDUP (S3): _mm_movedup_pd, _mm_loaddup_pd; MOVSHDUP (S3): _mm_movehdup_ps; MOVSLDUP (S3): _mm_moveldup_ps

mask move:
VPMASKMOVQ (V2): _mm_maskload_epi64, _mm_maskstore_epi64; VPMASKMOVD (V2): _mm_maskload_epi32, _mm_maskstore_epi32; VMASKMOVPD (V1): _mm_maskload_pd, _mm_maskstore_pd; VMASKMOVPS (V1): _mm_maskload_ps, _mm_maskstore_ps

extract highest bit:
PMOVMSKB (S2): _mm_movemask_epi8; MOVMSKPD (S2): _mm_movemask_pd; MOVMSKPS (S1): _mm_movemask_ps; VPMOVQ2M (V5+DQ...): _mm_movepi64_mask; VPMOVD2M (V5+DQ...): _mm_movepi32_mask; VPMOVW2M (V5+BW...): _mm_movepi16_mask; VPMOVB2M (V5+BW...): _mm_movepi8_mask
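A minimal sketch of the immediate-controlled shuffles listed above (SSE2 assumed; the reversal pattern is only an example):

    #include <emmintrin.h>

    /* PSHUFD: reorder the four 32-bit elements of a vector with an immediate control. */
    __m128i reverse_dwords(__m128i v)
    {
        /* _MM_SHUFFLE(3,2,1,0) is the identity; (0,1,2,3) reverses the element order. */
        return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
    }

Byte-granularity reordering with a runtime-variable pattern is what PSHUFB/_mm_shuffle_epi8 (SSSE3) provides, and cross-lane 256-bit reordering needs the AVX2 VPERM* forms above.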
gather:
VPGATHERDQ (V2): _mm_i32gather_epi64, _mm_mask_i32gather_epi64; VPGATHERQQ (V2): _mm_i64gather_epi64, _mm_mask_i64gather_epi64; VPGATHERDD (V2): _mm_i32gather_epi32, _mm_mask_i32gather_epi32; VPGATHERQD (V2): _mm_i64gather_epi32, _mm_mask_i64gather_epi32; VGATHERDPD (V2): _mm_i32gather_pd, _mm_mask_i32gather_pd; VGATHERQPD (V2): _mm_i64gather_pd, _mm_mask_i64gather_pd; VGATHERDPS (V2): _mm_i32gather_ps, _mm_mask_i32gather_ps; VGATHERQPS (V2): _mm_i64gather_ps, _mm_mask_i64gather_ps

scatter:
VPSCATTERDQ (V5...): _mm_i32scatter_epi64, _mm_mask_i32scatter_epi64; VPSCATTERQQ (V5...): _mm_i64scatter_epi64, _mm_mask_i64scatter_epi64; VPSCATTERDD (V5...): _mm_i32scatter_epi32, _mm_mask_i32scatter_epi32; VPSCATTERQD (V5...): _mm_i64scatter_epi32, _mm_mask_i64scatter_epi32; VSCATTERDPD (V5...): _mm_i32scatter_pd, _mm_mask_i32scatter_pd; VSCATTERQPD (V5...): _mm_i64scatter_pd, _mm_mask_i64scatter_pd; VSCATTERDPS (V5...): _mm_i32scatter_ps, _mm_mask_i32scatter_ps; VSCATTERQPS (V5...): _mm_i64scatter_ps, _mm_mask_i64scatter_ps

compress:
VPCOMPRESSQ (V5...): _mm_mask_compress_epi64, _mm_mask_compressstoreu_epi64; VPCOMPRESSD (V5...): _mm_mask_compress_epi32, _mm_mask_compressstoreu_epi32; VPCOMPRESSW (V5+VBMI2...): _mm_mask_compress_epi16, _mm_mask_compressstoreu_epi16; VPCOMPRESSB (V5+VBMI2...): _mm_mask_compress_epi8, _mm_mask_compressstoreu_epi8; VCOMPRESSPD (V5...): _mm_mask_compress_pd, _mm_mask_compressstoreu_pd; VCOMPRESSPS (V5...): _mm_mask_compress_ps, _mm_mask_compressstoreu_ps

expand:
VPEXPANDQ (V5...): _mm_mask_expand_epi64, _mm_mask_expandloadu_epi64; VPEXPANDD (V5...): _mm_mask_expand_epi32, _mm_mask_expandloadu_epi32; VPEXPANDW (V5+VBMI2...): _mm_mask_expand_epi16, _mm_mask_expandloadu_epi16; VPEXPANDB (V5+VBMI2...): _mm_mask_expand_epi8, _mm_mask_expandloadu_epi8; VEXPANDPD (V5...): _mm_mask_expand_pd, _mm_mask_expandloadu_pd; VEXPANDPS (V5...): _mm_mask_expand_ps, _mm_mask_expandloadu_ps

align right:
VALIGNQ (V5...): _mm_alignr_epi64; VALIGND (V5...): _mm_alignr_epi32; PALIGNR (SS3): _mm_alignr_epi8

expand opmask bits:
VPMOVM2Q (V5+DQ...): _mm_movm_epi64; VPMOVM2D (V5+DQ...): _mm_movm_epi32; VPMOVM2W (V5+BW...): _mm_movm_epi16; VPMOVM2B (V5+BW...): _mm_movm_epi8

compare scalar values to set the flag register (Double / Single / Half):
COMISD (S2): _mm_comieq_sd, _mm_comilt_sd, _mm_comile_sd, _mm_comigt_sd, _mm_comige_sd, _mm_comineq_sd; UCOMISD (S2): _mm_ucomieq_sd, _mm_ucomilt_sd, _mm_ucomile_sd, _mm_ucomigt_sd, _mm_ucomige_sd, _mm_ucomineq_sd; COMISS (S1): _mm_comieq_ss, _mm_comilt_ss, _mm_comile_ss, _mm_comigt_ss, _mm_comige_ss, _mm_comineq_ss; UCOMISS (S1): _mm_ucomieq_ss, _mm_ucomilt_ss, _mm_ucomile_ss, _mm_ucomigt_ss, _mm_ucomige_ss, _mm_ucomineq_ss; VCOMISH (V5+FP16): _mm_comieq_sh, _mm_comilt_sh, _mm_comile_sh, _mm_comigt_sh, _mm_comige_sh, _mm_comineq_sh; VUCOMISH (V5+FP16): _mm_ucomieq_sh, _mm_ucomilt_sh, _mm_ucomile_sh, _mm_ucomigt_sh, _mm_ucomige_sh, _mm_ucomineq_sh

and:
PAND (S2): _mm_and_si128; ANDPD (S2): _mm_and_pd; ANDPS (S1): _mm_and_ps; VPANDQ (V5...): _mm512_and_epi64, etc.; VPANDD (V5...): _mm512_and_epi32, etc.

and not:
PANDN (S2): _mm_andnot_si128; ANDNPD (S2): _mm_andnot_pd; ANDNPS (S1): _mm_andnot_ps; VPANDNQ (V5...): _mm512_andnot_epi64, etc.; VPANDND (V5...): _mm512_andnot_epi32, etc.

or:
POR (S2): _mm_or_si128; ORPD (S2): _mm_or_pd; ORPS (S1): _mm_or_ps; VPORQ (V5...): _mm512_or_epi64, etc.; VPORD (V5...): _mm512_or_epi32, etc.

xor:
PXOR (S2): _mm_xor_si128; XORPD (S2): _mm_xor_pd; XORPS (S1): _mm_xor_ps; VPXORQ (V5...): _mm512_xor_epi64, etc.; VPXORD (V5...): _mm512_xor_epi32, etc.

test:
PTEST (S4.1): _mm_testz_si128, _mm_testc_si128, _mm_testnzc_si128; VTESTPD (V1): _mm_testz_pd, _mm_testc_pd, _mm_testnzc_pd; VTESTPS (V1): _mm_testz_ps, _mm_testc_ps, _mm_testnzc_ps; VPTESTMQ (V5...): _mm_test_epi64_mask; VPTESTNMQ (V5...): _mm_testn_epi64_mask; VPTESTMD (V5...): _mm_test_epi32_mask; VPTESTNMD (V5...): _mm_testn_epi32_mask; VPTESTMW (V5+BW...): _mm_test_epi16_mask; VPTESTNMW (V5+BW...): _mm_testn_epi16_mask; VPTESTMB (V5+BW...): _mm_test_epi8_mask; VPTESTNMB (V5+BW...): _mm_testn_epi8_mask
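A sketch of the AVX2 gather entries above, loading four table elements addressed by a vector of indices (the table and index names are illustrative, AVX2 support assumed):

    #include <immintrin.h>

    /* VPGATHERDD: fetch table[idx[0..3]] into one vector with a single intrinsic. */
    __m128i gather4(const int *table, __m128i idx)
    {
        /* scale = 4 because the 32-bit indices address 4-byte ints */
        return _mm_i32gather_epi32(table, idx, 4);
    }

The _mm_mask_* forms take an additional source vector and mask so that only selected lanes are gathered; the AVX-512 scatter entries are the mirror-image stores.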
ternary operation:
VPTERNLOGQ (V5...): _mm_ternarylogic_epi64; VPTERNLOGD (V5...): _mm_ternarylogic_epi32

shift left logical:
PSLLQ (S2): _mm_slli_epi64, _mm_sll_epi64; PSLLD (S2): _mm_slli_epi32, _mm_sll_epi32; PSLLW (S2): _mm_slli_epi16, _mm_sll_epi16; VPSLLVQ (V2): _mm_sllv_epi64; VPSLLVD (V2): _mm_sllv_epi32; VPSLLVW (V5+BW...): _mm_sllv_epi16

shift right logical:
PSRLQ (S2): _mm_srli_epi64, _mm_srl_epi64; PSRLD (S2): _mm_srli_epi32, _mm_srl_epi32; PSRLW (S2): _mm_srli_epi16, _mm_srl_epi16; VPSRLVQ (V2): _mm_srlv_epi64; VPSRLVD (V2): _mm_srlv_epi32; VPSRLVW (V5+BW...): _mm_srlv_epi16

shift right arithmetic:
VPSRAQ (V5...): _mm_srai_epi64, _mm_sra_epi64; PSRAD (S2): _mm_srai_epi32, _mm_sra_epi32; PSRAW (S2): _mm_srai_epi16, _mm_sra_epi16; VPSRAVQ (V5...): _mm_srav_epi64; VPSRAVD (V2): _mm_srav_epi32; VPSRAVW (V5+BW...): _mm_srav_epi16

rotate left:
VPROLQ (V5...): _mm_rol_epi64; VPROLD (V5...): _mm_rol_epi32; VPROLVQ (V5...): _mm_rolv_epi64; VPROLVD (V5...): _mm_rolv_epi32

rotate right:
VPRORQ (V5...): _mm_ror_epi64; VPRORD (V5...): _mm_ror_epi32; VPRORVQ (V5...): _mm_rorv_epi64; VPRORVD (V5...): _mm_rorv_epi32

shift left logical double:
VPSHLDQ (V5+VBMI2...): _mm_shldi_epi64; VPSHLDD (V5+VBMI2...): _mm_shldi_epi32; VPSHLDW (V5+VBMI2...): _mm_shldi_epi16; VPSHLDVQ (V5+VBMI2...): _mm_shldv_epi64; VPSHLDVD (V5+VBMI2...): _mm_shldv_epi32; VPSHLDVW (V5+VBMI2...): _mm_shldv_epi16

shift right logical double:
VPSHRDQ (V5+VBMI2...): _mm_shrdi_epi64; VPSHRDD (V5+VBMI2...): _mm_shrdi_epi32; VPSHRDW (V5+VBMI2...): _mm_shrdi_epi16; VPSHRDVQ (V5+VBMI2...): _mm_shrdv_epi64; VPSHRDVD (V5+VBMI2...): _mm_shrdv_epi32; VPSHRDVW (V5+VBMI2...): _mm_shrdv_epi16

128-bit (whole register, byte granularity) shift left logical:
PSLLDQ (S2): _mm_slli_si128

128-bit shift right logical:
PSRLDQ (S2): _mm_srli_si128

packed align right:
PALIGNR (SS3): _mm_alignr_epi8

string compare, return index (explicit length / implicit length):
PCMPESTRI (S4.2): _mm_cmpestri, _mm_cmpestra, _mm_cmpestrc, _mm_cmpestro, _mm_cmpestrs, _mm_cmpestrz; PCMPISTRI (S4.2): _mm_cmpistri, _mm_cmpistra, _mm_cmpistrc, _mm_cmpistro, _mm_cmpistrs, _mm_cmpistrz

string compare, return mask (explicit length / implicit length):
PCMPESTRM (S4.2): _mm_cmpestrm, _mm_cmpestra, _mm_cmpestrc, _mm_cmpestro, _mm_cmpestrs, _mm_cmpestrz; PCMPISTRM (S4.2): _mm_cmpistrm, _mm_cmpistra, _mm_cmpistrc, _mm_cmpistro, _mm_cmpistrs, _mm_cmpistrz

MXCSR control/status register:
LDMXCSR (S1): _mm_setcsr (load the MXCSR register); STMXCSR (S1): _mm_getcsr (save the MXCSR register state)

sum of absolute differences:
PSADBW (S2): _mm_sad_epu8 (compute sums of absolute differences); MPSADBW (S4.1): _mm_mpsadbw_epu8 (performs eight 4-byte-wide sum-of-absolute-differences operations to produce eight word integers)
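A sketch of PSADBW summing the absolute byte differences of two 16-byte blocks (buffer names are illustrative, SSE2 assumed):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Returns sum(|a[i] - b[i]|) for i = 0..15. */
    uint32_t sad16(const uint8_t *a, const uint8_t *b)
    {
        __m128i va  = _mm_loadu_si128((const __m128i *)a);
        __m128i vb  = _mm_loadu_si128((const __m128i *)b);
        /* PSADBW yields two partial sums, one per 8-byte half, in the low 16 bits of each qword */
        __m128i sad = _mm_sad_epu8(va, vb);
        return (uint32_t)(_mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4));
    }

This pattern is the core of block-matching metrics in video encoders; MPSADBW extends it to eight overlapping 4-byte windows for finer-grained matching.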
conflict detection:
VPCONFLICTQ (V5+CD...): _mm512_conflict_epi64; VPCONFLICTD (V5+CD...): _mm512_conflict_epi32 (detect conflicts within a vector of packed dword/qword values)

intersection:
VP2INTERSECTQ (V5+VP2INTERSECT...): _mm512_2intersect_epi64; VP2INTERSECTD (V5+VP2INTERSECT...): _mm512_2intersect_epi32 (compute the intersection between dwords/quadwords into a pair of mask registers)

leading zero count:
VPLZCNTQ (V5+CD...): _mm_lzcnt_epi64; VPLZCNTD (V5+CD...): _mm_lzcnt_epi32 (count the number of leading zero bits for packed dword/qword values)

fix up special values:
VFIXUPIMMPD* (V5...): _mm512_fixupimm_pd; VFIXUPIMMPS* (V5...): _mm512_fixupimm_ps (fix up special packed float64/float32 values)

classify:
VFPCLASSPD* (V5...): _mm512_fpclass_pd_mask; VFPCLASSPS* (V5...): _mm512_fpclass_ps_mask; VFPCLASSPH* (V5+FP16...): _mm512_fpclass_ph_mask (test the types of packed float64/float32/float16 values)

range restriction:
VRANGEPD* (V5+DQ...): _mm_range_pd; VRANGEPS* (V5+DQ...): _mm_range_ps (range-restriction calculation for packed pairs of float64/float32 values)

get exponent:
VGETEXPPD* (V5...): _mm512_getexp_pd; VGETEXPPS* (V5...): _mm512_getexp_ps; VGETEXPPH* (V5+FP16...): _mm512_getexp_ph (convert exponents of packed FP values to FP values)

get mantissa:
VGETMANTPD* (V5...): _mm512_getmant_pd; VGETMANTPS* (V5...): _mm512_getmant_ps; VGETMANTPH* (V5+FP16...): _mm512_getmant_ph (extract a vector of normalized mantissas from an FP vector)

AES:
AESDEC (AESNI): _mm_aesdec_si128 (perform one AES decryption round using a 128-bit state and a round key); AESDECLAST (AESNI): _mm_aesdeclast_si128 (perform the last AES decryption round); AESENC (AESNI): _mm_aesenc_si128 (perform one AES encryption round); AESENCLAST (AESNI): _mm_aesenclast_si128 (perform the last AES encryption round); AESIMC (AESNI): _mm_aesimc_si128 (perform the inverse MixColumns transformation); AESKEYGENASSIST (AESNI): _mm_aeskeygenassist_si128 (assist the creation of round keys in the key expansion schedule)

carry-less multiply:
PCLMULQDQ (PCLMULQDQ): _mm_clmulepi64_si128 (perform carry-less multiplication of two 64-bit values)

SHA:
SHA1RNDS4 (SHA): _mm_sha1rnds4_epu32 (perform four rounds of the SHA-1 operation); SHA1NEXTE (SHA): _mm_sha1nexte_epu32 (calculate SHA-1 state variable E after four rounds); SHA1MSG1 (SHA): _mm_sha1msg1_epu32 (intermediate calculation for the next four SHA-1 message dwords); SHA1MSG2 (SHA): _mm_sha1msg2_epu32 (final calculation for the next four SHA-1 message dwords); SHA256RNDS2 (SHA): _mm_sha256rnds2_epu32 (perform two rounds of the SHA-256 operation); SHA256MSG1 (SHA): _mm_sha256msg1_epu32 (intermediate calculation for the next four SHA-256 message dwords); SHA256MSG2 (SHA): _mm_sha256msg2_epu32 (final calculation for the next four SHA-256 message dwords)

Galois field (GFNI):
GF2P8AFFINEQB (GFNI...): _mm_gf2p8affine_epi64_epi8 (Galois field affine transformation); GF2P8AFFINEINVQB (GFNI...): _mm_gf2p8affineinv_epi64_epi8 (Galois field affine transformation inverse); GF2P8MULB (GFNI...): _mm_gf2p8mul_epi8 (Galois field multiply bytes)

FP16 complex arithmetic:
VFMULCPH* (V5+FP16...): _mm_fmul_pch, _mm_mul_pch; VFCMULCPH* (V5+FP16...): _mm_fcmul_pch, _mm_cmul_pch (complex multiply of FP16 values); VFMADDCPH* (V5+FP16...): _mm_fmadd_pch; VFCMADDCPH* (V5+FP16...): _mm_fcmadd_pch (complex multiply and accumulate of FP16 values)

BF16:
VCVTNE2PS2BF16 (V5+BF16...): _mm_cvtne2ps_pbh (convert two packed single-precision vectors to one packed BF16 vector); VCVTNEPS2BF16 (V5+BF16...): _mm_cvtneps_pbh (convert packed single-precision data to packed BF16 data); VDPBF16PS (V5+BF16...): _mm_dpbf16_ps (dot product of BF16 pairs accumulated into packed single precision)

52-bit integer fused multiply-add (IFMA):
VPMADD52HUQ (V5+IFMA...): _mm_madd52hi_epu64 (packed multiply of unsigned 52-bit integers, adding the high 52-bit products to qword accumulators); VPMADD52LUQ (V5+IFMA...): _mm_madd52lo_epu64 (packed multiply of unsigned 52-bit integers, adding the low 52-bit products to qword accumulators)
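A sketch of the AES-NI round primitives above, encrypting one block with AES-128; it assumes the 11 round keys have already been expanded elsewhere, and the function and parameter names are illustrative:

    #include <wmmintrin.h>

    /* Encrypt one 16-byte block with AES-128, given the 11 expanded round keys. */
    __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
    {
        block = _mm_xor_si128(block, rk[0]);        /* initial AddRoundKey */
        for (int i = 1; i < 10; ++i)
            block = _mm_aesenc_si128(block, rk[i]); /* AESENC: one full round */
        return _mm_aesenclast_si128(block, rk[10]); /* AESENCLAST: final round, no MixColumns */
    }

AESKEYGENASSIST and AESIMC exist to build the encryption and decryption key schedules that feed these round instructions.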
byte select / bit shuffle:
VPMULTISHIFTQB (V5+VBMI...): _mm_multishift_epi64_epi8 (select packed unaligned bytes from quadword sources); VPSHUFBITQMB (V5+BITALG...): _mm_bitshuffle_epi64_mask (shuffle bits from quadword elements, using byte indexes, into a mask)

zero the YMM state:
VZEROALL (V1): _mm256_zeroall (zero all YMM registers); VZEROUPPER (V1): _mm256_zeroupper (zero the upper 128 bits of all YMM registers)

non-temporal loads and stores:
MOVNTPS (S1): _mm_stream_ps (non-temporal store of four packed single-precision floating-point values from an XMM register into memory); MASKMOVDQU (S2): _mm_maskmoveu_si128 (non-temporal store of selected bytes from an XMM register into memory); MOVNTPD (S2): _mm_stream_pd (non-temporal store of two packed double-precision floating-point values from an XMM register into memory); MOVNTDQ (S2): _mm_stream_si128 (non-temporal store of a double quadword from an XMM register into memory); LDDQU (S3): _mm_lddqu_si128 (special 128-bit unaligned load designed to avoid cache-line splits); MOVNTDQA (S4.1): _mm_stream_load_si128 (provides a non-temporal hint that can cause adjacent 16-byte items within an aligned 64-byte region, a streaming line, to be fetched and held in a small set of temporary "streaming load buffers")
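A sketch of a non-temporal (streaming) store loop, the typical use of MOVNTPS when filling a large buffer that will not be read again soon; the buffer and count names are illustrative, the destination is assumed 16-byte aligned and the element count a multiple of 4:

    #include <stddef.h>
    #include <emmintrin.h>

    void fill_stream(float *dst, float value, size_t count)
    {
        __m128 v = _mm_set1_ps(value);
        for (size_t i = 0; i < count; i += 4)
            _mm_stream_ps(dst + i, v);  /* MOVNTPS: write around the cache */
        _mm_sfence();                   /* make the streaming stores globally visible */
    }

SFENCE/_mm_sfence is the usual companion of streaming stores; ordinary MOVAPS/MOVUPS stores would pull the written lines into the cache even though the program never rereads them.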