diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index 8237f0eb5..dc2719d14 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -22,480 +22,61 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale - * Copyright (c) 2022 Tino Reichardt + * Copyright (c) 2022-2023 Tino Reichardt * * This is converted assembly: SSE2 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde + * + * Should work on FreeBSD, Linux and macOS + * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh */ #if defined(__aarch64__) .text - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 -.LCPI0_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI0_1: - .xword 0 - .xword -4294967296 -.LCPI0_2: - .xword -1 - .xword 4294967295 + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 +.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse2 .p2align 2 .type zfs_blake3_compress_in_place_sse2,@function zfs_blake3_compress_in_place_sse2: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI0_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI0_2] - and w9, w4, #0xff - adrp x12, .LCPI0_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI0_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI0_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI0_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b - add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h - add v2.4s, v2.4s, v5.4s - mov v5.s[1], v7.s[2] - add v4.4s, v4.4s, v3.4s - bsl v0.16b, v5.16b, v17.16b - eor v5.16b, v4.16b, v6.16b - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - ushr v6.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v6.16b - add v4.4s, v3.4s, v4.4s - uzp2 v18.4s, v26.4s, v18.4s - eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v18.4s - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v5.16b, v5.16b, v6.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v5.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v0.4s, v2.4s, v0.4s - rev32 v2.8h, v3.8h - add v3.4s, v4.4s, v2.4s - eor v4.16b, v3.16b, v5.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v4.16b, v4.16b, v5.16b - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ushr v5.4s, v2.4s, #8 - shl v2.4s, v2.4s, #24 - orr v2.16b, v2.16b, v5.16b - add v3.4s, v2.4s, v3.4s - eor v4.16b, v3.16b, v4.16b - ext v0.16b, v0.16b, v0.16b, #12 - ushr v5.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v0.4s, v0.4s, v21.4s - orr v4.16b, v4.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #8 - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ext v3.16b, v3.16b, v3.16b, #4 - add v0.4s, v0.4s, v1.4s - rev32 v1.8h, v2.8h - add v2.4s, v3.4s, v1.4s - eor v3.16b, v2.16b, v4.16b - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v3.4s - eor v1.16b, v0.16b, v1.16b - ushr v4.4s, v1.4s, #8 - shl v1.4s, v1.4s, #24 - orr v1.16b, v1.16b, v4.16b - add v2.4s, v1.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v0.16b, v0.16b, v0.16b, #4 - ext v2.16b, v2.16b, v2.16b, #12 - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v1.16b, v1.16b, v1.16b, #8 + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + str x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x19, x0 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x19 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - orr v2.16b, v3.16b, v4.16b - eor v1.16b, v2.16b, v1.16b - stp q0, q1, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr x19, [sp, #80] + add sp, sp, #96 + hint #29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 @@ -504,403 +85,39 @@ zfs_blake3_compress_in_place_sse2: .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI1_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI1_1: - .xword 0 - .xword -4294967296 -.LCPI1_2: - .xword -1 - .xword 4294967295 + .xword -4942790177982912921 + .xword -6534734903820487822 .text - .globl zfs_blake3_compress_xof_sse2 .p2align 2 - .type zfs_blake3_compress_xof_sse2,@function -zfs_blake3_compress_xof_sse2: + .type compress_pre,@function +compress_pre: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI1_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI1_2] - and w9, w4, #0xff - adrp x12, .LCPI1_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI1_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI1_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI1_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b - add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h + hint #34 + fmov s1, w3 + movi d0, #0x0000ff000000ff + ldr q2, [x1] + fmov d3, x4 + adrp x8, .LCPI1_0 + mov v1.s[1], w5 + str q2, [x0] + ldr q4, [x8, :lo12:.LCPI1_0] + add x8, x2, #32 + ldr q5, [x1, #16] + and v0.8b, v1.8b, v0.8b + stp q5, q4, [x0, #16] + mov v3.d[1], v0.d[0] + str q3, [x0, #48] + ldp q0, q6, [x2] + uzp1 v1.4s, v0.4s, v6.4s + uzp2 v0.4s, v0.4s, v6.4s + add v2.4s, v2.4s, v1.4s + uzp1 v18.4s, v1.4s, v1.4s add v2.4s, v2.4s, v5.4s - mov v5.s[1], v7.s[2] - add v4.4s, v4.4s, v3.4s - bsl v0.16b, v5.16b, v17.16b - eor v5.16b, v4.16b, v6.16b + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v0.4s + rev32 v3.8h, v3.8h + add v4.4s, v3.4s, v4.4s + eor v5.16b, v4.16b, v5.16b ushr v6.4s, v5.4s, #12 shl v5.4s, v5.4s, #20 orr v5.16b, v5.16b, v6.16b @@ -909,78 +126,477 @@ zfs_blake3_compress_xof_sse2: ushr v6.4s, v3.4s, #8 shl v3.4s, v3.4s, #24 orr v3.16b, v3.16b, v6.16b + ld2 { v6.4s, v7.4s }, [x8] add v4.4s, v3.4s, v4.4s - uzp2 v18.4s, v26.4s, v18.4s + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v18.4s - ushr v6.4s, v5.4s, #7 + ext v4.16b, v4.16b, v4.16b, #4 + ext v6.16b, v6.16b, v6.16b, #12 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v7.16b, v7.16b, #12 + add v2.4s, v2.4s, v5.4s + mov v7.16b, v16.16b + eor v3.16b, v3.16b, v2.16b + add v2.4s, v2.4s, v16.4s + mov v7.s[1], v6.s[2] + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v5.16b, v4.16b, v5.16b + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v17.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v17.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v17.16b + ext v17.16b, v18.16b, v1.16b, #8 + add v4.4s, v3.4s, v4.4s + uzp2 v17.4s, v17.4s, v0.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v17.4s + ext v4.16b, v4.16b, v4.16b, #12 + ushr v18.4s, v5.4s, #7 shl v5.4s, v5.4s, #25 ext v2.16b, v2.16b, v2.16b, #4 - orr v5.16b, v5.16b, v6.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v5.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v0.4s, v2.4s, v0.4s - rev32 v2.8h, v3.8h - add v3.4s, v4.4s, v2.4s - eor v4.16b, v3.16b, v5.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v4.16b, v4.16b, v5.16b - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ushr v5.4s, v2.4s, #8 + orr v5.16b, v5.16b, v18.16b + ext v18.16b, v1.16b, v1.16b, #12 + add v2.4s, v2.4s, v5.4s + ext v1.16b, v1.16b, v18.16b, #12 + zip1 v18.2d, v16.2d, v0.2d + zip2 v0.4s, v0.4s, v16.4s + eor v3.16b, v3.16b, v2.16b + rev64 v1.4s, v1.4s + mov v18.s[3], v6.s[3] + zip1 v16.4s, v0.4s, v6.4s + rev32 v3.8h, v3.8h + trn2 v1.4s, v1.4s, v7.4s + zip1 v0.4s, v6.4s, v0.4s + add v4.4s, v4.4s, v3.4s + add v2.4s, v2.4s, v1.4s + ext v6.16b, v0.16b, v16.16b, #8 + eor v5.16b, v4.16b, v5.16b + ushr v7.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v7.16b + add v7.4s, v2.4s, v5.4s + eor v2.16b, v7.16b, v3.16b + ext v7.16b, v7.16b, v7.16b, #12 + ushr v3.4s, v2.4s, #8 shl v2.4s, v2.4s, #24 - orr v2.16b, v2.16b, v5.16b - add v3.4s, v2.4s, v3.4s - eor v4.16b, v3.16b, v4.16b - ext v0.16b, v0.16b, v0.16b, #12 - ushr v5.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v0.4s, v0.4s, v21.4s - orr v4.16b, v4.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #8 - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ext v3.16b, v3.16b, v3.16b, #4 - add v0.4s, v0.4s, v1.4s - rev32 v1.8h, v2.8h - add v2.4s, v3.4s, v1.4s - eor v3.16b, v2.16b, v4.16b + orr v3.16b, v2.16b, v3.16b + ext v2.16b, v18.16b, v18.16b, #12 + add v4.4s, v3.4s, v4.4s + uzp1 v2.4s, v18.4s, v2.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v5.16b, v4.16b, v5.16b + add v7.4s, v7.4s, v2.4s + ext v4.16b, v4.16b, v4.16b, #4 + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + add v7.4s, v7.4s, v5.4s + eor v3.16b, v3.16b, v7.16b + add v7.4s, v7.4s, v6.4s + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v5.16b, v4.16b, v5.16b + ushr v0.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v0.16b, v5.16b, v0.16b + add v5.4s, v7.4s, v0.4s + ext v7.16b, v17.16b, v17.16b, #4 + eor v3.16b, v5.16b, v3.16b + uzp1 v17.4s, v7.4s, v7.4s + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v4.4s, v3.4s, v4.4s + uzp2 v16.4s, v16.4s, v1.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v0.16b, v4.16b, v0.16b + add v5.4s, v5.4s, v16.4s + ext v4.16b, v4.16b, v4.16b, #12 + ushr v17.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v0.16b, v0.16b, v17.16b + ext v17.16b, v7.16b, v7.16b, #12 + add v5.4s, v5.4s, v0.4s + ext v7.16b, v7.16b, v17.16b, #12 + mov v17.16b, v6.16b + eor v3.16b, v3.16b, v5.16b + rev64 v7.4s, v7.4s + mov v17.s[1], v2.s[2] + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v18.16b, v4.16b, v0.16b + trn2 v0.4s, v7.4s, v17.4s + ushr v7.4s, v18.4s, #12 + shl v17.4s, v18.4s, #20 + add v5.4s, v5.4s, v0.4s + zip1 v18.2d, v6.2d, v1.2d + zip2 v1.4s, v1.4s, v6.4s + orr v7.16b, v17.16b, v7.16b + mov v18.s[3], v2.s[3] + zip1 v6.4s, v1.4s, v2.4s + add v5.4s, v5.4s, v7.4s + zip1 v1.4s, v2.4s, v1.4s + eor v3.16b, v5.16b, v3.16b + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v1.16b, v6.16b, #8 + ushr v17.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v17.16b, v3.16b, v17.16b + ext v3.16b, v18.16b, v18.16b, #12 + add v4.4s, v17.4s, v4.4s + uzp1 v3.4s, v18.4s, v3.4s + ext v17.16b, v17.16b, v17.16b, #8 + eor v7.16b, v4.16b, v7.16b + add v5.4s, v5.4s, v3.4s + ext v4.16b, v4.16b, v4.16b, #4 + ushr v18.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + orr v7.16b, v7.16b, v18.16b + add v5.4s, v5.4s, v7.4s + eor v17.16b, v17.16b, v5.16b + add v5.4s, v5.4s, v6.4s + rev32 v17.8h, v17.8h + add v4.4s, v4.4s, v17.4s + eor v2.16b, v4.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + ushr v1.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v2.16b, v1.16b + add v2.4s, v5.4s, v1.4s + eor v5.16b, v2.16b, v17.16b + uzp1 v17.4s, v7.4s, v7.4s + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v4.4s, v5.4s, v4.4s + uzp2 v16.4s, v16.4s, v0.4s + ext v5.16b, v5.16b, v5.16b, #8 + eor v1.16b, v4.16b, v1.16b + add v2.4s, v2.4s, v16.4s + ext v4.16b, v4.16b, v4.16b, #12 + ushr v17.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v1.16b, v1.16b, v17.16b + ext v17.16b, v7.16b, v7.16b, #12 + add v2.4s, v2.4s, v1.4s + ext v7.16b, v7.16b, v17.16b, #12 + mov v17.16b, v6.16b + eor v5.16b, v5.16b, v2.16b + rev64 v7.4s, v7.4s + mov v17.s[1], v3.s[2] + rev32 v5.8h, v5.8h + add v4.4s, v4.4s, v5.4s + eor v18.16b, v4.16b, v1.16b + trn2 v1.4s, v7.4s, v17.4s + ushr v7.4s, v18.4s, #12 + shl v17.4s, v18.4s, #20 + add v2.4s, v2.4s, v1.4s + zip1 v18.2d, v6.2d, v0.2d + zip2 v0.4s, v0.4s, v6.4s + orr v7.16b, v17.16b, v7.16b + mov v18.s[3], v3.s[3] + add v2.4s, v2.4s, v7.4s + eor v5.16b, v2.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v17.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v17.16b + add v17.4s, v5.4s, v4.4s + ext v4.16b, v18.16b, v18.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + eor v7.16b, v17.16b, v7.16b + uzp1 v4.4s, v18.4s, v4.4s + ext v17.16b, v17.16b, v17.16b, #4 + ushr v18.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + add v2.4s, v2.4s, v4.4s + orr v7.16b, v7.16b, v18.16b + add v2.4s, v2.4s, v7.4s + eor v5.16b, v5.16b, v2.16b + rev32 v5.8h, v5.8h + add v6.4s, v17.4s, v5.4s + zip1 v17.4s, v0.4s, v3.4s + zip1 v0.4s, v3.4s, v0.4s + eor v3.16b, v6.16b, v7.16b + ext v0.16b, v0.16b, v17.16b, #8 + ushr v7.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v2.4s, v2.4s, v0.4s + orr v3.16b, v3.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + add v2.4s, v2.4s, v3.4s + uzp1 v17.4s, v7.4s, v7.4s + eor v5.16b, v2.16b, v5.16b + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v6.4s, v5.4s, v6.4s + uzp2 v16.4s, v16.4s, v1.4s + ext v5.16b, v5.16b, v5.16b, #8 + eor v3.16b, v6.16b, v3.16b + add v2.4s, v2.4s, v16.4s + ext v6.16b, v6.16b, v6.16b, #12 + ushr v17.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v3.16b, v3.16b, v17.16b + add v17.4s, v2.4s, v3.4s + eor v2.16b, v5.16b, v17.16b + ext v5.16b, v7.16b, v7.16b, #12 + rev32 v18.8h, v2.8h + ext v2.16b, v7.16b, v5.16b, #12 + mov v5.16b, v0.16b + add v6.4s, v6.4s, v18.4s + rev64 v2.4s, v2.4s + mov v5.s[1], v4.s[2] + eor v3.16b, v6.16b, v3.16b + trn2 v2.4s, v2.4s, v5.4s + ushr v5.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v7.4s, v17.4s, v2.4s + orr v3.16b, v3.16b, v5.16b + add v5.4s, v7.4s, v3.4s + eor v7.16b, v5.16b, v18.16b + zip1 v18.2d, v0.2d, v1.2d + ext v5.16b, v5.16b, v5.16b, #12 + zip2 v0.4s, v1.4s, v0.4s + ushr v17.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + mov v18.s[3], v4.s[3] + orr v7.16b, v7.16b, v17.16b + ext v17.16b, v18.16b, v18.16b, #12 + add v6.4s, v7.4s, v6.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v19.16b, v6.16b, v3.16b + uzp1 v3.4s, v18.4s, v17.4s + ext v6.16b, v6.16b, v6.16b, #4 + ushr v17.4s, v19.4s, #7 + shl v18.4s, v19.4s, #25 + add v5.4s, v5.4s, v3.4s + orr v17.16b, v18.16b, v17.16b + add v5.4s, v5.4s, v17.4s + eor v7.16b, v7.16b, v5.16b + rev32 v7.8h, v7.8h + add v1.4s, v6.4s, v7.4s + zip1 v6.4s, v0.4s, v4.4s + zip1 v0.4s, v4.4s, v0.4s + eor v4.16b, v1.16b, v17.16b + ext v6.16b, v0.16b, v6.16b, #8 + ushr v0.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v5.4s, v5.4s, v6.4s + zip1 v20.2d, v6.2d, v2.2d + orr v0.16b, v4.16b, v0.16b + mov v20.s[3], v3.s[3] + add v4.4s, v5.4s, v0.4s + eor v5.16b, v4.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + uzp1 v17.4s, v7.4s, v7.4s + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v1.4s, v5.4s, v1.4s + uzp2 v16.4s, v16.4s, v2.4s + zip2 v2.4s, v2.4s, v6.4s + eor v0.16b, v1.16b, v0.16b + add v4.4s, v4.4s, v16.4s + ext v1.16b, v1.16b, v1.16b, #12 + ext v16.16b, v16.16b, v16.16b, #4 + ushr v17.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v0.16b, v17.16b + ext v0.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #12 + add v4.4s, v4.4s, v17.4s + eor v0.16b, v0.16b, v4.16b + rev32 v18.8h, v0.8h + ext v0.16b, v7.16b, v5.16b, #12 + mov v5.16b, v6.16b + add v7.4s, v1.4s, v18.4s + rev64 v1.4s, v0.4s + mov v5.s[1], v3.s[2] + eor v17.16b, v7.16b, v17.16b + trn2 v1.4s, v1.4s, v5.4s + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v1.4s + orr v17.16b, v17.16b, v19.16b + add v19.4s, v4.4s, v17.4s + eor v4.16b, v19.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + ushr v18.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + orr v18.16b, v4.16b, v18.16b + ext v4.16b, v20.16b, v20.16b, #12 + add v7.4s, v18.4s, v7.4s + uzp1 v4.4s, v20.4s, v4.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v17.16b, v7.16b, v17.16b + add v19.4s, v19.4s, v4.4s + ext v7.16b, v7.16b, v7.16b, #4 + ushr v20.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v17.16b, v17.16b, v20.16b + add v19.4s, v19.4s, v17.4s + eor v18.16b, v18.16b, v19.16b + rev32 v18.8h, v18.8h + add v6.4s, v7.4s, v18.4s + zip1 v7.4s, v2.4s, v3.4s + zip1 v2.4s, v3.4s, v2.4s + eor v3.16b, v6.16b, v17.16b + ext v2.16b, v2.16b, v7.16b, #8 + ushr v7.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v17.4s, v19.4s, v2.4s + zip1 v1.2d, v2.2d, v1.2d + zip2 v0.4s, v0.4s, v2.4s + orr v3.16b, v3.16b, v7.16b + mov v1.s[3], v4.s[3] + add v7.4s, v17.4s, v3.4s + eor v17.16b, v7.16b, v18.16b + ext v7.16b, v7.16b, v7.16b, #4 + ushr v18.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + orr v17.16b, v17.16b, v18.16b + ext v18.16b, v16.16b, v16.16b, #8 + add v6.4s, v17.4s, v6.4s + uzp2 v5.4s, v18.4s, v5.4s + eor v3.16b, v6.16b, v3.16b + ext v5.16b, v5.16b, v18.16b, #4 + ext v6.16b, v6.16b, v6.16b, #12 + ushr v18.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + add v5.4s, v7.4s, v5.4s + ext v7.16b, v17.16b, v17.16b, #8 + ext v17.16b, v16.16b, v16.16b, #12 + orr v3.16b, v3.16b, v18.16b + ext v16.16b, v16.16b, v17.16b, #12 + add v5.4s, v3.4s, v5.4s + mov v17.16b, v2.16b + rev64 v16.4s, v16.4s + eor v7.16b, v7.16b, v5.16b + mov v17.s[1], v4.s[2] + rev32 v7.8h, v7.8h + trn2 v16.4s, v16.4s, v17.4s + add v6.4s, v6.4s, v7.4s + add v5.4s, v5.4s, v16.4s + eor v3.16b, v6.16b, v3.16b + ushr v17.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v17.16b + add v5.4s, v5.4s, v3.4s + eor v7.16b, v5.16b, v7.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v16.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + orr v7.16b, v7.16b, v16.16b + ext v16.16b, v1.16b, v1.16b, #12 + add v6.4s, v7.4s, v6.4s + uzp1 v1.4s, v1.4s, v16.4s + eor v3.16b, v6.16b, v3.16b + add v1.4s, v5.4s, v1.4s + ext v5.16b, v7.16b, v7.16b, #8 + ext v6.16b, v6.16b, v6.16b, #4 + ushr v16.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v16.16b + add v1.4s, v1.4s, v3.4s + eor v5.16b, v5.16b, v1.16b + rev32 v5.8h, v5.8h + add v2.4s, v6.4s, v5.4s + zip1 v6.4s, v0.4s, v4.4s + zip1 v0.4s, v4.4s, v0.4s + eor v3.16b, v2.16b, v3.16b + ext v0.16b, v0.16b, v6.16b, #8 ushr v4.4s, v3.4s, #12 shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v3.4s - eor v1.16b, v0.16b, v1.16b - ushr v4.4s, v1.4s, #8 - shl v1.4s, v1.4s, #24 - orr v1.16b, v1.16b, v4.16b - add v2.4s, v1.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 + add v0.4s, v1.4s, v0.4s + orr v1.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v1.4s + eor v3.16b, v0.16b, v5.16b ext v0.16b, v0.16b, v0.16b, #4 - ext v1.16b, v1.16b, v1.16b, #8 - ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 orr v3.16b, v3.16b, v4.16b - eor v0.16b, v2.16b, v0.16b - eor v3.16b, v3.16b, v1.16b - stp q0, q3, [x5] - ldr q0, [x0] - eor v0.16b, v0.16b, v2.16b - str q0, [x5, #32] - ldr q0, [x0, #16] - eor v0.16b, v0.16b, v1.16b - str q0, [x5, #48] + add v2.4s, v3.4s, v2.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v1.16b, v2.16b, v1.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + stp q2, q3, [x0, #32] + orr v1.16b, v1.16b, v4.16b + stp q0, q1, [x0] ret .Lfunc_end1: - .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2 + .size compress_pre, .Lfunc_end1-compress_pre + .cfi_endproc + + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: + .cfi_startproc + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + stp x20, x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x20, x0 + mov x19, x5 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x20 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr q0, [x20] + eor v0.16b, v0.16b, v2.16b + str q0, [x19, #32] + ldr q0, [x20, #16] + eor v0.16b, v0.16b, v3.16b + str q0, [x19, #48] + ldp x20, x19, [sp, #80] + add sp, sp, #96 + hint #29 + ret +.Lfunc_end2: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 .p2align 4 -.LCPI2_0: +.LCPI3_0: .word 0 .word 1 .word 2 @@ -991,19 +607,21 @@ zfs_blake3_compress_xof_sse2: .type zfs_blake3_hash_many_sse2,@function zfs_blake3_hash_many_sse2: .cfi_startproc + hint #25 + .cfi_negate_ra_state stp d15, d14, [sp, #-160]! stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] stp x29, x30, [sp, #64] + add x29, sp, #64 stp x28, x27, [sp, #80] stp x26, x25, [sp, #96] stp x24, x23, [sp, #112] stp x22, x21, [sp, #128] stp x20, x19, [sp, #144] - mov x29, sp - sub sp, sp, #384 - .cfi_def_cfa w29, 160 + sub sp, sp, #464 + .cfi_def_cfa w29, 96 .cfi_offset w19, -8 .cfi_offset w20, -16 .cfi_offset w21, -24 @@ -1024,1414 +642,1406 @@ zfs_blake3_hash_many_sse2: .cfi_offset b13, -144 .cfi_offset b14, -152 .cfi_offset b15, -160 - ldr x26, [x29, #168] - ldrb w27, [x29, #160] mov w19, w6 mov x20, x4 - mov x22, x2 - mov x28, x1 + mov x24, x1 + ldr x26, [x29, #104] + ldrb w27, [x29, #96] cmp x1, #4 - mov x24, x0 str x3, [sp, #40] - b.lo .LBB2_8 - adrp x9, .LCPI2_0 - ldr q0, [x9, :lo12:.LCPI2_0] - sbfx w11, w5, #0, #1 - dup v1.4s, w11 - mov w9, #58983 + b.lo .LBB3_6 + adrp x8, .LCPI3_0 + sbfx w9, w5, #0, #1 mov w10, #44677 - and v0.16b, v1.16b, v0.16b mov w11, #62322 - mov w12, #62778 - orr w8, w7, w19 - movk w9, #27145, lsl #16 movk w10, #47975, lsl #16 movk w11, #15470, lsl #16 + ldr q0, [x8, :lo12:.LCPI3_0] + dup v1.4s, w9 + mov w9, #58983 + orr w8, w7, w19 + movk w9, #27145, lsl #16 + and v0.16b, v1.16b, v0.16b + dup v1.4s, w11 + movi v24.4s, #64 + dup v2.4s, w9 + mov w9, #62778 + movk w9, #42319, lsl #16 str q0, [sp, #16] orr v0.4s, #128, lsl #24 - movk w12, #42319, lsl #16 + stp q2, q1, [sp, #48] str q0, [sp] -.LBB2_2: - ldr x0, [sp, #40] - mov x13, x0 - ld1r { v20.4s }, [x13], #4 - add x14, x0, #8 - add x15, x0, #12 - add x16, x0, #16 - add x17, x0, #20 - add x18, x0, #24 - add x0, x0, #28 - ld1r { v17.4s }, [x14] - ld1r { v6.4s }, [x15] - ld1r { v8.4s }, [x16] - ld1r { v9.4s }, [x17] - ld1r { v31.4s }, [x18] - ld1r { v26.4s }, [x13] - ld1r { v15.4s }, [x0] - cbz x22, .LBB2_7 + dup v0.4s, w10 + str q0, [sp, #80] + b .LBB3_3 +.LBB3_2: + zip1 v0.4s, v12.4s, v31.4s + add x10, x20, #4 + zip1 v1.4s, v29.4s, v30.4s + tst w5, #0x1 + zip1 v2.4s, v28.4s, v23.4s + csel x20, x10, x20, ne + zip1 v3.4s, v13.4s, v25.4s + add x0, x0, #32 + zip2 v6.4s, v12.4s, v31.4s + sub x24, x24, #4 + zip1 v4.2d, v0.2d, v1.2d + cmp x24, #3 + zip2 v7.4s, v29.4s, v30.4s + zip1 v5.2d, v2.2d, v3.2d + zip2 v0.2d, v0.2d, v1.2d + zip2 v1.2d, v2.2d, v3.2d + zip2 v2.4s, v28.4s, v23.4s + zip2 v3.4s, v13.4s, v25.4s + stp q4, q5, [x26] + zip2 v4.2d, v6.2d, v7.2d + stp q0, q1, [x26, #32] + zip1 v0.2d, v6.2d, v7.2d + zip1 v1.2d, v2.2d, v3.2d + zip2 v2.2d, v2.2d, v3.2d + stp q0, q1, [x26, #64] + stp q4, q2, [x26, #96] + add x26, x26, #128 + b.ls .LBB3_6 +.LBB3_3: + ldr x14, [sp, #40] + mov x10, x14 + add x11, x14, #8 + add x12, x14, #12 + add x13, x14, #16 + ld1r { v12.4s }, [x10], #4 + ld1r { v29.4s }, [x11] + add x11, x14, #20 + ld1r { v30.4s }, [x12] + add x12, x14, #24 + ld1r { v28.4s }, [x13] + ld1r { v23.4s }, [x11] + add x11, x14, #28 + ld1r { v13.4s }, [x12] + ld1r { v31.4s }, [x10] + ld1r { v25.4s }, [x11] + cbz x2, .LBB3_2 ldr q1, [sp, #16] dup v0.4s, w20 - ldp x13, x14, [x24] - ldp x15, x16, [x24, #16] + lsr x12, x20, #32 + mov x10, xzr + ldp x13, x14, [x0, #16] add v1.4s, v0.4s, v1.4s + mov x15, x2 movi v0.4s, #128, lsl #24 - str q1, [sp, #64] + mov w4, w8 + str q1, [sp, #112] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] - lsr x18, x20, #32 - mov x17, xzr cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w18 + dup v1.4s, w12 + ldp x11, x12, [x0] sub v0.4s, v1.4s, v0.4s - mov w18, w8 - str q0, [sp, #48] -.LBB2_4: - mov w2, #16 - bfi x2, x17, #6, #58 - ldr q1, [x13, x2] - ldr q3, [x14, x2] - ldr q2, [x15, x2] - ldr q4, [x16, x2] - mov w2, #32 - bfi x2, x17, #6, #58 - ldr q5, [x13, x2] - ldr q18, [x14, x2] - ldr q19, [x15, x2] - ldr q23, [x16, x2] - mov w2, #48 - lsl x3, x17, #6 - bfi x2, x17, #6, #58 - add x17, x17, #1 - ldr q0, [x13, x3] - ldr q21, [x14, x3] - ldr q7, [x15, x3] - ldr q16, [x16, x3] - cmp x17, x22 - ldr q13, [x13, x2] - ldr q14, [x14, x2] - ldr q29, [x15, x2] - ldr q10, [x16, x2] - csel w2, w27, wzr, eq - orr w18, w2, w18 - mov x0, xzr - and w18, w18, #0xff - add x3, x3, #256 -.LBB2_5: - ldr x2, [x24, x0] - add x0, x0, #8 - cmp x0, #32 - add x2, x2, x3 - prfm pldl1keep, [x2] - b.ne .LBB2_5 - dup v22.4s, w18 - str q22, [sp, #192] - zip1 v27.4s, v0.4s, v21.4s - zip2 v21.4s, v0.4s, v21.4s - zip1 v0.4s, v7.4s, v16.4s - zip2 v22.4s, v7.4s, v16.4s - zip1 v7.4s, v1.4s, v3.4s - zip1 v25.4s, v2.4s, v4.4s - zip2 v16.4s, v2.4s, v4.4s - zip1 v11.4s, v19.4s, v23.4s - zip2 v12.4s, v19.4s, v23.4s - zip1 v19.4s, v13.4s, v14.4s - zip2 v23.4s, v13.4s, v14.4s - zip1 v13.4s, v29.4s, v10.4s - zip2 v14.4s, v29.4s, v10.4s - add v10.4s, v20.4s, v8.4s - add v2.4s, v26.4s, v9.4s - ext v20.16b, v22.16b, v21.16b, #8 - ext v26.16b, v25.16b, v7.16b, #8 - zip2 v24.4s, v1.4s, v3.4s - add v1.4s, v6.4s, v15.4s - ext v6.16b, v0.16b, v27.16b, #8 - ext v20.16b, v21.16b, v20.16b, #8 - mov v21.d[1], v22.d[0] - ext v22.16b, v7.16b, v26.16b, #8 - mov v7.d[1], v25.d[0] - add v3.4s, v17.4s, v31.4s - str q1, [sp, #144] - ext v1.16b, v27.16b, v6.16b, #8 - mov v6.16b, v7.16b - zip1 v28.4s, v5.4s, v18.4s - stur q1, [x29, #-80] - mov v1.16b, v27.16b - mov v27.16b, v24.16b - add v3.4s, v3.4s, v6.4s - ldr q6, [sp, #64] - ext v29.16b, v16.16b, v24.16b, #8 - mov v1.d[1], v0.d[0] - ext v0.16b, v11.16b, v28.16b, #8 - mov v27.d[1], v16.d[0] - ext v16.16b, v14.16b, v23.16b, #8 - stur q7, [x29, #-144] - ext v7.16b, v24.16b, v29.16b, #8 - ext v29.16b, v28.16b, v0.16b, #8 - ext v0.16b, v23.16b, v16.16b, #8 - mov v23.d[1], v14.d[0] - stp q0, q23, [sp, #80] - add v0.4s, v10.4s, v1.4s - eor v16.16b, v0.16b, v6.16b - ldr q6, [sp, #48] - add v2.4s, v2.4s, v21.4s - mov v28.d[1], v11.d[0] - zip2 v18.4s, v5.4s, v18.4s - eor v10.16b, v2.16b, v6.16b - movi v6.4s, #64 - eor v11.16b, v3.16b, v6.16b - ldr q6, [sp, #144] - dup v17.4s, w9 - ext v30.16b, v12.16b, v18.16b, #8 - rev32 v16.8h, v16.8h - dup v5.4s, w10 - ext v25.16b, v18.16b, v30.16b, #8 - mov v30.16b, v23.16b - mov v23.16b, v1.16b - str q1, [sp, #160] - rev32 v10.8h, v10.8h - add v1.4s, v16.4s, v17.4s - add v17.4s, v6.4s, v27.4s - ldr q6, [sp, #192] - dup v4.4s, w11 - rev32 v11.8h, v11.8h - add v5.4s, v10.4s, v5.4s - eor v8.16b, v1.16b, v8.16b - stur q21, [x29, #-128] - mov v18.d[1], v12.d[0] - add v4.4s, v11.4s, v4.4s - eor v9.16b, v5.16b, v9.16b - ushr v12.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - ldur q21, [x29, #-80] - ext v26.16b, v13.16b, v19.16b, #8 - eor v31.16b, v4.16b, v31.16b - orr v8.16b, v8.16b, v12.16b - ushr v12.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - ext v26.16b, v19.16b, v26.16b, #8 - mov v19.d[1], v13.d[0] - orr v9.16b, v9.16b, v12.16b - ushr v12.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v17.16b, v6.16b - orr v31.16b, v31.16b, v12.16b - dup v12.4s, w12 - rev32 v13.8h, v13.8h - add v12.4s, v13.4s, v12.4s - add v0.4s, v0.4s, v21.4s - eor v14.16b, v12.16b, v15.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v20.4s - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v22.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v28.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v18.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v19.4s - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v30.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - mov v24.16b, v7.16b - stur q7, [x29, #-112] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - mov v7.16b, v26.16b - add v3.4s, v3.4s, v26.4s - ldr q26, [sp, #80] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - add v0.4s, v0.4s, v29.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v25.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v13.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - str q22, [sp, #128] - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - ldur q22, [x29, #-128] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - eor v8.16b, v5.16b, v8.16b - mov v6.16b, v18.16b - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - ldur q18, [x29, #-144] - orr v8.16b, v8.16b, v15.16b - add v0.4s, v0.4s, v22.4s - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v20.4s - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v24.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v18.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v13.16b, v17.16b, v13.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v13.8h, v13.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v13.4s + str q0, [sp, #96] +.LBB3_5: + add x17, x11, x10 + add x21, x12, x10 + add x16, x13, x10 + add x6, x14, x10 + subs x15, x15, #1 + add x10, x10, #64 + ldp q0, q1, [x17] + csel w3, w27, wzr, eq + orr w3, w3, w4 + mov w4, w19 + and w3, w3, #0xff + ldp q3, q6, [x21] + dup v2.4s, w3 + zip1 v21.4s, v0.4s, v3.4s + zip2 v19.4s, v0.4s, v3.4s + ldp q5, q7, [x16] + zip1 v17.4s, v1.4s, v6.4s + zip2 v22.4s, v1.4s, v6.4s + ldp q16, q18, [x6] + zip1 v4.4s, v5.4s, v16.4s + zip2 v0.4s, v5.4s, v16.4s + ldp q26, q27, [x17, #32] + zip1 v1.4s, v7.4s, v18.4s + zip2 v3.4s, v7.4s, v18.4s + zip2 v20.2d, v19.2d, v0.2d + mov v19.d[1], v0.d[0] + dup v18.4s, w9 + ldp q8, q9, [x21, #32] + stur q19, [x29, #-208] + zip2 v7.4s, v26.4s, v8.4s + zip1 v10.4s, v26.4s, v8.4s + ldp q11, q5, [x16, #32] + zip2 v26.2d, v17.2d, v1.2d + stp q7, q26, [sp, #192] + mov v17.d[1], v1.d[0] + add v1.4s, v23.4s, v31.4s + ldp q16, q6, [x6, #32] + stur q17, [x29, #-256] + add v1.4s, v1.4s, v19.4s + zip1 v8.4s, v11.4s, v16.4s + zip2 v7.4s, v11.4s, v16.4s + zip1 v11.4s, v27.4s, v9.4s + zip2 v9.4s, v27.4s, v9.4s + zip2 v27.2d, v21.2d, v4.2d + mov v21.d[1], v4.d[0] + str q7, [sp, #224] + add v4.4s, v28.4s, v12.4s + zip1 v15.4s, v5.4s, v6.4s + zip2 v14.4s, v5.4s, v6.4s + stur q27, [x29, #-192] + zip2 v16.2d, v22.2d, v3.2d + stp q20, q21, [x29, #-240] + add v0.4s, v4.4s, v21.4s + ldp q6, q4, [sp, #96] + mov v22.d[1], v3.d[0] + add v5.4s, v25.4s, v30.4s + add v3.4s, v13.4s, v29.4s + eor v6.16b, v1.16b, v6.16b + add v1.4s, v1.4s, v20.4s + str q22, [sp, #256] + eor v4.16b, v0.16b, v4.16b + add v5.4s, v5.4s, v22.4s + add v3.4s, v3.4s, v17.4s + ldr q17, [sp, #48] + rev32 v6.8h, v6.8h + rev32 v4.8h, v4.8h + eor v2.16b, v5.16b, v2.16b + eor v7.16b, v3.16b, v24.16b add v0.4s, v0.4s, v27.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v6.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v23.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v21.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v19.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v29.4s - str q28, [sp, #112] - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldp q28, q23, [sp, #112] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ldr q21, [sp, #96] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - add v0.4s, v0.4s, v25.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v23.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v21.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v28.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v13.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - eor v8.16b, v5.16b, v8.16b - mov v30.16b, v29.16b - mov v29.16b, v25.16b - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - ldur q25, [x29, #-112] - orr v8.16b, v8.16b, v15.16b - add v0.4s, v0.4s, v20.4s - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v6.4s - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v7.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v25.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v13.16b, v17.16b, v13.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v13.8h, v13.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v13.4s - add v0.4s, v0.4s, v18.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v19.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v22.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v21.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v27.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v30.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v29.4s - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v28.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - ldr q24, [sp, #160] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - stur q7, [x29, #-64] - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - mov v7.16b, v26.16b - add v3.4s, v3.4s, v26.4s - ldur q26, [x29, #-80] - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - add v0.4s, v0.4s, v23.4s - orr v8.16b, v8.16b, v15.16b - add v15.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v24.4s - eor v0.16b, v15.16b, v13.16b - add v2.4s, v2.4s, v31.4s - ushr v13.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - orr v0.16b, v0.16b, v13.16b - ushr v13.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v13.16b - ushr v13.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v13.16b - ushr v13.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v13.16b - ushr v13.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - orr v9.16b, v9.16b, v13.16b - ushr v13.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - add v1.4s, v10.4s, v1.4s - orr v31.16b, v31.16b, v13.16b - eor v13.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - ushr v14.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v14.16b - ushr v14.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - stur q6, [x29, #-96] - orr v8.16b, v8.16b, v14.16b - add v14.4s, v15.4s, v6.4s - ldur q6, [x29, #-64] - mov v18.16b, v19.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v18.4s - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v21.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v6.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h + add v21.4s, v4.4s, v17.4s + rev32 v31.8h, v2.8h + ldr q2, [sp, #80] + rev32 v7.8h, v7.8h + mov v27.16b, v16.16b + eor v17.16b, v21.16b, v28.16b + add v29.4s, v6.4s, v2.4s + ldr q2, [sp, #64] + add v24.4s, v31.4s, v18.4s str q27, [sp, #176] - mov v27.16b, v30.16b - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - add v14.4s, v14.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v27.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v20.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - mov v30.16b, v23.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v0.16b, v17.16b, v0.16b - add v1.4s, v16.4s, v1.4s - ldur q23, [x29, #-144] - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v0.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v23.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v29.4s - orr v13.16b, v13.16b, v15.16b - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v30.4s - rev32 v0.8h, v0.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v30.4s, v7.4s, v2.4s + eor v18.16b, v29.16b, v23.16b + orr v12.16b, v17.16b, v19.16b + eor v17.16b, v30.16b, v13.16b + eor v19.16b, v24.16b, v25.16b + ushr v23.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + ushr v25.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + ushr v28.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v13.16b, v18.16b, v23.16b + orr v25.16b, v17.16b, v25.16b + orr v2.16b, v19.16b, v28.16b + add v28.4s, v0.4s, v12.4s + add v0.4s, v3.4s, v26.4s + add v18.4s, v1.4s, v13.4s + add v3.4s, v5.4s, v16.4s + eor v1.16b, v28.16b, v4.16b + add v17.4s, v0.4s, v25.4s + eor v0.16b, v18.16b, v6.16b + add v19.4s, v3.4s, v2.4s + ushr v16.4s, v1.4s, #8 + shl v3.4s, v1.4s, #24 + eor v4.16b, v17.16b, v7.16b + ushr v6.4s, v0.4s, #8 + shl v1.4s, v0.4s, #24 + eor v5.16b, v19.16b, v31.16b + ushr v23.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + orr v7.16b, v3.16b, v16.16b + orr v6.16b, v1.16b, v6.16b + ushr v31.4s, v5.4s, #8 + shl v0.4s, v5.4s, #24 + orr v5.16b, v4.16b, v23.16b + add v4.4s, v7.4s, v21.4s + ldr q21, [sp, #192] + add v3.4s, v6.4s, v29.4s + orr v31.16b, v0.16b, v31.16b + add v23.4s, v5.4s, v30.4s + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v3.16b, v13.16b + add v16.4s, v31.4s, v24.4s + eor v20.16b, v23.16b, v25.16b + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v29.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v30.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v25.16b, v0.16b, v24.16b + orr v0.16b, v1.16b, v29.16b + mov v29.16b, v10.16b + orr v1.16b, v20.16b, v30.16b + mov v20.16b, v10.16b + mov v24.16b, v21.16b + ldr q20, [sp, #224] + mov v29.d[1], v8.d[0] + mov v13.16b, v9.16b + zip2 v30.2d, v10.2d, v8.2d + zip2 v8.2d, v21.2d, v20.2d + mov v26.16b, v11.16b + mov v24.d[1], v20.d[0] + add v20.4s, v28.4s, v29.4s + mov v13.d[1], v14.d[0] + str q8, [sp, #128] + eor v2.16b, v16.16b, v2.16b + mov v26.d[1], v15.d[0] + str q24, [sp, #192] + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v13.4s + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + zip2 v10.2d, v9.2d, v14.2d + add v18.4s, v18.4s, v24.4s add v17.4s, v17.4s, v26.4s - add v4.4s, v4.4s, v0.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldur q22, [x29, #-128] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - ldr q26, [sp, #176] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v5.4s, v11.4s - add v14.4s, v14.4s, v24.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v22.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v28.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v26.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v15.16b - add v14.4s, v14.4s, v18.4s - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v27.4s - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v7.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s + mov v14.16b, v26.16b + eor v26.16b, v20.16b, v31.16b + stp q10, q30, [sp, #224] + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v12.16b + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v30.4s + zip2 v21.2d, v11.2d, v15.2d + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v10.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v8.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b add v17.4s, v17.4s, v21.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - add v14.4s, v14.4s, v6.4s - ldur q6, [x29, #-96] - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h - stur q20, [x29, #-160] - mov v20.16b, v29.16b - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - mov v19.16b, v29.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v20.4s - mov v19.16b, v28.16b - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v6.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v19.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v0.16b, v17.16b, v0.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v0.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v25.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v30.4s - orr v13.16b, v13.16b, v15.16b - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v24.4s - rev32 v0.8h, v0.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v26.4s - mov v29.16b, v27.16b - add v4.4s, v4.4s, v0.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldur q27, [x29, #-160] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ldur q6, [x29, #-80] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v5.4s, v11.4s - add v14.4s, v14.4s, v22.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v27.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v6.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v23.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v15.16b - add v14.4s, v14.4s, v29.4s - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v20.4s - mov v28.16b, v7.16b - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v19.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v28.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - add v14.4s, v14.4s, v21.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v30.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - add v3.4s, v3.4s, v18.4s - orr v10.16b, v10.16b, v15.16b - add v15.4s, v3.4s, v31.4s - eor v3.16b, v15.16b, v11.16b - ushr v11.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v11.16b, v3.16b, v11.16b - add v3.4s, v17.4s, v6.4s - add v17.4s, v3.4s, v13.4s - eor v0.16b, v17.16b, v0.16b - ushr v3.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - add v1.4s, v16.4s, v1.4s - orr v0.16b, v0.16b, v3.16b - eor v3.16b, v1.16b, v8.16b - ushr v8.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - add v5.4s, v10.4s, v5.4s - orr v8.16b, v3.16b, v8.16b - eor v3.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - ushr v9.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - eor v31.16b, v4.16b, v31.16b - mov v7.16b, v23.16b - mov v23.16b, v28.16b - mov v28.16b, v6.16b - orr v3.16b, v3.16b, v9.16b - ushr v9.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - ldur q6, [x29, #-64] - orr v31.16b, v31.16b, v9.16b - add v9.4s, v0.4s, v12.4s - eor v12.16b, v9.16b, v13.16b - ushr v13.4s, v12.4s, #7 - shl v12.4s, v12.4s, #25 - orr v12.16b, v12.16b, v13.16b - add v13.4s, v14.4s, v6.4s - add v13.4s, v13.4s, v3.4s - eor v0.16b, v13.16b, v0.16b - add v2.4s, v2.4s, v24.4s - rev32 v14.8h, v0.8h - add v0.4s, v2.4s, v31.4s - add v6.4s, v4.4s, v14.4s - eor v2.16b, v0.16b, v16.16b - eor v3.16b, v6.16b, v3.16b - rev32 v16.8h, v2.8h - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v2.4s, v9.4s, v16.4s - orr v4.16b, v3.16b, v4.16b - eor v3.16b, v2.16b, v31.16b - ushr v31.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v31.16b - add v31.4s, v15.4s, v22.4s - add v31.4s, v31.4s, v12.4s - add v17.4s, v17.4s, v7.4s - eor v9.16b, v31.16b, v10.16b - add v17.4s, v17.4s, v8.4s - rev32 v9.8h, v9.8h - eor v11.16b, v17.16b, v11.16b - add v1.4s, v1.4s, v9.4s - rev32 v11.8h, v11.8h - eor v10.16b, v1.16b, v12.16b - add v5.4s, v5.4s, v11.4s - ushr v12.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v8.16b, v5.16b, v8.16b - orr v10.16b, v10.16b, v12.16b - ushr v12.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - orr v8.16b, v8.16b, v12.16b - add v12.4s, v13.4s, v27.4s - add v12.4s, v12.4s, v4.4s - eor v13.16b, v12.16b, v14.16b - ldur q14, [x29, #-96] - mov v25.16b, v29.16b - add v29.4s, v12.4s, v20.4s - add v20.4s, v31.4s, v26.4s - add v0.4s, v0.4s, v14.4s - add v0.4s, v0.4s, v3.4s - eor v16.16b, v0.16b, v16.16b - add v0.4s, v0.4s, v30.4s - ldur q30, [x29, #-112] - add v20.4s, v20.4s, v10.4s - eor v31.16b, v20.16b, v9.16b - add v20.4s, v20.4s, v28.4s - add v17.4s, v17.4s, v30.4s - add v17.4s, v17.4s, v8.4s - eor v9.16b, v17.16b, v11.16b - ushr v28.4s, v13.4s, #8 - shl v11.4s, v13.4s, #24 - orr v28.16b, v11.16b, v28.16b - ushr v11.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v16.16b, v16.16b, v11.16b - ushr v11.4s, v31.4s, #8 - shl v31.4s, v31.4s, #24 - add v6.4s, v28.4s, v6.4s - orr v31.16b, v31.16b, v11.16b - ushr v11.4s, v9.4s, #8 - shl v9.4s, v9.4s, #24 - add v2.4s, v16.4s, v2.4s - eor v4.16b, v6.16b, v4.16b - orr v9.16b, v9.16b, v11.16b - add v1.4s, v31.4s, v1.4s - eor v3.16b, v2.16b, v3.16b - ushr v11.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v5.4s, v9.4s, v5.4s - eor v10.16b, v1.16b, v10.16b - orr v4.16b, v4.16b, v11.16b - ushr v11.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v3.16b, v3.16b, v11.16b - ushr v11.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - orr v10.16b, v10.16b, v11.16b - ushr v11.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v11.16b - add v29.4s, v29.4s, v8.4s - eor v16.16b, v29.16b, v16.16b - add v0.4s, v0.4s, v4.4s - mov v12.16b, v26.16b - add v17.4s, v17.4s, v19.4s - add v26.4s, v29.4s, v23.4s - eor v29.16b, v0.16b, v31.16b - add v20.4s, v20.4s, v3.4s - rev32 v16.8h, v16.8h - stur q18, [x29, #-176] - mov v18.16b, v27.16b - add v0.4s, v0.4s, v24.4s - eor v27.16b, v20.16b, v9.16b - add v17.4s, v17.4s, v10.4s - rev32 v24.8h, v29.8h - add v1.4s, v1.4s, v16.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + ldp q28, q12, [x29, #-256] + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v12.4s + mov v15.16b, v29.16b + ldur q29, [x29, #-208] + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + str q15, [sp, #160] + add v20.4s, v20.4s, v29.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v27.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v28.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v24.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v22.4s + add v18.4s, v18.4s, v0.4s + mov v9.16b, v30.16b + mov v30.16b, v21.16b + ldur q21, [x29, #-224] + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + str q30, [sp, #144] + add v17.4s, v17.4s, v21.4s + ldur q21, [x29, #-192] + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v30.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v21.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v10.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v9.4s + ldr q9, [sp, #208] + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v8.4s + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v15.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v9.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v13.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v24.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v12.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v30.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v27.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v14.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v28.4s + add v18.4s, v18.4s, v0.4s + mov v10.16b, v13.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b add v20.4s, v20.4s, v25.4s - eor v25.16b, v17.16b, v28.16b - rev32 v27.8h, v27.8h - add v5.4s, v5.4s, v24.4s - eor v28.16b, v1.16b, v8.16b - rev32 v25.8h, v25.8h - add v6.4s, v6.4s, v27.4s - eor v4.16b, v5.16b, v4.16b - ushr v31.4s, v28.4s, #12 - shl v28.4s, v28.4s, #20 - add v2.4s, v2.4s, v25.4s - eor v3.16b, v6.16b, v3.16b - orr v28.16b, v28.16b, v31.16b - ushr v31.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - eor v29.16b, v2.16b, v10.16b - orr v4.16b, v4.16b, v31.16b - ushr v31.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v26.4s, v26.4s, v28.4s - orr v3.16b, v3.16b, v31.16b - ushr v31.4s, v29.4s, #12 - shl v29.4s, v29.4s, #20 - eor v16.16b, v26.16b, v16.16b - add v0.4s, v0.4s, v4.4s - add v17.4s, v17.4s, v12.4s - orr v29.16b, v29.16b, v31.16b - eor v24.16b, v0.16b, v24.16b - add v0.4s, v0.4s, v22.4s - add v20.4s, v20.4s, v3.4s - ushr v22.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - add v23.4s, v26.4s, v21.4s - eor v21.16b, v20.16b, v27.16b add v17.4s, v17.4s, v29.4s - orr v16.16b, v16.16b, v22.16b - ushr v22.4s, v24.4s, #8 - shl v24.4s, v24.4s, #24 - eor v25.16b, v17.16b, v25.16b - orr v22.16b, v24.16b, v22.16b + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v10.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + mov v22.16b, v8.16b + ldp q8, q28, [sp, #240] + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v28.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v15.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v8.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v22.4s + ldur q22, [x29, #-256] + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v9.4s + mov v13.16b, v12.16b + mov v12.16b, v27.16b + mov v27.16b, v9.16b + ldur q9, [x29, #-192] + mov v21.16b, v15.16b + ldr q15, [sp, #224] + ushr v11.4s, v1.4s, #12 + ldur q21, [x29, #-224] + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v9.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v21.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v15.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v24.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v10.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v30.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v8.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v12.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v13.4s + ldr q13, [sp, #160] + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v15.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v22.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + mov v29.16b, v14.16b + ldr q14, [sp, #128] + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v27.4s + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v21.4s + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v28.4s + add v20.4s, v20.4s, v0.4s + mov v12.16b, v27.16b + ldur q27, [x29, #-208] + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v27.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v13.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v8.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v29.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v15.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v10.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v14.4s + mov v30.16b, v29.16b + mov v29.16b, v15.16b + ldr q15, [sp, #144] + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v15.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v24.4s + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v13.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + mov v9.16b, v28.16b + mov v28.16b, v10.16b + ldr q10, [sp, #176] + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v10.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v12.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + add v20.4s, v20.4s, v27.4s + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v22.4s + mov v9.16b, v22.16b + ldur q22, [x29, #-240] + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v25.4s + mov v24.16b, v21.16b + ldur q21, [x29, #-192] + orr v2.16b, v2.16b, v11.16b + eor v26.16b, v20.16b, v26.16b + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + add v17.4s, v17.4s, v2.4s + shl v26.4s, v26.4s, #24 + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v23.4s, v26.4s, v23.4s + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + orr v6.16b, v6.16b, v11.16b + eor v0.16b, v23.16b, v0.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + add v4.4s, v6.4s, v4.4s + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v8.4s + add v18.4s, v18.4s, v14.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v13.4s + add v18.4s, v18.4s, v0.4s + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v29.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v18.16b + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + rev32 v6.8h, v6.8h + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + add v3.4s, v3.4s, v6.4s + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + orr v0.16b, v0.16b, v11.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v28.4s + add v18.4s, v18.4s, v12.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v30.4s + add v18.4s, v18.4s, v0.4s + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v21.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v18.16b, v6.16b + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v7.16b, v7.16b, v31.16b + eor v26.16b, v19.16b, v26.16b + orr v6.16b, v6.16b, v11.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + add v3.4s, v6.4s, v3.4s + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + orr v0.16b, v0.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v15.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v24.4s + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + mov v8.16b, v13.16b + ldur q13, [x29, #-208] + orr v2.16b, v2.16b, v11.16b + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v13.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + eor v7.16b, v18.16b, v7.16b + add v17.4s, v17.4s, v2.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + rev32 v7.8h, v7.8h + eor v6.16b, v17.16b, v6.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + add v16.4s, v16.4s, v7.4s + rev32 v6.8h, v6.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + eor v1.16b, v16.16b, v1.16b + add v4.4s, v4.4s, v6.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v0.16b, v0.16b, v31.16b + eor v2.16b, v4.16b, v2.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v20.4s, v20.4s, v22.4s + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v10.4s + mov v27.16b, v12.16b + mov v12.16b, v30.16b + mov v29.16b, v21.16b + mov v21.16b, v24.16b + ldr q24, [sp, #192] + mov v30.16b, v22.16b + ldr q22, [sp, #256] + orr v2.16b, v2.16b, v11.16b + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v24.4s + add v19.4s, v19.4s, v25.4s + add v17.4s, v17.4s, v22.4s + eor v26.16b, v20.16b, v26.16b + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + orr v26.16b, v26.16b, v31.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v23.4s, v26.4s, v23.4s + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + orr v6.16b, v6.16b, v11.16b + eor v0.16b, v23.16b, v0.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + add v4.4s, v6.4s, v4.4s + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + orr v0.16b, v0.16b, v31.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v20.4s, v20.4s, v14.4s + add v18.4s, v18.4s, v27.4s + ldr q27, [sp, #224] + orr v1.16b, v1.16b, v31.16b + orr v2.16b, v2.16b, v11.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v29.4s + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v8.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v18.16b + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + rev32 v6.8h, v6.8h + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + add v3.4s, v3.4s, v6.4s + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v16.4s, v26.4s + ushr v29.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v23.16b, v1.16b + eor v2.16b, v16.16b, v2.16b + orr v25.16b, v25.16b, v29.16b + orr v0.16b, v0.16b, v31.16b + ushr v29.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v31.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v21.4s + ldr q21, [sp, #240] + add v20.4s, v20.4s, v27.4s + prfm pldl1keep, [x17, #256] + orr v1.16b, v1.16b, v29.16b + prfm pldl1keep, [x21, #256] + orr v2.16b, v2.16b, v31.16b + prfm pldl1keep, [x16, #256] + add v18.4s, v18.4s, v0.4s + prfm pldl1keep, [x6, #256] + add v17.4s, v17.4s, v21.4s + add v19.4s, v19.4s, v22.4s + add v20.4s, v20.4s, v25.4s + eor v6.16b, v18.16b, v6.16b + add v17.4s, v17.4s, v1.4s + add v19.4s, v19.4s, v2.4s + eor v7.16b, v20.16b, v7.16b + ushr v22.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + eor v5.16b, v17.16b, v5.16b + eor v26.16b, v19.16b, v26.16b + ushr v21.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + orr v6.16b, v6.16b, v22.16b + ushr v22.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + ushr v29.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + orr v7.16b, v7.16b, v21.16b + orr v5.16b, v5.16b, v22.16b + add v3.4s, v6.4s, v3.4s + orr v21.16b, v26.16b, v29.16b + add v4.4s, v7.4s, v4.4s + add v22.4s, v5.4s, v23.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v21.4s, v16.4s + eor v23.16b, v4.16b, v25.16b + eor v1.16b, v22.16b, v1.16b + ushr v25.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v2.16b, v16.16b, v2.16b + ushr v26.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + orr v0.16b, v0.16b, v25.16b + ushr v25.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v29.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v20.4s, v20.4s, v28.4s + orr v23.16b, v23.16b, v26.16b + orr v1.16b, v1.16b, v25.16b + orr v2.16b, v2.16b, v29.16b + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v13.4s + add v17.4s, v17.4s, v30.4s + add v19.4s, v19.4s, v10.4s + eor v21.16b, v20.16b, v21.16b + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v2.4s + add v19.4s, v19.4s, v23.4s + rev32 v21.8h, v21.8h + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + eor v5.16b, v19.16b, v5.16b + add v22.4s, v22.4s, v21.4s + rev32 v7.8h, v7.8h + rev32 v6.8h, v6.8h + rev32 v5.8h, v5.8h + eor v0.16b, v22.16b, v0.16b + add v16.4s, v16.4s, v7.4s + add v4.4s, v4.4s, v6.4s + add v3.4s, v3.4s, v5.4s + ushr v25.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + eor v23.16b, v3.16b, v23.16b + orr v0.16b, v0.16b, v25.16b + ushr v25.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v26.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ushr v27.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v1.16b, v1.16b, v25.16b + add v20.4s, v20.4s, v24.4s + orr v2.16b, v2.16b, v26.16b + orr v23.16b, v23.16b, v27.16b + add v18.4s, v18.4s, v12.4s + add v17.4s, v17.4s, v9.4s + add v19.4s, v19.4s, v15.4s + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v2.4s + add v19.4s, v19.4s, v23.4s + eor v21.16b, v20.16b, v21.16b + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + eor v5.16b, v19.16b, v5.16b ushr v24.4s, v21.4s, #8 shl v21.4s, v21.4s, #24 + ushr v25.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v26.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + ushr v27.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 orr v21.16b, v21.16b, v24.16b - ushr v24.4s, v25.4s, #8 - shl v25.4s, v25.4s, #24 - add v1.4s, v16.4s, v1.4s - orr v24.16b, v25.16b, v24.16b - add v5.4s, v22.4s, v5.4s - eor v25.16b, v1.16b, v28.16b - add v6.4s, v21.4s, v6.4s - eor v4.16b, v5.16b, v4.16b - ushr v27.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - add v2.4s, v24.4s, v2.4s - eor v3.16b, v6.16b, v3.16b - orr v25.16b, v25.16b, v27.16b - ushr v27.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - ldur q19, [x29, #-176] - eor v26.16b, v2.16b, v29.16b - orr v4.16b, v4.16b, v27.16b - ushr v27.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - orr v3.16b, v3.16b, v27.16b - ushr v27.4s, v26.4s, #7 - shl v26.4s, v26.4s, #25 - add v20.4s, v20.4s, v18.4s - add v17.4s, v17.4s, v30.4s - orr v26.16b, v26.16b, v27.16b - add v0.4s, v0.4s, v3.4s - eor v16.16b, v0.16b, v16.16b - add v0.4s, v0.4s, v19.4s - add v19.4s, v20.4s, v26.4s - add v17.4s, v17.4s, v25.4s - eor v20.16b, v19.16b, v22.16b - add v7.4s, v19.4s, v7.4s - eor v19.16b, v17.16b, v21.16b - ldur q21, [x29, #-64] - add v23.4s, v23.4s, v4.4s - eor v24.16b, v23.16b, v24.16b - rev32 v16.8h, v16.8h - add v17.4s, v17.4s, v21.4s - rev32 v21.8h, v24.8h - add v6.4s, v6.4s, v21.4s - rev32 v20.8h, v20.8h - add v2.4s, v2.4s, v16.4s - eor v4.16b, v6.16b, v4.16b - rev32 v19.8h, v19.8h - add v1.4s, v1.4s, v20.4s - eor v3.16b, v2.16b, v3.16b - ushr v24.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v5.4s, v5.4s, v19.4s - eor v22.16b, v1.16b, v26.16b - orr v4.16b, v4.16b, v24.16b - ushr v24.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v18.4s, v23.4s, v14.4s - eor v23.16b, v5.16b, v25.16b - orr v3.16b, v3.16b, v24.16b - ushr v24.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - orr v22.16b, v22.16b, v24.16b - ushr v24.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v24.16b - add v18.4s, v18.4s, v4.4s - add v0.4s, v0.4s, v3.4s - add v24.4s, v17.4s, v23.4s - eor v17.16b, v18.16b, v21.16b - add v7.4s, v7.4s, v22.4s - eor v16.16b, v0.16b, v16.16b - ushr v21.4s, v17.4s, #8 - shl v17.4s, v17.4s, #24 - eor v20.16b, v7.16b, v20.16b - orr v21.16b, v17.16b, v21.16b - ushr v17.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v19.16b, v24.16b, v19.16b - orr v16.16b, v16.16b, v17.16b - ushr v17.4s, v20.4s, #8 - shl v20.4s, v20.4s, #24 - orr v25.16b, v20.16b, v17.16b - ushr v17.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v17.16b - add v1.4s, v25.4s, v1.4s - eor v22.16b, v1.16b, v22.16b - eor v20.16b, v1.16b, v18.16b - add v1.4s, v19.4s, v5.4s - eor v26.16b, v1.16b, v0.16b - add v0.4s, v21.4s, v6.4s - eor v5.16b, v1.16b, v23.16b - eor v1.16b, v0.16b, v4.16b - eor v17.16b, v0.16b, v7.16b - add v0.4s, v16.4s, v2.4s - eor v2.16b, v0.16b, v3.16b - eor v6.16b, v0.16b, v24.16b - ushr v0.4s, v1.4s, #7 + orr v7.16b, v7.16b, v25.16b + orr v6.16b, v6.16b, v26.16b + orr v5.16b, v5.16b, v27.16b + add v22.4s, v21.4s, v22.4s + add v16.4s, v7.4s, v16.4s + add v4.4s, v6.4s, v4.4s + add v3.4s, v5.4s, v3.4s + eor v0.16b, v22.16b, v0.16b + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + eor v23.16b, v3.16b, v23.16b + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v25.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - orr v0.16b, v1.16b, v0.16b - ushr v1.4s, v2.4s, #7 + ushr v26.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - orr v1.16b, v2.16b, v1.16b - ushr v2.4s, v22.4s, #7 - shl v3.4s, v22.4s, #25 - orr v2.16b, v3.16b, v2.16b - ushr v3.4s, v5.4s, #7 - shl v4.4s, v5.4s, #25 - orr v3.16b, v4.16b, v3.16b - eor v8.16b, v16.16b, v3.16b - eor v9.16b, v25.16b, v0.16b - eor v31.16b, v1.16b, v19.16b - cmp x17, x22 - eor v15.16b, v2.16b, v21.16b - mov w18, w19 - b.ne .LBB2_4 -.LBB2_7: - zip1 v0.4s, v20.4s, v26.4s - zip2 v1.4s, v20.4s, v26.4s - zip1 v2.4s, v17.4s, v6.4s - zip2 v3.4s, v17.4s, v6.4s - zip1 v4.4s, v8.4s, v9.4s - zip2 v5.4s, v8.4s, v9.4s - zip1 v6.4s, v31.4s, v15.4s - zip2 v7.4s, v31.4s, v15.4s - add x13, x20, #4 - tst w5, #0x1 - sub x28, x28, #4 - zip1 v16.2d, v0.2d, v2.2d - zip2 v0.2d, v0.2d, v2.2d - zip1 v2.2d, v1.2d, v3.2d - zip2 v1.2d, v1.2d, v3.2d - zip1 v3.2d, v4.2d, v6.2d - zip2 v4.2d, v4.2d, v6.2d - zip1 v6.2d, v5.2d, v7.2d - zip2 v5.2d, v5.2d, v7.2d - add x24, x24, #32 - csel x20, x13, x20, ne - cmp x28, #3 - stp q16, q3, [x26] - stp q0, q4, [x26, #32] - stp q2, q6, [x26, #64] - stp q1, q5, [x26, #96] - add x26, x26, #128 - b.hi .LBB2_2 -.LBB2_8: - cbz x28, .LBB2_16 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + orr v0.16b, v0.16b, v24.16b + orr v1.16b, v1.16b, v25.16b + orr v2.16b, v2.16b, v26.16b + orr v23.16b, v23.16b, v27.16b + movi v24.4s, #64 + eor v12.16b, v4.16b, v20.16b + eor v31.16b, v18.16b, v3.16b + eor v29.16b, v17.16b, v22.16b + eor v30.16b, v16.16b, v19.16b + eor v28.16b, v7.16b, v23.16b + eor v23.16b, v6.16b, v0.16b + eor v13.16b, v1.16b, v5.16b + eor v25.16b, v2.16b, v21.16b + cbnz x15, .LBB3_5 + b .LBB3_2 +.LBB3_6: + cbz x24, .LBB3_14 orr w8, w7, w19 - and x21, x5, #0x1 - stur w8, [x29, #-64] -.LBB2_10: + and x22, x5, #0x1 + stur w8, [x29, #-192] +.LBB3_8: ldr x8, [sp, #40] - ldr x25, [x24] - ldur w4, [x29, #-64] - ldp q1, q0, [x8] - mov x8, x22 - stp q1, q0, [x29, #-48] -.LBB2_11: - subs x23, x8, #1 - b.eq .LBB2_13 - cbnz x8, .LBB2_14 - b .LBB2_15 -.LBB2_13: - orr w4, w4, w27 -.LBB2_14: - sub x0, x29, #48 - mov w2, #64 - mov x1, x25 - mov x3, x20 - bl zfs_blake3_compress_in_place_sse2 + mov x28, x0 + ldr x25, [x0] + mov x23, x2 + ldur w5, [x29, #-192] + ldp q0, q1, [x8] + mov x8, x2 + b .LBB3_11 +.LBB3_9: + orr w5, w5, w27 +.LBB3_10: + sub x0, x29, #144 + sub x1, x29, #176 + mov x2, x25 + mov w3, #64 + mov x4, x20 + bl compress_pre + ldp q0, q1, [x29, #-144] add x25, x25, #64 - mov x8, x23 - mov w4, w19 - b .LBB2_11 -.LBB2_15: - ldp q0, q1, [x29, #-48] - add x20, x20, x21 - add x24, x24, #8 - subs x28, x28, #1 - stp q0, q1, [x26], #32 - b.ne .LBB2_10 -.LBB2_16: - add sp, sp, #384 + mov x8, x21 + mov w5, w19 + ldp q2, q3, [x29, #-112] + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b +.LBB3_11: + subs x21, x8, #1 + stp q0, q1, [x29, #-176] + b.eq .LBB3_9 + cbnz x8, .LBB3_10 + ldp q1, q0, [x29, #-176] + mov x0, x28 + add x20, x20, x22 + add x0, x28, #8 + subs x24, x24, #1 + mov x2, x23 + stp q1, q0, [x26], #32 + b.ne .LBB3_8 +.LBB3_14: + add sp, sp, #464 ldp x20, x19, [sp, #144] ldp x22, x21, [sp, #128] ldp x24, x23, [sp, #112] @@ -2442,9 +2052,10 @@ zfs_blake3_hash_many_sse2: ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] ldp d15, d14, [sp], #160 + hint #29 ret -.Lfunc_end2: - .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2 +.Lfunc_end3: + .size zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2 .cfi_endproc .section ".note.GNU-stack","",@progbits -#endif +#endif \ No newline at end of file diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index a05baec96..c4c2dfc5b 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -22,518 +22,61 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves - * Copyright (c) 2022 Tino Reichardt + * Copyright (c) 2022-2023 Tino Reichardt * * This is converted assembly: SSE4.1 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde + * + * Should work on FreeBSD, Linux and macOS + * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh */ #if defined(__aarch64__) .text - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 -.LCPI0_0: - .byte 2 - .byte 3 - .byte 0 - .byte 1 - .byte 6 - .byte 7 - .byte 4 - .byte 5 - .byte 10 - .byte 11 - .byte 8 - .byte 9 - .byte 14 - .byte 15 - .byte 12 - .byte 13 -.LCPI0_1: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI0_2: - .byte 1 - .byte 2 - .byte 3 - .byte 0 - .byte 5 - .byte 6 - .byte 7 - .byte 4 - .byte 9 - .byte 10 - .byte 11 - .byte 8 - .byte 13 - .byte 14 - .byte 15 - .byte 12 -.LCPI0_3: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 20 - .byte 21 - .byte 22 - .byte 23 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 -.LCPI0_4: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 4 - .byte 5 - .byte 6 - .byte 7 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 +.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse41 .p2align 2 .type zfs_blake3_compress_in_place_sse41,@function zfs_blake3_compress_in_place_sse41: .cfi_startproc - ldp q7, q6, [x0] - ldp q17, q18, [x1] - add x12, x1, #32 - ld2 { v4.4s, v5.4s }, [x12] - lsr x10, x3, #32 - fmov s16, w3 - adrp x13, .LCPI0_0 - adrp x11, .LCPI0_1 - and w8, w2, #0xff - mov v16.s[1], w10 - ldr q0, [x13, :lo12:.LCPI0_0] - ldr q20, [x11, :lo12:.LCPI0_1] - adrp x11, .LCPI0_4 - and w9, w4, #0xff - ldr q2, [x11, :lo12:.LCPI0_4] - mov v16.s[2], w8 - uzp1 v21.4s, v17.4s, v18.4s - add v7.4s, v6.4s, v7.4s - adrp x12, .LCPI0_3 - mov v16.s[3], w9 - uzp2 v18.4s, v17.4s, v18.4s - add v7.4s, v7.4s, v21.4s - ext v17.16b, v5.16b, v5.16b, #12 - ldr q3, [x12, :lo12:.LCPI0_3] - ext v24.16b, v4.16b, v4.16b, #12 - eor v16.16b, v7.16b, v16.16b - mov v27.16b, v17.16b - uzp1 v19.4s, v21.4s, v21.4s - ext v25.16b, v21.16b, v21.16b, #12 - zip2 v28.4s, v18.4s, v17.4s - tbl v29.16b, { v16.16b }, v0.16b - mov v27.s[1], v24.s[2] - zip1 v23.2d, v17.2d, v18.2d - ext v19.16b, v19.16b, v21.16b, #8 - add v22.4s, v29.4s, v20.4s - ext v26.16b, v21.16b, v25.16b, #12 - tbl v20.16b, { v23.16b, v24.16b }, v2.16b - zip1 v21.4s, v28.4s, v24.4s - zip1 v23.4s, v24.4s, v28.4s - uzp2 v19.4s, v19.4s, v18.4s - eor v24.16b, v22.16b, v6.16b - ext v25.16b, v20.16b, v20.16b, #12 - ext v6.16b, v23.16b, v21.16b, #8 - add v7.4s, v7.4s, v18.4s - ext v18.16b, v19.16b, v19.16b, #4 - tbl v16.16b, { v26.16b, v27.16b }, v3.16b - uzp1 v21.4s, v20.4s, v25.4s - mov v26.16b, v6.16b - ext v23.16b, v18.16b, v18.16b, #12 - mov v26.s[1], v21.s[2] - adrp x10, .LCPI0_2 - ext v25.16b, v18.16b, v23.16b, #12 - uzp1 v23.4s, v18.4s, v18.4s - ldr q1, [x10, :lo12:.LCPI0_2] - ext v18.16b, v23.16b, v18.16b, #8 - ushr v23.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - orr v23.16b, v24.16b, v23.16b - add v7.4s, v7.4s, v23.4s - eor v27.16b, v29.16b, v7.16b - add v4.4s, v7.4s, v4.4s - tbl v7.16b, { v25.16b, v26.16b }, v3.16b - tbl v26.16b, { v27.16b }, v1.16b - add v22.4s, v22.4s, v26.4s - uzp2 v18.4s, v18.4s, v16.4s - eor v23.16b, v23.16b, v22.16b - ext v5.16b, v18.16b, v18.16b, #4 - ushr v27.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - uzp1 v25.4s, v5.4s, v5.4s - orr v23.16b, v23.16b, v27.16b - ext v28.16b, v4.16b, v4.16b, #12 - ext v4.16b, v25.16b, v5.16b, #8 - ext v25.16b, v26.16b, v26.16b, #8 - add v26.4s, v28.4s, v23.4s - eor v25.16b, v26.16b, v25.16b - ext v22.16b, v22.16b, v22.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v22.4s, v22.4s, v25.4s - eor v23.16b, v23.16b, v22.16b - add v17.4s, v26.4s, v17.4s - ushr v26.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v26.16b - add v17.4s, v17.4s, v23.4s - eor v25.16b, v25.16b, v17.16b - add v17.4s, v17.4s, v19.4s - tbl v19.16b, { v25.16b }, v1.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - ext v17.16b, v17.16b, v17.16b, #4 - orr v23.16b, v23.16b, v25.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v17.4s, v17.4s, v23.4s - eor v19.16b, v17.16b, v19.16b - ext v22.16b, v22.16b, v22.16b, #12 - tbl v19.16b, { v19.16b }, v0.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v17.4s, v17.4s, v16.4s - orr v23.16b, v23.16b, v25.16b - add v17.4s, v17.4s, v23.4s - ext v25.16b, v17.16b, v17.16b, #12 - eor v17.16b, v19.16b, v17.16b - tbl v17.16b, { v17.16b }, v1.16b - add v19.4s, v22.4s, v17.4s - eor v22.16b, v23.16b, v19.16b - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v6.2d, v16.2d - ushr v23.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - zip2 v24.4s, v16.4s, v6.4s - tbl v26.16b, { v20.16b, v21.16b }, v2.16b - orr v22.16b, v22.16b, v23.16b - zip1 v16.4s, v24.4s, v21.4s - zip1 v20.4s, v21.4s, v24.4s - ext v21.16b, v26.16b, v26.16b, #12 - ext v17.16b, v17.16b, v17.16b, #8 - add v25.4s, v25.4s, v22.4s - ext v16.16b, v20.16b, v16.16b, #8 - uzp1 v21.4s, v26.4s, v21.4s - eor v26.16b, v25.16b, v17.16b - ext v19.16b, v19.16b, v19.16b, #4 - tbl v26.16b, { v26.16b }, v0.16b - mov v29.16b, v16.16b - add v19.4s, v19.4s, v26.4s - ext v27.16b, v5.16b, v5.16b, #12 - mov v29.s[1], v21.s[2] - eor v22.16b, v22.16b, v19.16b - ext v28.16b, v5.16b, v27.16b, #12 - ushr v27.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v25.4s, v6.4s - orr v22.16b, v22.16b, v27.16b - add v6.4s, v6.4s, v22.4s - eor v26.16b, v26.16b, v6.16b - add v6.4s, v6.4s, v18.4s - tbl v18.16b, { v26.16b }, v1.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v22.16b, v22.16b, v26.16b - ext v18.16b, v18.16b, v18.16b, #8 - add v6.4s, v6.4s, v22.4s - eor v18.16b, v6.16b, v18.16b - ext v19.16b, v19.16b, v19.16b, #12 - tbl v18.16b, { v18.16b }, v0.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v6.4s, v7.4s - orr v22.16b, v22.16b, v26.16b - add v6.4s, v6.4s, v22.4s - ext v26.16b, v6.16b, v6.16b, #12 - eor v6.16b, v18.16b, v6.16b - uzp2 v4.4s, v4.4s, v7.4s - zip2 v25.4s, v7.4s, v16.4s - add v26.4s, v26.4s, v21.4s - zip1 v20.2d, v16.2d, v7.2d - tbl v6.16b, { v6.16b }, v1.16b - ext v24.16b, v4.16b, v4.16b, #4 - tbl v27.16b, { v20.16b, v21.16b }, v2.16b - zip1 v7.4s, v25.4s, v21.4s - zip1 v20.4s, v21.4s, v25.4s - add v18.4s, v19.4s, v6.4s - uzp1 v5.4s, v24.4s, v24.4s - ext v21.16b, v27.16b, v27.16b, #12 - ext v7.16b, v20.16b, v7.16b, #8 - eor v19.16b, v22.16b, v18.16b - ext v5.16b, v5.16b, v24.16b, #8 - tbl v17.16b, { v28.16b, v29.16b }, v3.16b - uzp1 v21.4s, v27.4s, v21.4s - mov v28.16b, v7.16b - ushr v22.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v23.16b, v24.16b, v24.16b, #12 - uzp2 v5.4s, v5.4s, v17.4s - mov v28.s[1], v21.s[2] - orr v19.16b, v19.16b, v22.16b - ext v27.16b, v24.16b, v23.16b, #12 - ext v23.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #8 - ext v25.16b, v18.16b, v18.16b, #4 - add v18.4s, v26.4s, v19.4s - uzp1 v24.4s, v23.4s, v23.4s - eor v6.16b, v18.16b, v6.16b - ext v24.16b, v24.16b, v23.16b, #8 - add v16.4s, v18.4s, v16.4s - tbl v18.16b, { v27.16b, v28.16b }, v3.16b - tbl v27.16b, { v6.16b }, v0.16b - uzp2 v6.4s, v24.4s, v18.4s - add v24.4s, v25.4s, v27.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v25.16b - add v16.4s, v16.4s, v19.4s - eor v25.16b, v27.16b, v16.16b - add v4.4s, v16.4s, v4.4s - tbl v16.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v16.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v16.16b, v16.16b, v16.16b, #8 - add v4.4s, v4.4s, v19.4s - eor v16.16b, v4.16b, v16.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v25.16b, { v16.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v16.16b, v19.16b, v24.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v17.4s - orr v19.16b, v16.16b, v19.16b - add v27.4s, v4.4s, v19.4s - eor v25.16b, v25.16b, v27.16b - tbl v25.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v25.4s - zip2 v26.4s, v17.4s, v7.4s - ext v4.16b, v27.16b, v27.16b, #12 - eor v19.16b, v19.16b, v24.16b - add v28.4s, v4.4s, v21.4s - zip1 v20.2d, v7.2d, v17.2d - zip1 v4.4s, v26.4s, v21.4s - zip1 v17.4s, v21.4s, v26.4s - ushr v26.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v26.16b - ext v25.16b, v25.16b, v25.16b, #8 - add v27.4s, v28.4s, v19.4s - eor v25.16b, v27.16b, v25.16b - ext v24.16b, v24.16b, v24.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v19.16b, v19.16b, v24.16b - add v7.4s, v27.4s, v7.4s - ushr v27.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v27.16b - add v7.4s, v7.4s, v19.4s - eor v25.16b, v25.16b, v7.16b - add v5.4s, v7.4s, v5.4s - tbl v7.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v7.16b, v7.16b, v7.16b, #8 - add v5.4s, v5.4s, v19.4s - eor v7.16b, v5.16b, v7.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v7.16b, { v7.16b }, v0.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - tbl v16.16b, { v20.16b, v21.16b }, v2.16b - add v5.4s, v5.4s, v18.4s - orr v19.16b, v19.16b, v25.16b - ext v20.16b, v16.16b, v16.16b, #12 - ext v4.16b, v17.16b, v4.16b, #8 - add v5.4s, v5.4s, v19.4s - uzp1 v21.4s, v16.4s, v20.4s - mov v17.16b, v4.16b - ext v25.16b, v5.16b, v5.16b, #12 - mov v17.s[1], v21.s[2] - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v4.2d, v18.2d - ext v22.16b, v23.16b, v23.16b, #12 - zip2 v26.4s, v18.4s, v4.4s - tbl v18.16b, { v20.16b, v21.16b }, v2.16b - eor v5.16b, v7.16b, v5.16b - ext v16.16b, v23.16b, v22.16b, #12 - ext v22.16b, v6.16b, v6.16b, #4 - zip1 v27.4s, v26.4s, v21.4s - zip1 v20.4s, v21.4s, v26.4s - ext v21.16b, v18.16b, v18.16b, #12 - tbl v5.16b, { v5.16b }, v1.16b - ext v20.16b, v20.16b, v27.16b, #8 - uzp1 v27.4s, v18.4s, v21.4s - uzp1 v18.4s, v22.4s, v22.4s - add v21.4s, v24.4s, v5.4s - ext v18.16b, v18.16b, v22.16b, #8 - eor v19.16b, v19.16b, v21.16b - tbl v7.16b, { v16.16b, v17.16b }, v3.16b - uzp2 v18.4s, v18.4s, v17.4s - zip2 v16.4s, v16.4s, v20.4s - ushr v17.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v17.16b, v19.16b, v17.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v19.4s, v25.4s, v17.4s - eor v5.16b, v19.16b, v5.16b - ext v21.16b, v21.16b, v21.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v4.4s, v19.4s, v4.4s - add v19.4s, v21.4s, v5.4s - eor v17.16b, v17.16b, v19.16b - ushr v21.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - orr v17.16b, v17.16b, v21.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v4.4s, v4.4s, v6.4s - add v6.4s, v19.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v17.16b, v17.16b, v19.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v17.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - add v4.4s, v4.4s, v7.4s - orr v17.16b, v17.16b, v19.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - mov v29.16b, v20.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v6.4s, v6.4s, v5.4s - mov v29.s[1], v27.s[2] - add v4.4s, v4.4s, v27.4s - zip1 v26.2d, v20.2d, v7.2d - zip1 v7.4s, v16.4s, v27.4s - zip1 v16.4s, v27.4s, v16.4s - eor v17.16b, v17.16b, v6.16b - ext v7.16b, v16.16b, v7.16b, #8 - ushr v16.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - orr v16.16b, v17.16b, v16.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v16.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - ushr v17.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v20.4s - orr v16.16b, v16.16b, v17.16b - add v4.4s, v4.4s, v16.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - add v4.4s, v4.4s, v18.4s - ushr v17.4s, v16.4s, #7 - shl v16.4s, v16.4s, #25 - ext v23.16b, v22.16b, v22.16b, #12 - ext v4.16b, v4.16b, v4.16b, #4 - orr v16.16b, v16.16b, v17.16b - ext v28.16b, v22.16b, v23.16b, #12 - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v16.4s, v4.4s - tbl v3.16b, { v28.16b, v29.16b }, v3.16b - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v4.4s, v3.4s - tbl v4.16b, { v5.16b }, v0.16b - add v5.4s, v6.4s, v4.4s - eor v6.16b, v16.16b, v5.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - orr v6.16b, v6.16b, v16.16b - tbl v2.16b, { v26.16b, v27.16b }, v2.16b - add v3.4s, v3.4s, v6.4s - ext v19.16b, v2.16b, v2.16b, #12 - eor v4.16b, v4.16b, v3.16b - uzp1 v2.4s, v2.4s, v19.4s - ext v3.16b, v3.16b, v3.16b, #12 - tbl v4.16b, { v4.16b }, v1.16b - add v2.4s, v3.4s, v2.4s - add v3.4s, v5.4s, v4.4s - eor v5.16b, v6.16b, v3.16b - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v6.16b - ext v4.16b, v4.16b, v4.16b, #8 - add v2.4s, v2.4s, v5.4s - eor v4.16b, v2.16b, v4.16b - ext v3.16b, v3.16b, v3.16b, #4 - tbl v0.16b, { v4.16b }, v0.16b - add v3.4s, v3.4s, v0.4s - eor v4.16b, v5.16b, v3.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v2.4s, v2.4s, v7.4s - orr v4.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v4.4s - eor v0.16b, v0.16b, v2.16b - tbl v0.16b, { v0.16b }, v1.16b - add v1.4s, v3.4s, v0.4s - eor v3.16b, v4.16b, v1.16b - ext v2.16b, v2.16b, v2.16b, #4 - ext v1.16b, v1.16b, v1.16b, #12 - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v0.16b, v0.16b, v0.16b, #8 - eor v1.16b, v2.16b, v1.16b - orr v2.16b, v3.16b, v4.16b + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + str x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x19, x0 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x19 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - stp q1, q0, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr x19, [sp, #80] + add sp, sp, #96 + hint #29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 @@ -542,6 +85,9 @@ zfs_blake3_compress_in_place_sse41: .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI1_0: + .xword -4942790177982912921 + .xword -6534734903820487822 +.LCPI1_1: .byte 2 .byte 3 .byte 0 @@ -558,11 +104,6 @@ zfs_blake3_compress_in_place_sse41: .byte 15 .byte 12 .byte 13 -.LCPI1_1: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 .LCPI1_2: .byte 1 .byte 2 @@ -580,488 +121,497 @@ zfs_blake3_compress_in_place_sse41: .byte 14 .byte 15 .byte 12 -.LCPI1_3: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 20 - .byte 21 - .byte 22 - .byte 23 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 -.LCPI1_4: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 4 - .byte 5 - .byte 6 - .byte 7 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 .text + .p2align 2 + .type compress_pre,@function +compress_pre: + .cfi_startproc + hint #34 + fmov s1, w3 + movi d0, #0x0000ff000000ff + ldr q2, [x1] + adrp x8, .LCPI1_0 + mov v1.s[1], w5 + str q2, [x0] + ldr q4, [x8, :lo12:.LCPI1_0] + ldr q5, [x1, #16] + adrp x8, .LCPI1_1 + and v0.8b, v1.8b, v0.8b + fmov d1, x4 + stp q5, q4, [x0, #16] + mov v1.d[1], v0.d[0] + str q1, [x0, #48] + ldp q6, q7, [x2] + uzp1 v3.4s, v6.4s, v7.4s + add v0.4s, v2.4s, v3.4s + uzp2 v2.4s, v6.4s, v7.4s + add v16.4s, v0.4s, v5.4s + ldr q0, [x8, :lo12:.LCPI1_1] + adrp x8, .LCPI1_2 + eor v1.16b, v16.16b, v1.16b + add v7.4s, v16.4s, v2.4s + tbl v1.16b, { v1.16b }, v0.16b + add v4.4s, v1.4s, v4.4s + eor v5.16b, v4.16b, v5.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v6.4s, v7.4s, v5.4s + eor v7.16b, v1.16b, v6.16b + ldr q1, [x8, :lo12:.LCPI1_2] + add x8, x2, #32 + tbl v7.16b, { v7.16b }, v1.16b + ld2 { v16.4s, v17.4s }, [x8] + add v4.4s, v4.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #8 + add v6.4s, v6.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v4.16b, v4.16b, v4.16b, #4 + ext v16.16b, v16.16b, v16.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + ext v18.16b, v17.16b, v17.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v17.16b, v18.16b + eor v7.16b, v7.16b, v6.16b + add v6.4s, v6.4s, v18.4s + mov v17.s[1], v16.s[2] + tbl v7.16b, { v7.16b }, v0.16b + add v4.4s, v4.4s, v7.4s + eor v5.16b, v4.16b, v5.16b + ushr v19.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v19.16b + uzp1 v19.4s, v3.4s, v3.4s + add v6.4s, v6.4s, v5.4s + ext v19.16b, v19.16b, v3.16b, #8 + eor v7.16b, v7.16b, v6.16b + uzp2 v19.4s, v19.4s, v2.4s + tbl v7.16b, { v7.16b }, v1.16b + add v6.4s, v6.4s, v19.4s + add v4.4s, v4.4s, v7.4s + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v4.16b, v5.16b + ext v4.16b, v4.16b, v4.16b, #12 + ushr v20.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v20.16b + ext v20.16b, v3.16b, v3.16b, #12 + add v6.4s, v6.4s, v5.4s + ext v3.16b, v3.16b, v20.16b, #12 + eor v7.16b, v7.16b, v6.16b + rev64 v3.4s, v3.4s + tbl v7.16b, { v7.16b }, v0.16b + trn2 v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v7.4s + add v6.4s, v6.4s, v3.4s + eor v5.16b, v4.16b, v5.16b + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v17.16b + zip1 v17.2d, v18.2d, v2.2d + zip2 v2.4s, v2.4s, v18.4s + add v6.4s, v6.4s, v5.4s + mov v17.s[3], v16.s[3] + zip1 v18.4s, v2.4s, v16.4s + zip1 v2.4s, v16.4s, v2.4s + eor v7.16b, v7.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #12 + ext v16.16b, v2.16b, v18.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + add v20.4s, v4.4s, v7.4s + ext v4.16b, v17.16b, v17.16b, #12 + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v20.16b, v5.16b + uzp1 v4.4s, v17.4s, v4.4s + ushr v17.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v5.16b, v5.16b, v17.16b + ext v17.16b, v20.16b, v20.16b, #4 + add v6.4s, v6.4s, v5.4s + eor v7.16b, v7.16b, v6.16b + add v6.4s, v6.4s, v16.4s + tbl v7.16b, { v7.16b }, v0.16b + add v17.4s, v17.4s, v7.4s + eor v5.16b, v17.16b, v5.16b + ushr v2.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v2.16b, v5.16b, v2.16b + add v5.4s, v6.4s, v2.4s + ext v6.16b, v19.16b, v19.16b, #4 + eor v7.16b, v7.16b, v5.16b + uzp1 v18.4s, v6.4s, v6.4s + tbl v7.16b, { v7.16b }, v1.16b + ext v18.16b, v18.16b, v6.16b, #8 + add v17.4s, v17.4s, v7.4s + uzp2 v18.4s, v18.4s, v3.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v2.16b, v17.16b, v2.16b + add v5.4s, v5.4s, v18.4s + ext v17.16b, v17.16b, v17.16b, #12 + ushr v19.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v2.16b, v2.16b, v19.16b + ext v19.16b, v6.16b, v6.16b, #12 + add v5.4s, v5.4s, v2.4s + ext v6.16b, v6.16b, v19.16b, #12 + mov v19.16b, v16.16b + eor v7.16b, v7.16b, v5.16b + rev64 v6.4s, v6.4s + mov v19.s[1], v4.s[2] + tbl v7.16b, { v7.16b }, v0.16b + add v17.4s, v17.4s, v7.4s + eor v20.16b, v17.16b, v2.16b + trn2 v2.4s, v6.4s, v19.4s + ushr v6.4s, v20.4s, #12 + shl v19.4s, v20.4s, #20 + add v5.4s, v5.4s, v2.4s + orr v6.16b, v19.16b, v6.16b + add v19.4s, v5.4s, v6.4s + eor v5.16b, v7.16b, v19.16b + zip1 v7.2d, v16.2d, v3.2d + zip2 v3.4s, v3.4s, v16.4s + tbl v20.16b, { v5.16b }, v1.16b + mov v7.s[3], v4.s[3] + add v17.4s, v17.4s, v20.4s + ext v5.16b, v7.16b, v7.16b, #12 + eor v6.16b, v17.16b, v6.16b + uzp1 v5.4s, v7.4s, v5.4s + ext v7.16b, v19.16b, v19.16b, #12 + ext v17.16b, v17.16b, v17.16b, #4 + ushr v19.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v7.4s, v7.4s, v5.4s + orr v6.16b, v6.16b, v19.16b + ext v19.16b, v20.16b, v20.16b, #8 + add v7.4s, v7.4s, v6.4s + eor v19.16b, v19.16b, v7.16b + tbl v19.16b, { v19.16b }, v0.16b + add v16.4s, v17.4s, v19.4s + zip1 v17.4s, v3.4s, v4.4s + zip1 v3.4s, v4.4s, v3.4s + eor v4.16b, v16.16b, v6.16b + ext v17.16b, v3.16b, v17.16b, #8 + ushr v3.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v6.4s, v7.4s, v17.4s + orr v3.16b, v4.16b, v3.16b + add v4.4s, v6.4s, v3.4s + ext v6.16b, v18.16b, v18.16b, #4 + eor v7.16b, v19.16b, v4.16b + uzp1 v18.4s, v6.4s, v6.4s + tbl v7.16b, { v7.16b }, v1.16b + ext v18.16b, v18.16b, v6.16b, #8 + add v16.4s, v16.4s, v7.4s + uzp2 v18.4s, v18.4s, v2.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v3.16b, v16.16b, v3.16b + add v4.4s, v4.4s, v18.4s + ext v16.16b, v16.16b, v16.16b, #12 + ushr v19.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v3.16b, v3.16b, v19.16b + ext v19.16b, v6.16b, v6.16b, #12 + add v4.4s, v4.4s, v3.4s + ext v6.16b, v6.16b, v19.16b, #12 + mov v19.16b, v17.16b + eor v7.16b, v7.16b, v4.16b + rev64 v6.4s, v6.4s + mov v19.s[1], v5.s[2] + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v20.16b, v16.16b, v3.16b + trn2 v3.4s, v6.4s, v19.4s + ushr v6.4s, v20.4s, #12 + shl v19.4s, v20.4s, #20 + add v4.4s, v4.4s, v3.4s + orr v6.16b, v19.16b, v6.16b + zip1 v19.2d, v17.2d, v2.2d + zip2 v2.4s, v2.4s, v17.4s + add v4.4s, v4.4s, v6.4s + mov v19.s[3], v5.s[3] + zip1 v17.4s, v2.4s, v5.4s + zip1 v2.4s, v5.4s, v2.4s + eor v7.16b, v7.16b, v4.16b + ext v20.16b, v19.16b, v19.16b, #12 + ext v4.16b, v4.16b, v4.16b, #12 + ext v2.16b, v2.16b, v17.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + add v16.4s, v16.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v21.16b, v16.16b, v6.16b + uzp1 v6.4s, v19.4s, v20.4s + ext v16.16b, v16.16b, v16.16b, #4 + ushr v19.4s, v21.4s, #7 + shl v20.4s, v21.4s, #25 + add v4.4s, v4.4s, v6.4s + orr v19.16b, v20.16b, v19.16b + add v4.4s, v4.4s, v19.4s + eor v7.16b, v7.16b, v4.16b + add v4.4s, v4.4s, v2.4s + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v5.16b, v16.16b, v19.16b + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v17.16b + ext v17.16b, v18.16b, v18.16b, #4 + add v4.4s, v4.4s, v5.4s + uzp1 v18.4s, v17.4s, v17.4s + eor v7.16b, v7.16b, v4.16b + ext v18.16b, v18.16b, v17.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + uzp2 v18.4s, v18.4s, v3.4s + add v16.4s, v16.4s, v7.4s + add v4.4s, v4.4s, v18.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v16.16b, v5.16b + ext v4.16b, v4.16b, v4.16b, #4 + ext v16.16b, v16.16b, v16.16b, #12 + ushr v19.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v19.16b + add v19.4s, v4.4s, v5.4s + eor v4.16b, v7.16b, v19.16b + ext v7.16b, v17.16b, v17.16b, #12 + tbl v20.16b, { v4.16b }, v0.16b + ext v4.16b, v17.16b, v7.16b, #12 + mov v7.16b, v2.16b + add v16.4s, v16.4s, v20.4s + rev64 v4.4s, v4.4s + mov v7.s[1], v6.s[2] + eor v5.16b, v16.16b, v5.16b + trn2 v4.4s, v4.4s, v7.4s + ushr v7.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v19.4s, v4.4s + zip1 v19.2d, v2.2d, v3.2d + zip2 v2.4s, v3.4s, v2.4s + orr v5.16b, v5.16b, v7.16b + mov v19.s[3], v6.s[3] + add v7.4s, v17.4s, v5.4s + eor v17.16b, v20.16b, v7.16b + ext v20.16b, v19.16b, v19.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + tbl v17.16b, { v17.16b }, v1.16b + add v16.4s, v16.4s, v17.4s + ext v17.16b, v17.16b, v17.16b, #8 + eor v21.16b, v16.16b, v5.16b + uzp1 v5.4s, v19.4s, v20.4s + ext v16.16b, v16.16b, v16.16b, #4 + ushr v19.4s, v21.4s, #7 + shl v20.4s, v21.4s, #25 + add v7.4s, v7.4s, v5.4s + orr v19.16b, v20.16b, v19.16b + add v7.4s, v7.4s, v19.4s + eor v17.16b, v17.16b, v7.16b + tbl v17.16b, { v17.16b }, v0.16b + add v3.4s, v16.4s, v17.4s + zip1 v16.4s, v2.4s, v6.4s + zip1 v2.4s, v6.4s, v2.4s + eor v6.16b, v3.16b, v19.16b + ext v16.16b, v2.16b, v16.16b, #8 + ushr v2.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v7.4s, v7.4s, v16.4s + orr v2.16b, v6.16b, v2.16b + add v6.4s, v7.4s, v2.4s + ext v7.16b, v18.16b, v18.16b, #4 + eor v17.16b, v17.16b, v6.16b + uzp1 v18.4s, v7.4s, v7.4s + tbl v17.16b, { v17.16b }, v1.16b + ext v18.16b, v18.16b, v7.16b, #8 + add v3.4s, v3.4s, v17.4s + uzp2 v18.4s, v18.4s, v4.4s + eor v2.16b, v3.16b, v2.16b + add v6.4s, v6.4s, v18.4s + ext v3.16b, v3.16b, v3.16b, #12 + ext v18.16b, v18.16b, v18.16b, #4 + ushr v19.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v19.16b, v2.16b, v19.16b + ext v2.16b, v17.16b, v17.16b, #8 + ext v17.16b, v7.16b, v7.16b, #12 + add v6.4s, v6.4s, v19.4s + eor v2.16b, v2.16b, v6.16b + tbl v20.16b, { v2.16b }, v0.16b + ext v2.16b, v7.16b, v17.16b, #12 + mov v7.16b, v16.16b + add v17.4s, v3.4s, v20.4s + rev64 v3.4s, v2.4s + mov v7.s[1], v5.s[2] + eor v19.16b, v17.16b, v19.16b + trn2 v3.4s, v3.4s, v7.4s + ushr v21.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + add v6.4s, v6.4s, v3.4s + orr v19.16b, v19.16b, v21.16b + add v21.4s, v6.4s, v19.4s + eor v6.16b, v20.16b, v21.16b + zip1 v20.2d, v16.2d, v4.2d + zip2 v4.4s, v4.4s, v16.4s + tbl v22.16b, { v6.16b }, v1.16b + mov v20.s[3], v5.s[3] + add v17.4s, v17.4s, v22.4s + ext v6.16b, v20.16b, v20.16b, #12 + eor v19.16b, v17.16b, v19.16b + uzp1 v6.4s, v20.4s, v6.4s + ext v20.16b, v21.16b, v21.16b, #12 + ext v17.16b, v17.16b, v17.16b, #4 + ushr v21.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + add v20.4s, v20.4s, v6.4s + orr v19.16b, v19.16b, v21.16b + ext v21.16b, v22.16b, v22.16b, #8 + add v20.4s, v20.4s, v19.4s + eor v21.16b, v21.16b, v20.16b + tbl v21.16b, { v21.16b }, v0.16b + add v16.4s, v17.4s, v21.4s + zip1 v17.4s, v4.4s, v5.4s + zip1 v4.4s, v5.4s, v4.4s + eor v5.16b, v16.16b, v19.16b + ext v4.16b, v4.16b, v17.16b, #8 + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v19.4s, v20.4s, v4.4s + ext v20.16b, v18.16b, v18.16b, #8 + zip1 v3.2d, v4.2d, v3.2d + orr v5.16b, v5.16b, v17.16b + zip2 v2.4s, v2.4s, v4.4s + uzp2 v7.4s, v20.4s, v7.4s + mov v3.s[3], v6.s[3] + add v17.4s, v19.4s, v5.4s + ext v7.16b, v7.16b, v20.16b, #4 + eor v19.16b, v21.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #4 + tbl v19.16b, { v19.16b }, v1.16b + add v7.4s, v17.4s, v7.4s + add v16.4s, v16.4s, v19.4s + ext v17.16b, v19.16b, v19.16b, #8 + ext v19.16b, v18.16b, v18.16b, #12 + eor v5.16b, v16.16b, v5.16b + ext v16.16b, v16.16b, v16.16b, #12 + ext v18.16b, v18.16b, v19.16b, #12 + mov v19.16b, v4.16b + ushr v20.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + rev64 v18.4s, v18.4s + mov v19.s[1], v6.s[2] + orr v5.16b, v5.16b, v20.16b + trn2 v18.4s, v18.4s, v19.4s + add v7.4s, v5.4s, v7.4s + eor v17.16b, v17.16b, v7.16b + add v7.4s, v7.4s, v18.4s + ext v18.16b, v3.16b, v3.16b, #12 + tbl v17.16b, { v17.16b }, v0.16b + uzp1 v3.4s, v3.4s, v18.4s + add v16.4s, v16.4s, v17.4s + eor v5.16b, v16.16b, v5.16b + ushr v19.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v19.16b + add v7.4s, v7.4s, v5.4s + eor v17.16b, v17.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #12 + tbl v17.16b, { v17.16b }, v1.16b + add v3.4s, v7.4s, v3.4s + add v16.4s, v16.4s, v17.4s + ext v7.16b, v17.16b, v17.16b, #8 + eor v5.16b, v16.16b, v5.16b + ext v16.16b, v16.16b, v16.16b, #4 + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + add v3.4s, v3.4s, v5.4s + eor v7.16b, v7.16b, v3.16b + tbl v0.16b, { v7.16b }, v0.16b + zip1 v7.4s, v2.4s, v6.4s + zip1 v2.4s, v6.4s, v2.4s + add v4.4s, v16.4s, v0.4s + ext v2.16b, v2.16b, v7.16b, #8 + eor v5.16b, v4.16b, v5.16b + add v2.4s, v3.4s, v2.4s + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v3.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v3.4s + eor v0.16b, v0.16b, v2.16b + ext v2.16b, v2.16b, v2.16b, #4 + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v4.4s, v0.4s + ext v0.16b, v0.16b, v0.16b, #8 + eor v3.16b, v1.16b, v3.16b + ext v1.16b, v1.16b, v1.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + stp q1, q0, [x0, #32] + orr v3.16b, v3.16b, v4.16b + stp q2, q3, [x0] + ret +.Lfunc_end1: + .size compress_pre, .Lfunc_end1-compress_pre + .cfi_endproc + .globl zfs_blake3_compress_xof_sse41 .p2align 2 .type zfs_blake3_compress_xof_sse41,@function zfs_blake3_compress_xof_sse41: .cfi_startproc - ldp q7, q6, [x0] - ldp q17, q18, [x1] - add x12, x1, #32 - ld2 { v4.4s, v5.4s }, [x12] - lsr x10, x3, #32 - fmov s16, w3 - adrp x13, .LCPI1_0 - adrp x11, .LCPI1_1 - and w8, w2, #0xff - mov v16.s[1], w10 - ldr q0, [x13, :lo12:.LCPI1_0] - ldr q20, [x11, :lo12:.LCPI1_1] - adrp x11, .LCPI1_4 - and w9, w4, #0xff - ldr q2, [x11, :lo12:.LCPI1_4] - mov v16.s[2], w8 - uzp1 v21.4s, v17.4s, v18.4s - add v7.4s, v6.4s, v7.4s - adrp x12, .LCPI1_3 - mov v16.s[3], w9 - uzp2 v18.4s, v17.4s, v18.4s - add v7.4s, v7.4s, v21.4s - ext v17.16b, v5.16b, v5.16b, #12 - ldr q3, [x12, :lo12:.LCPI1_3] - ext v24.16b, v4.16b, v4.16b, #12 - eor v16.16b, v7.16b, v16.16b - mov v27.16b, v17.16b - uzp1 v19.4s, v21.4s, v21.4s - ext v25.16b, v21.16b, v21.16b, #12 - zip2 v28.4s, v18.4s, v17.4s - tbl v29.16b, { v16.16b }, v0.16b - mov v27.s[1], v24.s[2] - zip1 v23.2d, v17.2d, v18.2d - ext v19.16b, v19.16b, v21.16b, #8 - add v22.4s, v29.4s, v20.4s - ext v26.16b, v21.16b, v25.16b, #12 - tbl v20.16b, { v23.16b, v24.16b }, v2.16b - zip1 v21.4s, v28.4s, v24.4s - zip1 v23.4s, v24.4s, v28.4s - uzp2 v19.4s, v19.4s, v18.4s - eor v24.16b, v22.16b, v6.16b - ext v25.16b, v20.16b, v20.16b, #12 - ext v6.16b, v23.16b, v21.16b, #8 - add v7.4s, v7.4s, v18.4s - ext v18.16b, v19.16b, v19.16b, #4 - tbl v16.16b, { v26.16b, v27.16b }, v3.16b - uzp1 v21.4s, v20.4s, v25.4s - mov v26.16b, v6.16b - ext v23.16b, v18.16b, v18.16b, #12 - mov v26.s[1], v21.s[2] - adrp x10, .LCPI1_2 - ext v25.16b, v18.16b, v23.16b, #12 - uzp1 v23.4s, v18.4s, v18.4s - ldr q1, [x10, :lo12:.LCPI1_2] - ext v18.16b, v23.16b, v18.16b, #8 - ushr v23.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - orr v23.16b, v24.16b, v23.16b - add v7.4s, v7.4s, v23.4s - eor v27.16b, v29.16b, v7.16b - add v4.4s, v7.4s, v4.4s - tbl v7.16b, { v25.16b, v26.16b }, v3.16b - tbl v26.16b, { v27.16b }, v1.16b - add v22.4s, v22.4s, v26.4s - uzp2 v18.4s, v18.4s, v16.4s - eor v23.16b, v23.16b, v22.16b - ext v5.16b, v18.16b, v18.16b, #4 - ushr v27.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - uzp1 v25.4s, v5.4s, v5.4s - orr v23.16b, v23.16b, v27.16b - ext v28.16b, v4.16b, v4.16b, #12 - ext v4.16b, v25.16b, v5.16b, #8 - ext v25.16b, v26.16b, v26.16b, #8 - add v26.4s, v28.4s, v23.4s - eor v25.16b, v26.16b, v25.16b - ext v22.16b, v22.16b, v22.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v22.4s, v22.4s, v25.4s - eor v23.16b, v23.16b, v22.16b - add v17.4s, v26.4s, v17.4s - ushr v26.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v26.16b - add v17.4s, v17.4s, v23.4s - eor v25.16b, v25.16b, v17.16b - add v17.4s, v17.4s, v19.4s - tbl v19.16b, { v25.16b }, v1.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - ext v17.16b, v17.16b, v17.16b, #4 - orr v23.16b, v23.16b, v25.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v17.4s, v17.4s, v23.4s - eor v19.16b, v17.16b, v19.16b - ext v22.16b, v22.16b, v22.16b, #12 - tbl v19.16b, { v19.16b }, v0.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v17.4s, v17.4s, v16.4s - orr v23.16b, v23.16b, v25.16b - add v17.4s, v17.4s, v23.4s - ext v25.16b, v17.16b, v17.16b, #12 - eor v17.16b, v19.16b, v17.16b - tbl v17.16b, { v17.16b }, v1.16b - add v19.4s, v22.4s, v17.4s - eor v22.16b, v23.16b, v19.16b - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v6.2d, v16.2d - ushr v23.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - zip2 v24.4s, v16.4s, v6.4s - tbl v26.16b, { v20.16b, v21.16b }, v2.16b - orr v22.16b, v22.16b, v23.16b - zip1 v16.4s, v24.4s, v21.4s - zip1 v20.4s, v21.4s, v24.4s - ext v21.16b, v26.16b, v26.16b, #12 - ext v17.16b, v17.16b, v17.16b, #8 - add v25.4s, v25.4s, v22.4s - ext v16.16b, v20.16b, v16.16b, #8 - uzp1 v21.4s, v26.4s, v21.4s - eor v26.16b, v25.16b, v17.16b - ext v19.16b, v19.16b, v19.16b, #4 - tbl v26.16b, { v26.16b }, v0.16b - mov v29.16b, v16.16b - add v19.4s, v19.4s, v26.4s - ext v27.16b, v5.16b, v5.16b, #12 - mov v29.s[1], v21.s[2] - eor v22.16b, v22.16b, v19.16b - ext v28.16b, v5.16b, v27.16b, #12 - ushr v27.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v25.4s, v6.4s - orr v22.16b, v22.16b, v27.16b - add v6.4s, v6.4s, v22.4s - eor v26.16b, v26.16b, v6.16b - add v6.4s, v6.4s, v18.4s - tbl v18.16b, { v26.16b }, v1.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v22.16b, v22.16b, v26.16b - ext v18.16b, v18.16b, v18.16b, #8 - add v6.4s, v6.4s, v22.4s - eor v18.16b, v6.16b, v18.16b - ext v19.16b, v19.16b, v19.16b, #12 - tbl v18.16b, { v18.16b }, v0.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v6.4s, v7.4s - orr v22.16b, v22.16b, v26.16b - add v6.4s, v6.4s, v22.4s - ext v26.16b, v6.16b, v6.16b, #12 - eor v6.16b, v18.16b, v6.16b - uzp2 v4.4s, v4.4s, v7.4s - zip2 v25.4s, v7.4s, v16.4s - add v26.4s, v26.4s, v21.4s - zip1 v20.2d, v16.2d, v7.2d - tbl v6.16b, { v6.16b }, v1.16b - ext v24.16b, v4.16b, v4.16b, #4 - tbl v27.16b, { v20.16b, v21.16b }, v2.16b - zip1 v7.4s, v25.4s, v21.4s - zip1 v20.4s, v21.4s, v25.4s - add v18.4s, v19.4s, v6.4s - uzp1 v5.4s, v24.4s, v24.4s - ext v21.16b, v27.16b, v27.16b, #12 - ext v7.16b, v20.16b, v7.16b, #8 - eor v19.16b, v22.16b, v18.16b - ext v5.16b, v5.16b, v24.16b, #8 - tbl v17.16b, { v28.16b, v29.16b }, v3.16b - uzp1 v21.4s, v27.4s, v21.4s - mov v28.16b, v7.16b - ushr v22.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v23.16b, v24.16b, v24.16b, #12 - uzp2 v5.4s, v5.4s, v17.4s - mov v28.s[1], v21.s[2] - orr v19.16b, v19.16b, v22.16b - ext v27.16b, v24.16b, v23.16b, #12 - ext v23.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #8 - ext v25.16b, v18.16b, v18.16b, #4 - add v18.4s, v26.4s, v19.4s - uzp1 v24.4s, v23.4s, v23.4s - eor v6.16b, v18.16b, v6.16b - ext v24.16b, v24.16b, v23.16b, #8 - add v16.4s, v18.4s, v16.4s - tbl v18.16b, { v27.16b, v28.16b }, v3.16b - tbl v27.16b, { v6.16b }, v0.16b - uzp2 v6.4s, v24.4s, v18.4s - add v24.4s, v25.4s, v27.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v25.16b - add v16.4s, v16.4s, v19.4s - eor v25.16b, v27.16b, v16.16b - add v4.4s, v16.4s, v4.4s - tbl v16.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v16.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v16.16b, v16.16b, v16.16b, #8 - add v4.4s, v4.4s, v19.4s - eor v16.16b, v4.16b, v16.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v25.16b, { v16.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v16.16b, v19.16b, v24.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v17.4s - orr v19.16b, v16.16b, v19.16b - add v27.4s, v4.4s, v19.4s - eor v25.16b, v25.16b, v27.16b - tbl v25.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v25.4s - zip2 v26.4s, v17.4s, v7.4s - ext v4.16b, v27.16b, v27.16b, #12 - eor v19.16b, v19.16b, v24.16b - add v28.4s, v4.4s, v21.4s - zip1 v20.2d, v7.2d, v17.2d - zip1 v4.4s, v26.4s, v21.4s - zip1 v17.4s, v21.4s, v26.4s - ushr v26.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v26.16b - ext v25.16b, v25.16b, v25.16b, #8 - add v27.4s, v28.4s, v19.4s - eor v25.16b, v27.16b, v25.16b - ext v24.16b, v24.16b, v24.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v19.16b, v19.16b, v24.16b - add v7.4s, v27.4s, v7.4s - ushr v27.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v27.16b - add v7.4s, v7.4s, v19.4s - eor v25.16b, v25.16b, v7.16b - add v5.4s, v7.4s, v5.4s - tbl v7.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v7.16b, v7.16b, v7.16b, #8 - add v5.4s, v5.4s, v19.4s - eor v7.16b, v5.16b, v7.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v7.16b, { v7.16b }, v0.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - tbl v16.16b, { v20.16b, v21.16b }, v2.16b - add v5.4s, v5.4s, v18.4s - orr v19.16b, v19.16b, v25.16b - ext v20.16b, v16.16b, v16.16b, #12 - ext v4.16b, v17.16b, v4.16b, #8 - add v5.4s, v5.4s, v19.4s - uzp1 v21.4s, v16.4s, v20.4s - mov v17.16b, v4.16b - ext v25.16b, v5.16b, v5.16b, #12 - mov v17.s[1], v21.s[2] - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v4.2d, v18.2d - ext v22.16b, v23.16b, v23.16b, #12 - zip2 v26.4s, v18.4s, v4.4s - tbl v18.16b, { v20.16b, v21.16b }, v2.16b - eor v5.16b, v7.16b, v5.16b - ext v16.16b, v23.16b, v22.16b, #12 - ext v22.16b, v6.16b, v6.16b, #4 - zip1 v27.4s, v26.4s, v21.4s - zip1 v20.4s, v21.4s, v26.4s - ext v21.16b, v18.16b, v18.16b, #12 - tbl v5.16b, { v5.16b }, v1.16b - ext v20.16b, v20.16b, v27.16b, #8 - uzp1 v27.4s, v18.4s, v21.4s - uzp1 v18.4s, v22.4s, v22.4s - add v21.4s, v24.4s, v5.4s - ext v18.16b, v18.16b, v22.16b, #8 - eor v19.16b, v19.16b, v21.16b - tbl v7.16b, { v16.16b, v17.16b }, v3.16b - uzp2 v18.4s, v18.4s, v17.4s - zip2 v16.4s, v16.4s, v20.4s - ushr v17.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v17.16b, v19.16b, v17.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v19.4s, v25.4s, v17.4s - eor v5.16b, v19.16b, v5.16b - ext v21.16b, v21.16b, v21.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v4.4s, v19.4s, v4.4s - add v19.4s, v21.4s, v5.4s - eor v17.16b, v17.16b, v19.16b - ushr v21.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - orr v17.16b, v17.16b, v21.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v4.4s, v4.4s, v6.4s - add v6.4s, v19.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v17.16b, v17.16b, v19.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v17.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - add v4.4s, v4.4s, v7.4s - orr v17.16b, v17.16b, v19.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - mov v29.16b, v20.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v6.4s, v6.4s, v5.4s - mov v29.s[1], v27.s[2] - add v4.4s, v4.4s, v27.4s - zip1 v26.2d, v20.2d, v7.2d - zip1 v7.4s, v16.4s, v27.4s - zip1 v16.4s, v27.4s, v16.4s - eor v17.16b, v17.16b, v6.16b - ext v7.16b, v16.16b, v7.16b, #8 - ushr v16.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - orr v16.16b, v17.16b, v16.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v16.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - ushr v17.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v20.4s - orr v16.16b, v16.16b, v17.16b - add v4.4s, v4.4s, v16.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - add v4.4s, v4.4s, v18.4s - ushr v17.4s, v16.4s, #7 - shl v16.4s, v16.4s, #25 - ext v23.16b, v22.16b, v22.16b, #12 - ext v4.16b, v4.16b, v4.16b, #4 - orr v16.16b, v16.16b, v17.16b - ext v28.16b, v22.16b, v23.16b, #12 - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v16.4s, v4.4s - tbl v3.16b, { v28.16b, v29.16b }, v3.16b - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v4.4s, v3.4s - tbl v4.16b, { v5.16b }, v0.16b - add v5.4s, v6.4s, v4.4s - eor v6.16b, v16.16b, v5.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - orr v6.16b, v6.16b, v16.16b - tbl v2.16b, { v26.16b, v27.16b }, v2.16b - add v3.4s, v3.4s, v6.4s - ext v19.16b, v2.16b, v2.16b, #12 - eor v4.16b, v4.16b, v3.16b - uzp1 v2.4s, v2.4s, v19.4s - ext v3.16b, v3.16b, v3.16b, #12 - tbl v4.16b, { v4.16b }, v1.16b - add v2.4s, v3.4s, v2.4s - add v3.4s, v5.4s, v4.4s - eor v5.16b, v6.16b, v3.16b - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v6.16b - ext v4.16b, v4.16b, v4.16b, #8 - add v2.4s, v2.4s, v5.4s - eor v4.16b, v2.16b, v4.16b - ext v3.16b, v3.16b, v3.16b, #4 - tbl v0.16b, { v4.16b }, v0.16b - add v3.4s, v3.4s, v0.4s - eor v4.16b, v5.16b, v3.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v2.4s, v2.4s, v7.4s - orr v4.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v4.4s + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + stp x20, x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x20, x0 + mov x19, x5 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x20 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr q0, [x20] eor v0.16b, v0.16b, v2.16b - tbl v0.16b, { v0.16b }, v1.16b - add v1.4s, v3.4s, v0.4s - eor v3.16b, v4.16b, v1.16b - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - ext v0.16b, v0.16b, v0.16b, #8 - ext v1.16b, v1.16b, v1.16b, #12 - orr v3.16b, v3.16b, v4.16b - eor v2.16b, v2.16b, v1.16b - eor v3.16b, v3.16b, v0.16b - stp q2, q3, [x5] - ldr q2, [x0] - eor v1.16b, v2.16b, v1.16b - str q1, [x5, #32] - ldr q1, [x0, #16] - eor v0.16b, v1.16b, v0.16b - str q0, [x5, #48] + str q0, [x19, #32] + ldr q0, [x20, #16] + eor v0.16b, v0.16b, v3.16b + str q0, [x19, #48] + ldp x20, x19, [sp, #80] + add sp, sp, #96 + hint #29 ret -.Lfunc_end1: - .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41 +.Lfunc_end2: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 .p2align 4 -.LCPI2_0: +.LCPI3_0: .word 0 .word 1 .word 2 .word 3 -.LCPI2_1: +.LCPI3_1: .byte 2 .byte 3 .byte 0 @@ -1078,7 +628,7 @@ zfs_blake3_compress_xof_sse41: .byte 15 .byte 12 .byte 13 -.LCPI2_2: +.LCPI3_2: .byte 1 .byte 2 .byte 3 @@ -1095,25 +645,29 @@ zfs_blake3_compress_xof_sse41: .byte 14 .byte 15 .byte 12 +.LCPI3_3: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 .text .globl zfs_blake3_hash_many_sse41 .p2align 2 .type zfs_blake3_hash_many_sse41,@function zfs_blake3_hash_many_sse41: .cfi_startproc - stp d15, d14, [sp, #-160]! + hint #34 + stp d15, d14, [sp, #-144]! stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] - stp x29, x30, [sp, #64] - stp x28, x27, [sp, #80] - stp x26, x25, [sp, #96] - stp x24, x23, [sp, #112] - stp x22, x21, [sp, #128] - stp x20, x19, [sp, #144] - mov x29, sp - sub sp, sp, #448 - .cfi_def_cfa w29, 160 + stp x29, x27, [sp, #64] + stp x26, x25, [sp, #80] + stp x24, x23, [sp, #96] + stp x22, x21, [sp, #112] + stp x20, x19, [sp, #128] + sub sp, sp, #368 + .cfi_def_cfa_offset 512 .cfi_offset w19, -8 .cfi_offset w20, -16 .cfi_offset w21, -24 @@ -1123,1341 +677,1722 @@ zfs_blake3_hash_many_sse41: .cfi_offset w25, -56 .cfi_offset w26, -64 .cfi_offset w27, -72 - .cfi_offset w28, -80 - .cfi_offset w30, -88 - .cfi_offset w29, -96 - .cfi_offset b8, -104 - .cfi_offset b9, -112 - .cfi_offset b10, -120 - .cfi_offset b11, -128 - .cfi_offset b12, -136 - .cfi_offset b13, -144 - .cfi_offset b14, -152 - .cfi_offset b15, -160 - ldr x26, [x29, #168] - ldrb w27, [x29, #160] - mov w19, w6 - mov x20, x4 - mov x22, x2 - mov x28, x1 + .cfi_offset w29, -80 + .cfi_offset b8, -88 + .cfi_offset b9, -96 + .cfi_offset b10, -104 + .cfi_offset b11, -112 + .cfi_offset b12, -120 + .cfi_offset b13, -128 + .cfi_offset b14, -136 + .cfi_offset b15, -144 + ldr x8, [sp, #520] + adrp x11, .LCPI3_1 + ldrb w9, [sp, #512] + adrp x10, .LCPI3_2 cmp x1, #4 - mov x24, x0 - str x3, [sp, #40] - b.lo .LBB2_8 - adrp x11, .LCPI2_0 - ldr q0, [x11, :lo12:.LCPI2_0] + b.lo .LBB3_6 + adrp x12, .LCPI3_0 sbfx w13, w5, #0, #1 + mov w15, #58983 + mov w16, #44677 + movk w15, #27145, lsl #16 + movk w16, #47975, lsl #16 + ldr q0, [x12, :lo12:.LCPI3_0] dup v1.4s, w13 - mov w10, #58983 - mov w11, #44677 - mov w12, #62322 + movi v13.4s, #64 + mov w13, #62322 + mov w14, #62778 + orr w12, w7, w6 and v0.16b, v1.16b, v0.16b - mov w13, #62778 - orr w8, w7, w19 - adrp x9, .LCPI2_1 - movk w10, #27145, lsl #16 - movk w11, #47975, lsl #16 - movk w12, #15470, lsl #16 - movk w13, #42319, lsl #16 - str q0, [sp, #16] + ldr q1, [x11, :lo12:.LCPI3_1] + movk w13, #15470, lsl #16 + movk w14, #42319, lsl #16 + dup v14.4s, w15 + stp q0, q1, [sp, #16] orr v0.4s, #128, lsl #24 - adrp x14, .LCPI2_2 str q0, [sp] -.LBB2_2: - ldr x2, [sp, #40] - mov x15, x2 - ld1r { v7.4s }, [x15], #4 - add x16, x2, #8 - add x17, x2, #12 - add x18, x2, #16 - add x0, x2, #20 - add x3, x2, #24 - add x2, x2, #28 - ld1r { v6.4s }, [x16] - ld1r { v17.4s }, [x17] - ld1r { v10.4s }, [x18] - ld1r { v11.4s }, [x0] - ld1r { v19.4s }, [x3] - ld1r { v18.4s }, [x15] - ld1r { v16.4s }, [x2] - cbz x22, .LBB2_7 + dup v0.4s, w16 + stp q0, q14, [sp, #48] + b .LBB3_3 +.LBB3_2: + zip1 v0.4s, v29.4s, v8.4s + add x15, x4, #4 + zip1 v1.4s, v30.4s, v31.4s + tst w5, #0x1 + zip1 v2.4s, v24.4s, v18.4s + csel x4, x15, x4, ne + zip1 v3.4s, v25.4s, v26.4s + add x0, x0, #32 + zip2 v6.4s, v29.4s, v8.4s + sub x1, x1, #4 + zip1 v4.2d, v0.2d, v1.2d + cmp x1, #3 + zip2 v7.4s, v30.4s, v31.4s + zip1 v5.2d, v2.2d, v3.2d + zip2 v0.2d, v0.2d, v1.2d + zip2 v1.2d, v2.2d, v3.2d + zip2 v2.4s, v24.4s, v18.4s + zip2 v3.4s, v25.4s, v26.4s + stp q4, q5, [x8] + zip2 v4.2d, v6.2d, v7.2d + stp q0, q1, [x8, #32] + zip1 v0.2d, v6.2d, v7.2d + zip1 v1.2d, v2.2d, v3.2d + zip2 v2.2d, v2.2d, v3.2d + stp q0, q1, [x8, #64] + stp q4, q2, [x8, #96] + add x8, x8, #128 + b.ls .LBB3_6 +.LBB3_3: + mov x15, x3 + add x16, x3, #8 + add x17, x3, #12 + add x19, x3, #16 + add x20, x3, #20 + ld1r { v29.4s }, [x15], #4 + ld1r { v30.4s }, [x16] + add x16, x3, #24 + ld1r { v31.4s }, [x17] + add x17, x3, #28 + ld1r { v24.4s }, [x19] + ld1r { v18.4s }, [x20] + ld1r { v25.4s }, [x16] + ld1r { v8.4s }, [x15] + ld1r { v26.4s }, [x17] + cbz x2, .LBB3_2 ldr q1, [sp, #16] - dup v0.4s, w20 - ldp x15, x16, [x24] - ldp x17, x18, [x24, #16] + dup v0.4s, w4 + lsr x17, x4, #32 + mov x15, xzr + ldp x19, x20, [x0, #16] add v1.4s, v0.4s, v1.4s + mov x21, x2 movi v0.4s, #128, lsl #24 - str q1, [sp, #64] + mov w26, w12 + str q1, [sp, #96] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] - lsr x2, x20, #32 - mov x0, xzr - mov w6, w8 cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w2 + dup v1.4s, w17 + ldp x16, x17, [x0] sub v0.4s, v1.4s, v0.4s - str q0, [sp, #48] -.LBB2_4: - mov w4, #16 - stp q16, q17, [sp, #192] - bfi x4, x0, #6, #58 - ldr q1, [x15, x4] - ldr q3, [x16, x4] - ldr q2, [x17, x4] - ldr q4, [x18, x4] - mov w4, #32 - bfi x4, x0, #6, #58 - ldr q5, [x15, x4] - ldr q20, [x16, x4] - ldr q21, [x17, x4] - ldr q22, [x18, x4] - mov w4, #48 - lsl x3, x0, #6 - bfi x4, x0, #6, #58 - add x0, x0, #1 - ldr q0, [x15, x3] - ldr q23, [x16, x3] - ldr q16, [x17, x3] - ldr q17, [x18, x3] - cmp x0, x22 - ldr q25, [x15, x4] - ldr q14, [x16, x4] - ldr q28, [x17, x4] - ldr q31, [x18, x4] - csel w4, w27, wzr, eq - orr w4, w4, w6 - mov x2, xzr - and w6, w4, #0xff - add x3, x3, #256 -.LBB2_5: - ldr x4, [x24, x2] - add x2, x2, #8 - cmp x2, #32 - add x4, x4, x3 - prfm pldl1keep, [x4] - b.ne .LBB2_5 - zip1 v29.4s, v0.4s, v23.4s - zip2 v23.4s, v0.4s, v23.4s - zip1 v0.4s, v16.4s, v17.4s - zip2 v24.4s, v16.4s, v17.4s - zip1 v9.4s, v1.4s, v3.4s - zip2 v26.4s, v1.4s, v3.4s - zip1 v27.4s, v2.4s, v4.4s - zip2 v17.4s, v2.4s, v4.4s - zip1 v12.4s, v21.4s, v22.4s - zip2 v13.4s, v21.4s, v22.4s - add v2.4s, v7.4s, v10.4s - add v1.4s, v18.4s, v11.4s - ext v7.16b, v0.16b, v29.16b, #8 - ext v22.16b, v24.16b, v23.16b, #8 - zip1 v30.4s, v5.4s, v20.4s - zip2 v20.4s, v5.4s, v20.4s - stp q1, q2, [sp, #112] - ext v2.16b, v29.16b, v7.16b, #8 - mov v29.d[1], v0.d[0] - ext v18.16b, v23.16b, v22.16b, #8 - mov v23.d[1], v24.d[0] - zip1 v21.4s, v25.4s, v14.4s - zip2 v4.4s, v25.4s, v14.4s - zip1 v14.4s, v28.4s, v31.4s - zip2 v15.4s, v28.4s, v31.4s - add v8.4s, v6.4s, v19.4s - ext v28.16b, v27.16b, v9.16b, #8 - ext v31.16b, v17.16b, v26.16b, #8 - stur q2, [x29, #-208] - mov v7.16b, v29.16b - ext v0.16b, v12.16b, v30.16b, #8 - stp q23, q29, [x29, #-80] - mov v2.16b, v19.16b - ext v19.16b, v13.16b, v20.16b, #8 - mov v29.16b, v9.16b - ext v25.16b, v9.16b, v28.16b, #8 - mov v29.d[1], v27.d[0] - ext v24.16b, v26.16b, v31.16b, #8 - mov v26.d[1], v17.d[0] - ext v17.16b, v15.16b, v4.16b, #8 - ext v27.16b, v30.16b, v0.16b, #8 - ext v0.16b, v20.16b, v19.16b, #8 - stp q0, q25, [sp, #80] - ext v0.16b, v4.16b, v17.16b, #8 - str q0, [sp, #224] - ldr q0, [sp, #128] - mov v6.16b, v23.16b - mov v22.16b, v4.16b - ldr q16, [x9, :lo12:.LCPI2_1] - add v17.4s, v0.4s, v7.4s - ldr q0, [sp, #112] - mov v30.d[1], v12.d[0] - add v7.4s, v8.4s, v29.4s - mov v20.d[1], v13.d[0] - add v4.4s, v0.4s, v6.4s - ldr q0, [sp, #64] - dup v3.4s, w12 - ext v28.16b, v14.16b, v21.16b, #8 - dup v1.4s, w10 - eor v19.16b, v17.16b, v0.16b - ldr q0, [sp, #48] - ext v23.16b, v21.16b, v28.16b, #8 - mov v21.d[1], v14.d[0] - tbl v14.16b, { v19.16b }, v16.16b - eor v12.16b, v4.16b, v0.16b - movi v0.4s, #64 - eor v13.16b, v7.16b, v0.16b - tbl v13.16b, { v13.16b }, v16.16b - add v6.4s, v13.4s, v3.4s - dup v5.4s, w11 - tbl v12.16b, { v12.16b }, v16.16b - add v1.4s, v14.4s, v1.4s - eor v9.16b, v6.16b, v2.16b - ldp q2, q0, [sp, #192] - add v5.4s, v12.4s, v5.4s - eor v19.16b, v1.16b, v10.16b - eor v10.16b, v5.16b, v11.16b - ushr v11.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v11.16b, v19.16b, v11.16b - ushr v19.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - mov v22.d[1], v15.d[0] - orr v10.16b, v10.16b, v19.16b - ushr v19.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - add v15.4s, v0.4s, v2.4s - orr v9.16b, v9.16b, v19.16b - dup v19.4s, w6 - add v15.4s, v15.4s, v26.4s - eor v19.16b, v15.16b, v19.16b - tbl v3.16b, { v19.16b }, v16.16b - dup v19.4s, w13 - add v8.4s, v3.4s, v19.4s - ldur q31, [x29, #-208] - eor v19.16b, v8.16b, v2.16b - ushr v0.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v2.16b, v19.16b, v0.16b - ldr q19, [x14, :lo12:.LCPI2_2] - add v17.4s, v17.4s, v31.4s - add v17.4s, v17.4s, v11.4s - eor v14.16b, v14.16b, v17.16b - tbl v14.16b, { v14.16b }, v19.16b - add v1.4s, v1.4s, v14.4s - eor v11.16b, v1.16b, v11.16b - add v4.4s, v4.4s, v18.4s - ushr v0.4s, v11.4s, #7 - shl v11.4s, v11.4s, #25 - add v4.4s, v4.4s, v10.4s - orr v0.16b, v11.16b, v0.16b - eor v11.16b, v12.16b, v4.16b - tbl v11.16b, { v11.16b }, v19.16b - add v5.4s, v5.4s, v11.4s - eor v10.16b, v5.16b, v10.16b - add v7.4s, v7.4s, v25.4s - ushr v12.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - add v7.4s, v7.4s, v9.4s - orr v10.16b, v10.16b, v12.16b - eor v12.16b, v13.16b, v7.16b - tbl v12.16b, { v12.16b }, v19.16b - add v6.4s, v6.4s, v12.4s - eor v9.16b, v6.16b, v9.16b - ushr v13.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - orr v9.16b, v9.16b, v13.16b - add v13.4s, v15.4s, v24.4s - add v13.4s, v13.4s, v2.4s - eor v3.16b, v3.16b, v13.16b - tbl v3.16b, { v3.16b }, v19.16b - add v8.4s, v8.4s, v3.4s - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v30.4s - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v21.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v22.4s - mov v28.16b, v26.16b - stur q26, [x29, #-112] - mov v26.16b, v18.16b - mov v18.16b, v24.16b - stur q24, [x29, #-160] - add v6.4s, v6.4s, v3.4s - mov v24.16b, v20.16b - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldr q20, [sp, #80] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - stp q30, q22, [x29, #-192] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - mov v30.16b, v27.16b - add v17.4s, v17.4s, v27.4s - ldr q27, [sp, #224] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 + str q0, [sp, #80] +.LBB3_5: + add x23, x16, x15 + add x24, x17, x15 + add x22, x19, x15 + add x25, x20, x15 + subs x21, x21, #1 + add x15, x15, #64 + ldp q1, q2, [x23] + csel w27, w9, wzr, eq + orr w26, w27, w26 + and w26, w26, #0xff + ldp q4, q5, [x24] + dup v0.4s, w26 + mov w26, w6 + zip1 v22.4s, v1.4s, v4.4s + zip2 v20.4s, v1.4s, v4.4s + ldp q6, q7, [x22] + zip1 v17.4s, v2.4s, v5.4s + zip2 v23.4s, v2.4s, v5.4s + ldp q16, q21, [x25] + zip1 v19.4s, v6.4s, v16.4s + zip2 v1.4s, v6.4s, v16.4s + ldp q27, q28, [x23, #32] + zip1 v4.4s, v7.4s, v21.4s + zip2 v5.4s, v7.4s, v21.4s + zip2 v15.2d, v17.2d, v4.2d + ldp q9, q10, [x24, #32] + mov v17.d[1], v4.d[0] + add v4.4s, v30.4s, v25.4s + zip2 v11.2d, v23.2d, v5.2d + zip2 v3.4s, v27.4s, v9.4s + zip1 v7.4s, v27.4s, v9.4s + ldp q12, q6, [x22, #32] + mov v23.d[1], v5.d[0] + stp q11, q3, [sp, #256] + add v5.4s, v31.4s, v26.4s + add v4.4s, v4.4s, v17.4s + str q23, [sp, #352] + ldp q16, q2, [x25, #32] + add v5.4s, v5.4s, v23.4s + zip1 v3.4s, v12.4s, v16.4s eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v23.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v27.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - stur q21, [x29, #-144] - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - ldur q21, [x29, #-80] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v21.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v26.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v18.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v29.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b + zip1 v9.4s, v6.4s, v2.4s + zip2 v2.4s, v6.4s, v2.4s + stp q7, q3, [sp, #208] + zip2 v3.4s, v12.4s, v16.4s + zip1 v12.4s, v28.4s, v10.4s + zip2 v10.4s, v28.4s, v10.4s + stp q17, q2, [sp, #160] + zip2 v28.2d, v22.2d, v19.2d + mov v22.d[1], v19.d[0] + str q3, [sp, #240] + add v2.4s, v8.4s, v18.4s + eor v16.16b, v4.16b, v13.16b + dup v17.4s, w13 + mov v3.16b, v22.16b + stp q22, q28, [sp, #320] + zip2 v22.2d, v20.2d, v1.2d + mov v20.d[1], v1.d[0] + add v1.4s, v29.4s, v24.4s + add v4.4s, v4.4s, v15.4s add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-64] - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v28.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v24.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v22.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v23.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-144] - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v31.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v22.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v27.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldr q27, [sp, #96] - mov v21.16b, v26.16b - stur q26, [x29, #-96] - mov v28.16b, v31.16b - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldp q31, q26, [x29, #-192] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v20.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v27.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v26.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v31.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - mov v18.16b, v24.16b - mov v24.16b, v20.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - ldur q20, [x29, #-160] - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v21.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v18.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v23.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v20.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q25, [x29, #-80] - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v29.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v25.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v26.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - ldur q25, [x29, #-112] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v30.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v24.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v31.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q25, [x29, #-64] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldr q31, [sp, #224] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v27.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v25.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v31.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v28.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v26.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v23.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - mov v21.16b, v29.16b - stur q29, [x29, #-128] - mov v29.16b, v30.16b - mov v30.16b, v27.16b - mov v27.16b, v18.16b - str q18, [sp, #176] - eor v0.16b, v0.16b, v1.16b - mov v18.16b, v22.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-96] - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v20.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v29.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v22.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v31.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v21.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v24.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v28.4s - add v6.4s, v6.4s, v3.4s - mov v22.16b, v24.16b - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q24, [x29, #-80] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - mov v21.16b, v30.16b - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldur q30, [x29, #-192] - mov v20.16b, v29.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - ldur q29, [x29, #-112] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v24.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v29.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v20.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v31.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v26.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v23.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v27.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v30.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - ldur q27, [x29, #-160] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v27.4s - mov v28.16b, v25.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v21.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v28.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v29.4s - mov v25.16b, v31.16b - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q31, [x29, #-96] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldur q28, [x29, #-208] - mov v18.16b, v20.16b - str q20, [sp, #144] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - ldur q20, [x29, #-128] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v24.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v31.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v28.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v20.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v30.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v25.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - add v17.4s, v17.4s, v26.4s - mov v26.16b, v21.16b - add v4.4s, v4.4s, v21.4s - ldur q21, [x29, #-144] - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v21.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v28.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - str q23, [sp, #160] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - add v17.4s, v17.4s, v23.4s - ldur q23, [x29, #-64] - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v23.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v24.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v20.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - add v13.4s, v13.4s, v0.4s - ldr q20, [sp, #176] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - eor v12.16b, v12.16b, v13.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - tbl v12.16b, { v12.16b }, v16.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v31.4s - orr v2.16b, v2.16b, v15.16b - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - add v7.4s, v7.4s, v29.4s - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v2.4s - orr v0.16b, v0.16b, v15.16b - mov v15.16b, v31.16b - add v17.4s, v17.4s, v22.4s - eor v31.16b, v14.16b, v4.16b - eor v22.16b, v11.16b, v7.16b - add v11.4s, v13.4s, v27.4s - tbl v3.16b, { v3.16b }, v19.16b - add v11.4s, v11.4s, v0.4s - tbl v31.16b, { v31.16b }, v19.16b - add v6.4s, v6.4s, v3.4s - eor v12.16b, v12.16b, v11.16b - tbl v22.16b, { v22.16b }, v19.16b - add v8.4s, v8.4s, v31.4s - eor v10.16b, v6.16b, v10.16b - add v30.4s, v11.4s, v30.4s - tbl v11.16b, { v12.16b }, v19.16b - add v1.4s, v1.4s, v22.4s - eor v9.16b, v8.16b, v9.16b - ushr v12.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - add v5.4s, v5.4s, v11.4s - eor v2.16b, v1.16b, v2.16b - orr v10.16b, v10.16b, v12.16b - ushr v12.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v9.16b, v9.16b, v12.16b - ushr v12.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v2.16b, v2.16b, v12.16b - ushr v12.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v12.16b - add v4.4s, v4.4s, v26.4s - add v17.4s, v17.4s, v0.4s - add v7.4s, v7.4s, v28.4s - mov v18.16b, v27.16b - eor v31.16b, v31.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v27.4s, v30.4s, v2.4s - eor v22.16b, v22.16b, v4.16b - add v7.4s, v7.4s, v9.4s - eor v3.16b, v3.16b, v27.16b - add v26.4s, v27.4s, v29.4s - tbl v27.16b, { v31.16b }, v16.16b - eor v28.16b, v11.16b, v7.16b - tbl v22.16b, { v22.16b }, v16.16b - add v1.4s, v1.4s, v27.4s - add v4.4s, v4.4s, v23.4s - ldr q23, [sp, #144] - tbl v28.16b, { v28.16b }, v16.16b - tbl v3.16b, { v3.16b }, v16.16b - add v5.4s, v5.4s, v22.4s - eor v0.16b, v0.16b, v1.16b - add v6.4s, v6.4s, v28.4s - add v29.4s, v8.4s, v3.4s - eor v30.16b, v5.16b, v10.16b - ushr v8.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v31.16b, v6.16b, v9.16b - orr v0.16b, v0.16b, v8.16b - ushr v8.4s, v30.4s, #12 - shl v30.4s, v30.4s, #20 - eor v2.16b, v29.16b, v2.16b - orr v30.16b, v30.16b, v8.16b - ushr v8.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - add v17.4s, v17.4s, v25.4s - add v7.4s, v7.4s, v23.4s - orr v31.16b, v31.16b, v8.16b - ushr v8.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ldur q23, [x29, #-176] - orr v2.16b, v2.16b, v8.16b - add v17.4s, v17.4s, v0.4s - eor v27.16b, v27.16b, v17.16b - add v4.4s, v4.4s, v30.4s - add v25.4s, v26.4s, v2.4s - eor v22.16b, v22.16b, v4.16b - add v4.4s, v4.4s, v24.4s - add v7.4s, v7.4s, v31.4s - eor v3.16b, v3.16b, v25.16b - add v24.4s, v25.4s, v18.4s - tbl v25.16b, { v27.16b }, v19.16b - add v17.4s, v17.4s, v23.4s - eor v23.16b, v28.16b, v7.16b - tbl v22.16b, { v22.16b }, v19.16b - add v1.4s, v1.4s, v25.4s - tbl v23.16b, { v23.16b }, v19.16b - tbl v3.16b, { v3.16b }, v19.16b - add v5.4s, v5.4s, v22.4s - eor v0.16b, v0.16b, v1.16b - add v6.4s, v6.4s, v23.4s - add v26.4s, v29.4s, v3.4s - eor v27.16b, v5.16b, v30.16b - ushr v29.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v28.16b, v6.16b, v31.16b - orr v0.16b, v0.16b, v29.16b - ushr v29.4s, v27.4s, #7 - shl v27.4s, v27.4s, #25 - eor v2.16b, v26.16b, v2.16b - orr v27.16b, v27.16b, v29.16b - ushr v29.4s, v28.4s, #7 - shl v28.4s, v28.4s, #25 - ldur q18, [x29, #-128] - orr v28.16b, v28.16b, v29.16b - ushr v29.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v7.4s, v7.4s, v15.4s - orr v2.16b, v2.16b, v29.16b - add v17.4s, v17.4s, v27.4s - add v4.4s, v4.4s, v28.4s - add v7.4s, v7.4s, v2.4s - eor v3.16b, v3.16b, v17.16b - add v17.4s, v17.4s, v20.4s - eor v20.16b, v25.16b, v4.16b - add v4.4s, v4.4s, v21.4s - eor v21.16b, v22.16b, v7.16b - add v7.4s, v7.4s, v18.4s - add v18.4s, v24.4s, v0.4s - eor v22.16b, v23.16b, v18.16b - ldr q23, [sp, #160] - tbl v3.16b, { v3.16b }, v16.16b - tbl v20.16b, { v20.16b }, v16.16b - add v6.4s, v6.4s, v3.4s - add v18.4s, v18.4s, v23.4s - tbl v21.16b, { v21.16b }, v16.16b - tbl v16.16b, { v22.16b }, v16.16b - add v22.4s, v26.4s, v20.4s - eor v23.16b, v6.16b, v27.16b - add v1.4s, v1.4s, v21.4s - eor v24.16b, v22.16b, v28.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v5.4s, v5.4s, v16.4s - eor v2.16b, v1.16b, v2.16b - orr v23.16b, v23.16b, v25.16b - ushr v25.4s, v24.4s, #12 + add v2.4s, v2.4s, v20.4s + stp q15, q20, [sp, #288] + add v1.4s, v1.4s, v3.4s + ldr q3, [sp, #96] + dup v20.4s, w14 + mov v23.16b, v22.16b + mov v15.16b, v10.16b + eor v6.16b, v1.16b, v3.16b + ldr q3, [sp, #80] + add v1.4s, v1.4s, v28.4s + ldr q28, [sp, #272] + str q23, [sp, #128] + eor v7.16b, v2.16b, v3.16b + ldp q27, q3, [sp, #32] + add v2.4s, v2.4s, v22.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v7.16b, { v7.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v0.16b, { v0.16b }, v27.16b + add v19.4s, v6.4s, v14.4s + add v21.4s, v7.4s, v3.4s + add v30.4s, v16.4s, v17.4s + add v31.4s, v0.4s, v20.4s + eor v24.16b, v19.16b, v24.16b + eor v17.16b, v21.16b, v18.16b + ushr v18.4s, v24.4s, #12 + shl v20.4s, v24.4s, #20 + eor v24.16b, v30.16b, v25.16b + eor v25.16b, v31.16b, v26.16b + ushr v26.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + ushr v29.4s, v24.4s, #12 shl v24.4s, v24.4s, #20 - eor v0.16b, v5.16b, v0.16b - orr v24.16b, v24.16b, v25.16b - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - ushr v25.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - orr v0.16b, v0.16b, v25.16b - add v25.4s, v7.4s, v2.4s - add v26.4s, v18.4s, v0.4s - eor v18.16b, v21.16b, v25.16b - add v17.4s, v17.4s, v23.4s - add v4.4s, v4.4s, v24.4s - eor v16.16b, v16.16b, v26.16b - tbl v21.16b, { v18.16b }, v19.16b - eor v3.16b, v3.16b, v17.16b - eor v7.16b, v20.16b, v4.16b - tbl v16.16b, { v16.16b }, v19.16b - add v1.4s, v1.4s, v21.4s - tbl v3.16b, { v3.16b }, v19.16b - tbl v20.16b, { v7.16b }, v19.16b - eor v2.16b, v1.16b, v2.16b - eor v7.16b, v1.16b, v17.16b - add v1.4s, v5.4s, v16.4s - eor v0.16b, v1.16b, v0.16b - eor v18.16b, v1.16b, v4.16b - add v1.4s, v6.4s, v3.4s - eor v4.16b, v1.16b, v23.16b - eor v6.16b, v25.16b, v1.16b - add v1.4s, v22.4s, v20.4s - eor v5.16b, v1.16b, v24.16b - eor v17.16b, v26.16b, v1.16b - ushr v1.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - orr v1.16b, v4.16b, v1.16b - ushr v4.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v4.16b, v5.16b, v4.16b - ushr v5.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v2.16b, v2.16b, v5.16b + ushr v8.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + orr v3.16b, v20.16b, v18.16b + ldr q18, [x10, :lo12:.LCPI3_2] + orr v13.16b, v17.16b, v26.16b + orr v24.16b, v24.16b, v29.16b + orr v14.16b, v25.16b, v8.16b + add v8.4s, v1.4s, v3.4s + add v29.4s, v2.4s, v13.4s + add v17.4s, v4.4s, v24.4s + add v20.4s, v5.4s, v14.4s + eor v1.16b, v6.16b, v8.16b + eor v2.16b, v7.16b, v29.16b + eor v4.16b, v16.16b, v17.16b + eor v0.16b, v0.16b, v20.16b + tbl v25.16b, { v1.16b }, v18.16b + tbl v16.16b, { v2.16b }, v18.16b + tbl v6.16b, { v4.16b }, v18.16b + tbl v4.16b, { v0.16b }, v18.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v30.4s, v6.4s + add v7.4s, v31.4s, v4.4s + eor v0.16b, v19.16b, v3.16b + eor v1.16b, v21.16b, v13.16b + eor v2.16b, v26.16b, v24.16b + eor v3.16b, v7.16b, v14.16b ushr v5.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v5.16b - eor v10.16b, v0.16b, v20.16b - eor v11.16b, v1.16b, v21.16b - eor v19.16b, v4.16b, v16.16b - cmp x0, x22 - eor v16.16b, v2.16b, v3.16b - mov w6, w19 - b.ne .LBB2_4 -.LBB2_7: - zip1 v0.4s, v7.4s, v18.4s - zip2 v1.4s, v7.4s, v18.4s - zip1 v2.4s, v6.4s, v17.4s - zip2 v3.4s, v6.4s, v17.4s - zip1 v4.4s, v10.4s, v11.4s - zip2 v5.4s, v10.4s, v11.4s - zip1 v6.4s, v19.4s, v16.4s - zip2 v7.4s, v19.4s, v16.4s - add x15, x20, #4 - tst w5, #0x1 - sub x28, x28, #4 - zip1 v16.2d, v0.2d, v2.2d - zip2 v0.2d, v0.2d, v2.2d - zip1 v2.2d, v1.2d, v3.2d - zip2 v1.2d, v1.2d, v3.2d - zip1 v3.2d, v4.2d, v6.2d - zip2 v4.2d, v4.2d, v6.2d - zip1 v6.2d, v5.2d, v7.2d - zip2 v5.2d, v5.2d, v7.2d - add x24, x24, #32 - csel x20, x15, x20, ne - cmp x28, #3 - stp q16, q3, [x26] - stp q0, q4, [x26, #32] - stp q2, q6, [x26, #64] - stp q1, q5, [x26, #96] - add x26, x26, #128 - b.hi .LBB2_2 -.LBB2_8: - cbz x28, .LBB2_16 - orr w8, w7, w19 - and x21, x5, #0x1 - stur w8, [x29, #-64] -.LBB2_10: - ldr x8, [sp, #40] - ldr x25, [x24] - ldur w4, [x29, #-64] - ldp q1, q0, [x8] - mov x8, x22 - stp q1, q0, [x29, #-48] -.LBB2_11: - subs x23, x8, #1 - b.eq .LBB2_13 - cbnz x8, .LBB2_14 - b .LBB2_15 -.LBB2_13: - orr w4, w4, w27 -.LBB2_14: - sub x0, x29, #48 - mov w2, #64 - mov x1, x25 - mov x3, x20 - bl zfs_blake3_compress_in_place_sse41 - add x25, x25, #64 - mov x8, x23 - mov w4, w19 - b .LBB2_11 -.LBB2_15: - ldp q0, q1, [x29, #-48] - add x20, x20, x21 - add x24, x24, #8 - subs x28, x28, #1 - stp q0, q1, [x26], #32 - b.ne .LBB2_10 -.LBB2_16: - add sp, sp, #448 - ldp x20, x19, [sp, #144] - ldp x22, x21, [sp, #128] - ldp x24, x23, [sp, #112] - ldp x26, x25, [sp, #96] - ldp x28, x27, [sp, #80] - ldp x29, x30, [sp, #64] + ushr v24.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v30.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v5.16b, v0.16b, v5.16b + orr v0.16b, v1.16b, v24.16b + ushr v31.4s, v3.4s, #7 + orr v2.16b, v2.16b, v30.16b + ldp q24, q30, [sp, #208] + shl v3.4s, v3.4s, #25 + zip2 v14.2d, v12.2d, v9.2d + mov v22.16b, v24.16b + orr v1.16b, v3.16b, v31.16b + zip2 v3.2d, v24.2d, v30.2d + mov v24.16b, v28.16b + mov v22.d[1], v30.d[0] + ldr q30, [sp, #240] + mov v31.16b, v12.16b + stp q22, q14, [sp, #224] + mov v24.d[1], v30.d[0] + add v12.4s, v8.4s, v22.4s + mov v31.d[1], v9.d[0] + add v22.4s, v29.4s, v24.4s + ldr q29, [sp, #176] + zip2 v28.2d, v28.2d, v30.2d + mov v9.16b, v24.16b + mov v15.d[1], v29.d[0] + zip2 v8.2d, v10.2d, v29.2d + add v10.4s, v12.4s, v0.4s + add v22.4s, v22.4s, v2.4s + str q9, [sp, #144] + add v20.4s, v20.4s, v15.4s + add v17.4s, v17.4s, v31.4s + stp q3, q8, [sp, #192] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v3.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v28.4s + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v8.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v14.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v23.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v11.4s + mov v30.16b, v28.16b + mov v28.16b, v23.16b + ldr q23, [sp, #304] + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + mov v29.16b, v31.16b + ldr q31, [sp, #160] + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v23.4s + orr v1.16b, v1.16b, v12.16b + str q29, [sp, #272] + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v31.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + add v22.4s, v22.4s, v24.4s + ldr q24, [sp, #320] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v24.4s + ldr q24, [sp, #352] + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v24.4s + ldr q24, [sp, #336] + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v14.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v10.4s, v10.4s, v24.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v29.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v8.4s + ldr q8, [sp, #288] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v3.4s + ldr q3, [sp, #352] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v30.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v8.4s + mov v24.16b, v30.16b + mov v30.16b, v15.16b + add v17.4s, v17.4s, v15.4s + ldr q15, [sp, #224] + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + str q30, [sp, #176] + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v15.4s + orr v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v9.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v14.4s + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v28.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v11.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v23.4s + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v31.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v30.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + add v10.4s, v10.4s, v3.4s + ldr q3, [sp, #192] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v3.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v15.4s + ldr q15, [sp, #128] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v24.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v21.16b, v5.16b + ldp q23, q11, [sp, #320] + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v8.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v23.4s + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + mov v28.16b, v31.16b + mov v31.16b, v8.16b + ldr q8, [sp, #208] + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v11.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v8.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v30.4s + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v9.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v14.4s + ldr q14, [sp, #256] + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v3.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v15.4s + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v14.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v8.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v10.4s, v10.4s, v28.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v24.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v11.4s + ldr q11, [sp, #304] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v31.4s + ldr q31, [sp, #224] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v23.4s + ldr q23, [sp, #240] + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v11.4s + mov v30.16b, v8.16b + mov v8.16b, v24.16b + ldr q24, [sp, #352] + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + str q8, [sp, #112] + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v24.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v31.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + mov v29.16b, v3.16b + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v30.4s + ldr q30, [sp, #272] + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + mov v3.16b, v28.16b + ldr q28, [sp, #176] + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v30.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v28.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v8.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v9.4s + ldr q9, [sp, #320] + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v23.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v31.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + add v10.4s, v10.4s, v14.4s + ldr q14, [sp, #288] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v14.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v24.4s + orr v1.16b, v1.16b, v13.16b + eor v4.16b, v4.16b, v10.16b + add v17.4s, v17.4s, v9.4s + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + tbl v4.16b, { v4.16b }, v27.16b + add v17.4s, v17.4s, v1.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + add v26.4s, v26.4s, v4.4s + eor v16.16b, v16.16b, v17.16b + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v19.4s, v19.4s, v16.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v21.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v19.16b, v1.16b + add v10.4s, v10.4s, v11.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + add v22.4s, v22.4s, v15.4s + orr v5.16b, v5.16b, v13.16b + add v20.4s, v20.4s, v3.4s + mov v24.16b, v3.16b + ldr q3, [sp, #336] + orr v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v10.16b + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v3.4s + add v20.4s, v20.4s, v5.4s + tbl v4.16b, { v4.16b }, v18.16b + eor v25.16b, v25.16b, v22.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v20.16b + add v26.4s, v26.4s, v4.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v16.16b, v16.16b, v17.16b + tbl v6.16b, { v6.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + add v7.4s, v7.4s, v25.4s + tbl v16.16b, { v16.16b }, v18.16b + add v21.4s, v21.4s, v6.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v2.16b, v7.16b, v2.16b + add v19.4s, v19.4s, v16.4s + eor v5.16b, v21.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v1.16b, v19.16b, v1.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v8.4s + orr v2.16b, v2.16b, v12.16b + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v0.4s + add v10.4s, v10.4s, v29.4s + ldr q29, [sp, #208] + add v17.4s, v17.4s, v31.4s + orr v1.16b, v1.16b, v12.16b + add v20.4s, v20.4s, v29.4s + eor v16.16b, v16.16b, v22.16b + add v10.4s, v10.4s, v5.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + tbl v16.16b, { v16.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + add v21.4s, v21.4s, v16.4s + tbl v25.16b, { v25.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + add v19.4s, v19.4s, v25.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v5.16b, v5.16b, v19.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v14.4s + mov v8.16b, v31.16b + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + mov v31.16b, v14.16b + ushr v14.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v0.4s + add v10.4s, v10.4s, v28.4s + ldr q28, [sp, #352] + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + add v17.4s, v17.4s, v30.4s + add v20.4s, v20.4s, v3.4s + eor v16.16b, v16.16b, v22.16b + add v10.4s, v10.4s, v5.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + tbl v16.16b, { v16.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + add v21.4s, v21.4s, v16.4s + tbl v25.16b, { v25.16b }, v18.16b + tbl v6.16b, { v6.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + add v19.4s, v19.4s, v25.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v5.16b, v19.16b, v5.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v10.4s, v10.4s, v23.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v14.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v12.16b + add v10.4s, v10.4s, v0.4s + add v20.4s, v20.4s, v24.4s + ldr q24, [sp, #144] + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + add v22.4s, v22.4s, v9.4s + add v17.4s, v17.4s, v11.4s + eor v4.16b, v4.16b, v10.16b + add v20.4s, v20.4s, v5.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + add v21.4s, v21.4s, v6.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v5.16b, v21.16b, v5.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v15.4s + ushr v14.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + mov v30.16b, v3.16b + ldr q3, [sp, #256] + ushr v12.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v14.16b + add v20.4s, v20.4s, v3.4s + orr v2.16b, v2.16b, v12.16b + orr v1.16b, v1.16b, v13.16b + add v22.4s, v22.4s, v24.4s + add v17.4s, v17.4s, v28.4s + eor v4.16b, v4.16b, v10.16b + add v20.4s, v20.4s, v5.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + add v21.4s, v21.4s, v6.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + mov v23.16b, v9.16b + ldr q9, [sp, #112] + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v14.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v12.16b + add v9.4s, v10.4s, v9.4s + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + ldr q14, [sp, #64] + add v22.4s, v22.4s, v31.4s + add v17.4s, v17.4s, v30.4s + add v20.4s, v20.4s, v8.4s + add v9.4s, v9.4s, v5.4s + add v22.4s, v22.4s, v0.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + eor v25.16b, v25.16b, v9.16b + eor v16.16b, v16.16b, v22.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + tbl v4.16b, { v4.16b }, v27.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + eor v5.16b, v5.16b, v19.16b + eor v0.16b, v21.16b, v0.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + ushr v30.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ushr v10.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v12.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v5.16b, v5.16b, v30.16b + add v30.4s, v9.4s, v29.4s + add v22.4s, v22.4s, v23.4s + ldr q23, [sp, #192] + orr v0.16b, v0.16b, v10.16b + orr v2.16b, v2.16b, v12.16b + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v23.4s + add v20.4s, v20.4s, v28.4s + add v23.4s, v30.4s, v5.4s + add v22.4s, v22.4s, v0.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + eor v25.16b, v25.16b, v23.16b + eor v16.16b, v16.16b, v22.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + tbl v25.16b, { v25.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + tbl v6.16b, { v6.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + eor v5.16b, v19.16b, v5.16b + eor v0.16b, v21.16b, v0.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + ushr v28.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ushr v30.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v31.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v8.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v28.16b + ldr q28, [sp, #176] + orr v0.16b, v0.16b, v30.16b + orr v2.16b, v2.16b, v31.16b + orr v1.16b, v1.16b, v8.16b + add v23.4s, v23.4s, v28.4s + add v22.4s, v22.4s, v11.4s + add v17.4s, v17.4s, v15.4s + add v20.4s, v20.4s, v3.4s + ldr q3, [sp, #272] + add v23.4s, v23.4s, v0.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + add v20.4s, v20.4s, v5.4s + eor v4.16b, v4.16b, v23.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v20.16b + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s + add v21.4s, v21.4s, v6.4s + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + eor v5.16b, v21.16b, v5.16b + add v3.4s, v22.4s, v3.4s + ldr q22, [sp, #160] + ushr v28.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v29.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ushr v30.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v31.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v22.4s + ldr q22, [sp, #240] + orr v0.16b, v0.16b, v28.16b + prfm pldl1keep, [x23, #256] + orr v2.16b, v2.16b, v29.16b + prfm pldl1keep, [x24, #256] + orr v1.16b, v1.16b, v30.16b + prfm pldl1keep, [x22, #256] + orr v5.16b, v5.16b, v31.16b + prfm pldl1keep, [x25, #256] + add v23.4s, v23.4s, v24.4s + add v20.4s, v20.4s, v22.4s + add v3.4s, v3.4s, v2.4s + add v17.4s, v17.4s, v1.4s + add v22.4s, v23.4s, v0.4s + add v20.4s, v20.4s, v5.4s + eor v23.16b, v25.16b, v3.16b + eor v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v22.16b + eor v6.16b, v6.16b, v20.16b + tbl v23.16b, { v23.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + tbl v6.16b, { v6.16b }, v18.16b + add v7.4s, v7.4s, v23.4s + add v19.4s, v19.4s, v16.4s + add v18.4s, v26.4s, v4.4s + add v21.4s, v21.4s, v6.4s + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + eor v0.16b, v18.16b, v0.16b + eor v5.16b, v21.16b, v5.16b + ushr v25.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v26.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v27.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v0.16b, v0.16b, v24.16b + orr v2.16b, v2.16b, v25.16b + orr v1.16b, v1.16b, v26.16b + orr v5.16b, v5.16b, v27.16b + movi v13.4s, #64 + eor v29.16b, v19.16b, v22.16b + eor v8.16b, v21.16b, v3.16b + eor v30.16b, v17.16b, v18.16b + eor v31.16b, v20.16b, v7.16b + eor v24.16b, v5.16b, v23.16b + eor v18.16b, v0.16b, v16.16b + eor v25.16b, v2.16b, v6.16b + eor v26.16b, v1.16b, v4.16b + cbnz x21, .LBB3_5 + b .LBB3_2 +.LBB3_6: + cbz x1, .LBB3_14 + adrp x12, .LCPI3_3 + ldr q0, [x11, :lo12:.LCPI3_1] + orr w11, w7, w6 + ldr q2, [x10, :lo12:.LCPI3_2] + ldr q1, [x12, :lo12:.LCPI3_3] + and x12, x5, #0x1 +.LBB3_8: + movi v3.4s, #64 + lsr x13, x4, #32 + ldp q5, q4, [x3] + mov x15, x2 + mov w14, w11 + mov v3.s[0], w4 + ldr x10, [x0] + mov v3.s[1], w13 + b .LBB3_11 +.LBB3_9: + orr w14, w14, w9 +.LBB3_10: + ldp q6, q7, [x10] + mov v16.16b, v3.16b + and w14, w14, #0xff + add v5.4s, v5.4s, v4.4s + mov x15, x13 + mov v16.s[3], w14 + add x14, x10, #32 + uzp1 v17.4s, v6.4s, v7.4s + add x10, x10, #64 + add v5.4s, v5.4s, v17.4s + eor v16.16b, v5.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v18.4s, v16.4s, v1.4s + eor v19.16b, v18.16b, v4.16b + uzp2 v4.4s, v6.4s, v7.4s + ushr v6.4s, v19.4s, #12 + shl v7.4s, v19.4s, #20 + ld2 { v19.4s, v20.4s }, [x14] + add v5.4s, v5.4s, v4.4s + mov w14, w6 + orr v6.16b, v7.16b, v6.16b + add v5.4s, v5.4s, v6.4s + eor v7.16b, v16.16b, v5.16b + add v5.4s, v5.4s, v19.4s + tbl v7.16b, { v7.16b }, v2.16b + ext v5.16b, v5.16b, v5.16b, #12 + add v16.4s, v18.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v6.16b, v6.16b, v16.16b + ext v16.16b, v16.16b, v16.16b, #4 + ushr v18.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr v6.16b, v6.16b, v18.16b + ext v18.16b, v20.16b, v20.16b, #12 + add v5.4s, v5.4s, v6.4s + eor v7.16b, v5.16b, v7.16b + add v5.4s, v5.4s, v18.4s + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v6.16b, v6.16b, v16.16b + ushr v21.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v21.16b + uzp1 v21.4s, v17.4s, v17.4s + add v5.4s, v5.4s, v6.4s + ext v21.16b, v21.16b, v17.16b, #8 + eor v7.16b, v7.16b, v5.16b + uzp2 v21.4s, v21.4s, v4.4s + tbl v7.16b, { v7.16b }, v2.16b + add v5.4s, v5.4s, v21.4s + add v16.4s, v16.4s, v7.4s + ext v5.16b, v5.16b, v5.16b, #4 + ext v7.16b, v7.16b, v7.16b, #8 + eor v6.16b, v6.16b, v16.16b + ushr v22.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr v6.16b, v6.16b, v22.16b + add v22.4s, v5.4s, v6.4s + eor v5.16b, v22.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #12 + tbl v16.16b, { v5.16b }, v0.16b + ext v5.16b, v17.16b, v17.16b, #12 + add v7.4s, v7.4s, v16.4s + ext v5.16b, v17.16b, v5.16b, #12 + ext v17.16b, v19.16b, v19.16b, #12 + mov v19.16b, v18.16b + eor v6.16b, v6.16b, v7.16b + rev64 v5.4s, v5.4s + mov v19.s[1], v17.s[2] + ushr v20.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + trn2 v5.4s, v5.4s, v19.4s + orr v6.16b, v6.16b, v20.16b + zip1 v20.2d, v18.2d, v4.2d + zip2 v4.4s, v4.4s, v18.4s + add v19.4s, v6.4s, v5.4s + mov v20.s[3], v17.s[3] + add v19.4s, v19.4s, v22.4s + ext v22.16b, v20.16b, v20.16b, #12 + eor v16.16b, v16.16b, v19.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v16.16b, { v16.16b }, v2.16b + add v7.4s, v7.4s, v16.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor v6.16b, v6.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #4 + ushr v23.4s, v6.4s, #7 + shl v24.4s, v6.4s, #25 + uzp1 v6.4s, v20.4s, v22.4s + orr v20.16b, v24.16b, v23.16b + add v22.4s, v20.4s, v6.4s + add v19.4s, v22.4s, v19.4s + eor v16.16b, v19.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v7.4s, v7.4s, v16.4s + eor v18.16b, v20.16b, v7.16b + zip1 v20.4s, v4.4s, v17.4s + zip1 v4.4s, v17.4s, v4.4s + ushr v17.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + ext v20.16b, v4.16b, v20.16b, #8 + orr v4.16b, v18.16b, v17.16b + ext v18.16b, v21.16b, v21.16b, #4 + add v17.4s, v4.4s, v20.4s + add v17.4s, v17.4s, v19.4s + uzp1 v19.4s, v18.4s, v18.4s + eor v16.16b, v16.16b, v17.16b + ext v19.16b, v19.16b, v18.16b, #8 + tbl v16.16b, { v16.16b }, v2.16b + uzp2 v19.4s, v19.4s, v5.4s + add v7.4s, v7.4s, v16.4s + add v17.4s, v17.4s, v19.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor v4.16b, v4.16b, v7.16b + ext v17.16b, v17.16b, v17.16b, #4 + ext v7.16b, v7.16b, v7.16b, #12 + ushr v21.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + orr v4.16b, v4.16b, v21.16b + ext v21.16b, v18.16b, v18.16b, #12 + add v17.4s, v17.4s, v4.4s + ext v18.16b, v18.16b, v21.16b, #12 + mov v21.16b, v20.16b + eor v16.16b, v17.16b, v16.16b + rev64 v18.4s, v18.4s + mov v21.s[1], v6.s[2] + tbl v16.16b, { v16.16b }, v0.16b + add v7.4s, v7.4s, v16.4s + eor v4.16b, v4.16b, v7.16b + ushr v22.4s, v4.4s, #12 + shl v23.4s, v4.4s, #20 + trn2 v4.4s, v18.4s, v21.4s + orr v18.16b, v23.16b, v22.16b + add v21.4s, v18.4s, v4.4s + add v17.4s, v21.4s, v17.4s + zip1 v21.2d, v20.2d, v5.2d + zip2 v5.4s, v5.4s, v20.4s + eor v16.16b, v16.16b, v17.16b + mov v21.s[3], v6.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + zip1 v20.4s, v5.4s, v6.4s + tbl v16.16b, { v16.16b }, v2.16b + zip1 v5.4s, v6.4s, v5.4s + add v22.4s, v7.4s, v16.4s + ext v16.16b, v16.16b, v16.16b, #8 + ext v20.16b, v5.16b, v20.16b, #8 + eor v7.16b, v18.16b, v22.16b + ext v18.16b, v21.16b, v21.16b, #12 + ushr v23.4s, v7.4s, #7 + shl v24.4s, v7.4s, #25 + uzp1 v7.4s, v21.4s, v18.4s + orr v18.16b, v24.16b, v23.16b + add v21.4s, v18.4s, v7.4s + add v17.4s, v21.4s, v17.4s + ext v21.16b, v22.16b, v22.16b, #4 + eor v16.16b, v17.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v21.4s, v21.4s, v16.4s + eor v18.16b, v18.16b, v21.16b + ushr v6.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + orr v5.16b, v18.16b, v6.16b + add v6.4s, v5.4s, v20.4s + add v6.4s, v6.4s, v17.4s + ext v17.16b, v19.16b, v19.16b, #4 + eor v16.16b, v16.16b, v6.16b + uzp1 v18.4s, v17.4s, v17.4s + tbl v16.16b, { v16.16b }, v2.16b + ext v18.16b, v18.16b, v17.16b, #8 + add v19.4s, v21.4s, v16.4s + uzp2 v18.4s, v18.4s, v4.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor v5.16b, v5.16b, v19.16b + add v6.4s, v6.4s, v18.4s + ext v19.16b, v19.16b, v19.16b, #12 + ushr v21.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v5.16b, v5.16b, v21.16b + ext v21.16b, v17.16b, v17.16b, #12 + add v6.4s, v6.4s, v5.4s + ext v17.16b, v17.16b, v21.16b, #12 + mov v21.16b, v20.16b + eor v16.16b, v6.16b, v16.16b + rev64 v17.4s, v17.4s + mov v21.s[1], v7.s[2] + tbl v16.16b, { v16.16b }, v0.16b + add v19.4s, v19.4s, v16.4s + eor v5.16b, v5.16b, v19.16b + ushr v22.4s, v5.4s, #12 + shl v23.4s, v5.4s, #20 + trn2 v5.4s, v17.4s, v21.4s + orr v17.16b, v23.16b, v22.16b + add v21.4s, v17.4s, v5.4s + add v6.4s, v21.4s, v6.4s + eor v16.16b, v16.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v21.16b, { v16.16b }, v2.16b + zip1 v16.2d, v20.2d, v4.2d + zip2 v4.4s, v4.4s, v20.4s + add v19.4s, v19.4s, v21.4s + mov v16.s[3], v7.s[3] + ext v21.16b, v21.16b, v21.16b, #8 + zip1 v20.4s, v4.4s, v7.4s + eor v17.16b, v17.16b, v19.16b + ext v22.16b, v16.16b, v16.16b, #12 + ext v19.16b, v19.16b, v19.16b, #4 + zip1 v4.4s, v7.4s, v4.4s + ushr v23.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + uzp1 v16.4s, v16.4s, v22.4s + ext v4.16b, v4.16b, v20.16b, #8 + orr v17.16b, v17.16b, v23.16b + add v22.4s, v17.4s, v16.4s + add v6.4s, v22.4s, v6.4s + eor v21.16b, v6.16b, v21.16b + tbl v21.16b, { v21.16b }, v0.16b + add v19.4s, v19.4s, v21.4s + eor v17.16b, v17.16b, v19.16b + ushr v7.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v7.16b, v17.16b, v7.16b + add v17.4s, v7.4s, v4.4s + add v6.4s, v17.4s, v6.4s + ext v17.16b, v18.16b, v18.16b, #4 + eor v18.16b, v21.16b, v6.16b + uzp1 v20.4s, v17.4s, v17.4s + tbl v18.16b, { v18.16b }, v2.16b + ext v20.16b, v20.16b, v17.16b, #8 + add v19.4s, v19.4s, v18.4s + uzp2 v20.4s, v20.4s, v5.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v19.16b + add v6.4s, v6.4s, v20.4s + ushr v21.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v7.16b, v7.16b, v21.16b + add v21.4s, v6.4s, v7.4s + eor v6.16b, v21.16b, v18.16b + ext v18.16b, v19.16b, v19.16b, #12 + tbl v19.16b, { v6.16b }, v0.16b + ext v6.16b, v17.16b, v17.16b, #12 + add v18.4s, v18.4s, v19.4s + ext v6.16b, v17.16b, v6.16b, #12 + mov v17.16b, v4.16b + eor v7.16b, v7.16b, v18.16b + rev64 v6.4s, v6.4s + mov v17.s[1], v16.s[2] + ushr v22.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + trn2 v6.4s, v6.4s, v17.4s + orr v7.16b, v7.16b, v22.16b + add v17.4s, v7.4s, v6.4s + add v17.4s, v17.4s, v21.4s + zip1 v21.2d, v4.2d, v5.2d + zip2 v4.4s, v5.4s, v4.4s + eor v19.16b, v19.16b, v17.16b + mov v21.s[3], v16.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + tbl v19.16b, { v19.16b }, v2.16b + ext v22.16b, v21.16b, v21.16b, #12 + add v18.4s, v18.4s, v19.4s + ext v19.16b, v19.16b, v19.16b, #8 + eor v7.16b, v7.16b, v18.16b + ext v18.16b, v18.16b, v18.16b, #4 + ushr v23.4s, v7.4s, #7 + shl v24.4s, v7.4s, #25 + uzp1 v7.4s, v21.4s, v22.4s + orr v21.16b, v24.16b, v23.16b + add v22.4s, v21.4s, v7.4s + add v17.4s, v22.4s, v17.4s + eor v19.16b, v17.16b, v19.16b + tbl v19.16b, { v19.16b }, v0.16b + add v18.4s, v18.4s, v19.4s + eor v5.16b, v21.16b, v18.16b + zip1 v21.4s, v4.4s, v16.4s + zip1 v4.4s, v16.4s, v4.4s + ushr v16.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v21.16b, v4.16b, v21.16b, #8 + orr v4.16b, v5.16b, v16.16b + ext v16.16b, v20.16b, v20.16b, #4 + mov v23.16b, v21.16b + add v5.4s, v4.4s, v21.4s + mov v23.s[1], v7.s[2] + add v5.4s, v5.4s, v17.4s + eor v17.16b, v19.16b, v5.16b + uzp1 v19.4s, v16.4s, v16.4s + tbl v17.16b, { v17.16b }, v2.16b + ext v19.16b, v19.16b, v16.16b, #8 + add v18.4s, v18.4s, v17.4s + uzp2 v19.4s, v19.4s, v6.4s + eor v4.16b, v4.16b, v18.16b + add v5.4s, v5.4s, v19.4s + ext v19.16b, v19.16b, v19.16b, #4 + ushr v20.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v4.16b, v20.16b + ext v4.16b, v17.16b, v17.16b, #8 + add v17.4s, v5.4s, v20.4s + ext v5.16b, v18.16b, v18.16b, #12 + eor v4.16b, v17.16b, v4.16b + tbl v18.16b, { v4.16b }, v0.16b + ext v4.16b, v16.16b, v16.16b, #12 + add v22.4s, v5.4s, v18.4s + ext v4.16b, v16.16b, v4.16b, #12 + eor v5.16b, v20.16b, v22.16b + rev64 v16.4s, v4.4s + ushr v20.4s, v5.4s, #12 + shl v24.4s, v5.4s, #20 + trn2 v5.4s, v16.4s, v23.4s + orr v16.16b, v24.16b, v20.16b + add v20.4s, v16.4s, v5.4s + add v17.4s, v20.4s, v17.4s + zip1 v20.2d, v21.2d, v6.2d + zip2 v6.4s, v6.4s, v21.4s + eor v18.16b, v18.16b, v17.16b + mov v20.s[3], v7.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + zip1 v21.4s, v6.4s, v7.4s + tbl v18.16b, { v18.16b }, v2.16b + ext v24.16b, v20.16b, v20.16b, #12 + zip1 v6.4s, v7.4s, v6.4s + add v22.4s, v22.4s, v18.4s + ext v18.16b, v18.16b, v18.16b, #8 + ext v6.16b, v6.16b, v21.16b, #8 + eor v16.16b, v16.16b, v22.16b + ext v22.16b, v22.16b, v22.16b, #4 + zip1 v5.2d, v6.2d, v5.2d + zip2 v4.4s, v4.4s, v6.4s + ushr v25.4s, v16.4s, #7 + shl v26.4s, v16.4s, #25 + uzp1 v16.4s, v20.4s, v24.4s + orr v20.16b, v26.16b, v25.16b + mov v5.s[3], v16.s[3] + add v24.4s, v20.4s, v16.4s + add v17.4s, v24.4s, v17.4s + eor v18.16b, v17.16b, v18.16b + tbl v18.16b, { v18.16b }, v0.16b + add v22.4s, v22.4s, v18.4s + eor v20.16b, v20.16b, v22.16b + ushr v7.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v7.16b, v20.16b, v7.16b + add v20.4s, v7.4s, v6.4s + add v17.4s, v20.4s, v17.4s + ext v20.16b, v19.16b, v19.16b, #8 + eor v18.16b, v18.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #4 + tbl v18.16b, { v18.16b }, v2.16b + add v21.4s, v22.4s, v18.4s + uzp2 v22.4s, v20.4s, v23.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v21.16b + ext v20.16b, v22.16b, v20.16b, #4 + ushr v22.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + add v17.4s, v17.4s, v20.4s + ext v20.16b, v21.16b, v21.16b, #12 + ext v21.16b, v19.16b, v19.16b, #12 + orr v7.16b, v7.16b, v22.16b + ext v19.16b, v19.16b, v21.16b, #12 + add v17.4s, v17.4s, v7.4s + mov v21.16b, v6.16b + rev64 v19.4s, v19.4s + eor v18.16b, v17.16b, v18.16b + mov v21.s[1], v16.s[2] + tbl v18.16b, { v18.16b }, v0.16b + trn2 v19.4s, v19.4s, v21.4s + add v20.4s, v20.4s, v18.4s + eor v7.16b, v7.16b, v20.16b + ushr v22.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + orr v7.16b, v7.16b, v22.16b + add v19.4s, v7.4s, v19.4s + add v17.4s, v19.4s, v17.4s + eor v18.16b, v18.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #12 + tbl v18.16b, { v18.16b }, v2.16b + add v19.4s, v20.4s, v18.4s + ext v20.16b, v5.16b, v5.16b, #12 + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v19.16b + uzp1 v5.4s, v5.4s, v20.4s + ushr v21.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + orr v7.16b, v7.16b, v21.16b + add v5.4s, v7.4s, v5.4s + add v5.4s, v5.4s, v17.4s + eor v17.16b, v5.16b, v18.16b + ext v18.16b, v19.16b, v19.16b, #4 + tbl v17.16b, { v17.16b }, v0.16b + add v18.4s, v18.4s, v17.4s + eor v6.16b, v7.16b, v18.16b + zip1 v7.4s, v4.4s, v16.4s + zip1 v4.4s, v16.4s, v4.4s + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + ext v4.16b, v4.16b, v7.16b, #8 + orr v6.16b, v6.16b, v16.16b + add v4.4s, v6.4s, v4.4s + add v4.4s, v4.4s, v5.4s + eor v5.16b, v17.16b, v4.16b + ext v4.16b, v4.16b, v4.16b, #4 + tbl v5.16b, { v5.16b }, v2.16b + add v7.4s, v18.4s, v5.4s + eor v6.16b, v6.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr v6.16b, v6.16b, v16.16b + ext v16.16b, v5.16b, v5.16b, #8 + eor v5.16b, v4.16b, v7.16b + eor v4.16b, v6.16b, v16.16b +.LBB3_11: + subs x13, x15, #1 + b.eq .LBB3_9 + cbnz x15, .LBB3_10 + add x4, x4, x12 + add x0, x0, #8 + subs x1, x1, #1 + stp q5, q4, [x8], #32 + b.ne .LBB3_8 +.LBB3_14: + add sp, sp, #368 + ldp x20, x19, [sp, #128] + ldp x22, x21, [sp, #112] + ldp x24, x23, [sp, #96] + ldp x26, x25, [sp, #80] + ldp x29, x27, [sp, #64] ldp d9, d8, [sp, #48] ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] - ldp d15, d14, [sp], #160 + ldp d15, d14, [sp], #144 ret -.Lfunc_end2: - .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41 +.Lfunc_end3: + .size zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41 .cfi_endproc .section ".note.GNU-stack","",@progbits -#endif +#endif \ No newline at end of file