From 118fc3ef07c53a88ea1d4c21142a2b01c4648434 Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu>
Date: Wed, 18 Dec 2019 04:34:52 +0100
Subject: [PATCH] Minor performance fix for NEON RAID-Z

The NEON code replicates too closely the SSE code, including
a masked 16-bits shift. But NEON, like AltiVec (#9539), has
unsigned 8-bits shift, so use that instead and drop the masking.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu>
Closes #9725
---
 module/zfs/vdev_raidz_math_aarch64_neon_common.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
index 5312b9094..92a50b3a0 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h
+++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -479,10 +479,8 @@ typedef struct v {
 		/* upper part */					\
 		"and v14.16b," VR0(r) ".16b,v15.16b\n"			\
 		"and v13.16b," VR1(r) ".16b,v15.16b\n"			\
-		"sshr " VR0(r) ".8h," VR0(r) ".8h,#4\n"			\
-		"sshr " VR1(r) ".8h," VR1(r) ".8h,#4\n"			\
-		"and " VR0(r) ".16b," VR0(r) ".16b,v15.16b\n"		\
-		"and " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n"		\
+		"ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n"		\
+		"ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n"		\
 									\
 		"tbl v12.16b,{v10.16b}," VR0(r) ".16b\n"		\
 		"tbl v10.16b,{v10.16b}," VR1(r) ".16b\n"		\