Minor performance fix for NEON RAID-Z

The NEON code replicates too closely the SSE code, including a masked 16-bits shift. But NEON, like AltiVec (#9539), has unsigned 8-bits shift, so use that instead and drop the masking. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Romain Dolbeau <romain.dolbeau@european-processor-initiative.eu> Closes #9725
2025-07-03 06:27:35 +03:00 · 2019-12-18 04:34:52 +01:00 · 2019-12-18 04:34:52 +01:00 · 118fc3ef07
commit 118fc3ef07
parent fe564845c0
1 changed files with 2 additions and 4 deletions
--- a/module/zfs/vdev_raidz_math_aarch64_neon_common.h
+++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@ -479,10 +479,8 @@ typedef struct v {
 		/* upper part */					\
 		"and v14.16b," VR0(r) ".16b,v15.16b\n"			\
 		"and v13.16b," VR1(r) ".16b,v15.16b\n"			\
-		"sshr " VR0(r) ".8h," VR0(r) ".8h,#4\n"			\
-		"sshr " VR1(r) ".8h," VR1(r) ".8h,#4\n"			\
-		"and " VR0(r) ".16b," VR0(r) ".16b,v15.16b\n"		\
-		"and " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n"		\
+		"ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n"		\
+		"ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n"		\
 									\
 		"tbl v12.16b,{v10.16b}," VR0(r) ".16b\n"		\
 		"tbl v10.16b,{v10.16b}," VR1(r) ".16b\n"		\