[PATCH] Add some NEON blitters for 16-bit framebuffers.
Jonathan Morton
jmorton at sd070.hel.movial.fi
Thu May 7 02:20:02 PDT 2009
---
pixman/pixman-arm-neon.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
pixman/pixman-arm-neon.h | 30 ++++++
pixman/pixman-pict.c | 12 +++
pixman/pixman-utils.c | 1 +
4 files changed, 279 insertions(+), 1 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 51f7d55..3517d2d 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2009 ARM Ltd
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -21,6 +21,7 @@
* SOFTWARE.
*
* Author: Ian Rickards (ian.rickards at arm.com)
+ * Author: Jonathan Morton (jonathan.morton at movial.com)
*
*/
@@ -31,6 +32,9 @@
#include "pixman-arm-neon.h"
#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
@@ -1376,3 +1380,234 @@ fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op,
}
}
+#ifdef USE_GCC_INLINE_ASM
+
+void
+fbCompositeSrc_16x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint16_t *dstLine, *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!height || !width)
+ return;
+
+ /* We simply copy 16-bit-aligned pixels from one place to another. */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ /* Preload the first input scanline */
+ {
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+
+ asm volatile (
+ "0: @ loop \n"
+ " SUBS %[count], %[count], #32 \n"
+ " PLD [%[src]] \n"
+ " ADD %[src], %[src], #64 \n"
+ " BGT 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ );
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+ uint32_t tmp = 0;
+
+ // Uses multi-register access and preloading to maximise bandwidth.
+ // Each pixel is one halfword, so a quadword holds 8 pixels.
+ // The preload frequency assumes a 64-byte cacheline.
+ asm volatile (
+ " CMP %[count], #64 \n"
+ " BLT 1f @ skip oversized fragments \n"
+ "0: @ start with eight quadwords at a time \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " SUB %[count], %[count], #64 \n"
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VLD1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d24,d25,d26,d27}, [%[src]]! \n"
+ " VLD1.16 {d28,d29,d30,d31}, [%[src]]! \n"
+ " CMP %[count], #64 \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " VST1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " VST1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
+ " VST1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " BGE 0b \n"
+ " CMP %[count], #0 \n"
+ " BEQ 7f @ aligned fastpath \n"
+ "1: @ four quadwords \n"
+ " TST %[count], #32 \n"
+ " BEQ 2f @ skip oversized fragment \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VLD1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " VST1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ "2: @ two quadwords \n"
+ " TST %[count], #16 \n"
+ " BEQ 3f @ skip oversized fragment \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ "3: @ one quadword \n"
+ " TST %[count], #8 \n"
+ " BEQ 4f @ skip oversized fragment \n"
+ " VLD1.16 {d16,d17}, [%[src]]! \n"
+ " VST1.16 {d16,d17}, [%[dst]]! \n"
+ "4: @ one doubleword \n"
+ " TST %[count], #4 \n"
+ " BEQ 5f @ skip oversized fragment \n"
+ " VLD1.16 {d16}, [%[src]]! \n"
+ " VST1.16 {d16}, [%[dst]]! \n"
+ "5: @ one word \n"
+ " TST %[count], #2 \n"
+ " LDRNE %[tmp], [%[src]], #4 \n"
+ " STRNE %[tmp], [%[dst]], #4 \n"
+ " @ one halfword \n"
+ " TST %[count], #1 \n"
+ " BEQ 7f @ skip oversized fragment \n"
+ " LDRH %[tmp], [%[src]] \n"
+ " STRH %[tmp], [%[dst]] \n"
+ "7: @ end \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
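
(Aside: stripped of the NEON batching and the PLD scheduling, the blitter
above is just a per-scanline 16-bit copy. A rough, untested C sketch of the
same operation for reference -- the helper name is for illustration only;
strides are in uint16_t units, as set up by fbComposeGetStart:)

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical scalar equivalent of fbCompositeSrc_16x16neon.
     * Hints the prefetcher at the next scanline one 64-byte
     * cacheline (32 pixels) at a time, as the PLDs in the asm do. */
    static void
    copy_scanlines_0565 (uint16_t *dst, const uint16_t *src,
                         uint32_t dstStride, uint32_t srcStride,
                         uint16_t width, uint16_t height)
    {
        while (height--)
        {
            uint32_t i;
            for (i = 0; i < width; i += 32)
                __builtin_prefetch (src + srcStride + i);
            memcpy (dst, src, width * sizeof (uint16_t));
            src += srcStride;
            dst += dstStride;
        }
    }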
+
+
+void
+fbCompositeSrc_24x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint16_t *dstLine;
+ uint32_t *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!width || !height)
+ return;
+
+ /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ /* Preload the first input scanline */
+ {
+ uint32_t *srcPtr = srcLine;
+ uint32_t count = width;
+ asm volatile (
+ "0: @ loop \n"
+ " SUBS %[count], %[count], #16 \n"
+ " PLD [%[src]] \n"
+ " ADD %[src], %[src], #64 \n"
+ " BGT 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ );
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint32_t *srcPtr = srcLine;
+ uint32_t count = width;
+ uint32_t tmp1 = 0;
+ uint32_t tmp2 = 0;
+
+ // This is not as aggressive as the RGB565-source case.
+ // When the formats differ, the source is generally an image in cached RAM, so preloading it pays off.
+ // We don't need to blend, so we never read back from the (uncached) destination framebuffer.
+ asm volatile (
+ " CMP %[count], #16 \n"
+ " BLT 1f @ skip oversized fragments \n"
+ "0: @ start with sixteen pixels at a time \n"
+ " SUB %[count], %[count], #16 \n"
+ " PLD [%[src], %[srcStride], LSL #2] @ preload from next scanline \n"
+ " VLD4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are RGB. \n"
+ " VLD4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are RGB. \n"
+ " VSHLL.U8 q8, d2, #8 @ expand first red for repacking \n"
+ " VSHLL.U8 q10, d1, #8 @ expand first green for repacking \n"
+ " VSHLL.U8 q11, d0, #8 @ expand first blue for repacking \n"
+ " VSHLL.U8 q9, d6, #8 @ expand second red for repacking \n"
+ " VSRI.U16 q8, q10, #5 @ insert first green after red \n"
+ " VSHLL.U8 q10, d5, #8 @ expand second green for repacking \n"
+ " VSRI.U16 q8, q11, #11 @ insert first blue after green \n"
+ " VSHLL.U8 q11, d4, #8 @ expand second blue for repacking \n"
+ " VSRI.U16 q9, q10, #5 @ insert second green after red \n"
+ " VSRI.U16 q9, q11, #11 @ insert second blue after green \n"
+ " CMP %[count], #16 \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
+ " BGE 0b \n"
+ "1: @ finish with individual pixels \n"
+ " CMP %[count], #0 @ if we are aligned, don't overrun \n"
+ " BEQ 3f \n"
+ " PLD [%[src], %[srcStride], LSL #2] @ preload from next scanline \n"
+ "2: @ small loop \n"
+ " LDR %[tmp1], [%[src]], #4 @ load whole pixel and post-increment \n"
+ " LSR %[tmp2], %[tmp1], #3 @ push blue to bottom \n"
+ " LSR %[tmp1], %[tmp1], #10 @ truncate to green field \n"
+ " BFI %[tmp2], %[tmp1], #5, #6 @ insert green in middle \n"
+ " LSR %[tmp1], %[tmp1], #9 @ truncate to red field \n"
+ " BFI %[tmp2], %[tmp1], #11, #5 @ insert red at top \n"
+ " STRH %[tmp2], [%[dst]], #2 @ store pixel and post-increment \n"
+ " SUBS %[count], %[count], #1 \n"
+ " BNE 1b \n"
+ "3: @end \n"
+
+ // Clobbered input and working registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count),
+ [tmp1] "+r" (tmp1), [tmp2] "+r" (tmp2)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
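
(The scalar tail above packs one x8r8g8b8 pixel into r5g6b5 with shifts and
BFI; the same packing in plain C, as a hypothetical helper for comparison:)

    /* Keep the top 5/6/5 bits of each channel of p = x8r8g8b8. */
    static inline uint16_t
    pack_8888_to_0565 (uint32_t p)
    {
        return (uint16_t) (((p >> 8) & 0xf800)    /* R bits 23:19 -> 15:11 */
                         | ((p >> 5) & 0x07e0)    /* G bits 15:10 -> 10:5  */
                         | ((p >> 3) & 0x001f));  /* B bits  7:3  ->  4:0  */
    }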
+
+#endif /* USE_GCC_INLINE_ASM */
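
(The vectorised repack in the main loop can also be written with the
arm_neon.h intrinsics; here is a rough, untested sketch of a single 8-pixel
step -- the asm above keeps 16 pixels in flight to hide latency:)

    #include <arm_neon.h>

    /* Hypothetical intrinsics rendition of one VLD4/VSHLL/VSRI step:
     * de-interleave B,G,R,A, widen each channel into the top byte of
     * a halfword, then insert green and blue below red with VSRI. */
    static inline void
    repack_8px_8888_to_0565 (uint16_t *dst, const uint32_t *src)
    {
        uint8x8x4_t px = vld4_u8 ((const uint8_t *) src); /* val[0]=B ... val[3]=A */
        uint16x8_t r = vshll_n_u8 (px.val[2], 8);   /* R in bits 15:8 */
        uint16x8_t g = vshll_n_u8 (px.val[1], 8);   /* G in bits 15:8 */
        uint16x8_t b = vshll_n_u8 (px.val[0], 8);   /* B in bits 15:8 */
        r = vsriq_n_u16 (r, g, 5);    /* now R[15:11] G[10:5] 0[4:0]  */
        r = vsriq_n_u16 (r, b, 11);   /* now R[15:11] G[10:5] B[4:0]  */
        vst1q_u16 (dst, r);
    }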
diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h
index acfe8a4..b0c2036 100644
--- a/pixman/pixman-arm-neon.h
+++ b/pixman/pixman-arm-neon.h
@@ -134,4 +134,34 @@ fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op,
uint16_t width,
uint16_t height);
+void
+fbCompositeSrc_16x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
+void
+fbCompositeSrc_24x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
#endif /* USE_ARM_NEON */
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index 548d38d..8524b81 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1641,6 +1641,18 @@ static const FastPathInfo arm_neon_fast_paths[] =
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
+
+#ifdef USE_GCC_INLINE_ASM
+ { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_24x16neon, 0 },
+#endif
+
{ PIXMAN_OP_NONE },
};
#endif
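
(A note on the fast-path table above: r5g6b5 and b5g6r5 carry no alpha
channel, so a 565 source is implicitly opaque and OVER degenerates to a
plain copy, which is why the 16x16 blitter is registered for both
PIXMAN_OP_SRC and PIXMAN_OP_OVER:)

    /* OVER:  dst = src + (1 - src.alpha) * dst
     * A 565 source has no alpha channel, so src.alpha == 1, giving
     *         dst = src   -- exactly the SRC operator.
     * The 8888 sources are registered for SRC only, since OVER with
     * a real alpha channel would require actual blending. */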
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index a1b7492..c5fb3c6 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -30,6 +30,7 @@
#include "pixman-private.h"
#include "pixman-mmx.h"
#include "pixman-sse2.h"
+#include "pixman-arm-neon.h"
#if defined(USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
--
1.5.6.3