[PATCH] Add some NEON blitters for 16-bit framebuffers.
Jonathan Morton
jmorton at sd070.hel.movial.fi
Thu May 7 02:20:02 PDT 2009
---
pixman/pixman-arm-neon.c | 237 +++++++++++++++++++++++++++++++++++++++++++++-
pixman/pixman-arm-neon.h | 30 ++++++
pixman/pixman-pict.c | 12 +++
pixman/pixman-utils.c | 1 +
4 files changed, 279 insertions(+), 1 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 51f7d55..3517d2d 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2009 ARM Ltd
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -21,6 +21,7 @@
* SOFTWARE.
*
* Author: Ian Rickards (ian.rickards at arm.com)
+ * Author: Jonathan Morton (jonathan.morton at movial.com)
*
*/
@@ -31,6 +32,9 @@
#include "pixman-arm-neon.h"
#include <arm_neon.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
@@ -1376,3 +1380,234 @@ fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op,
}
}
+#ifdef USE_GCC_INLINE_ASM
+
+void
+fbCompositeSrc_16x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint16_t *dstLine, *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!height || !width)
+ return;
+
+ /* We simply copy 16-bit-aligned pixels from one place to another. */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ /* Preload the first input scanline */
+ {
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+
+ asm volatile (
+ "0: @ loop \n"
+ " SUBS %[count], %[count], #32 \n"
+ " PLD [%[src]] \n"
+ " ADD %[src], %[src], #64 \n"
+ " BGT 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ );
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+ uint32_t tmp = 0;
+
+ // Uses multi-register access and preloading to maximise bandwidth.
+ // Each pixel is one halfword, so a quadword holds 8 pixels.
+ // The preload frequency assumes a 64-byte cacheline.
+ asm volatile (
+ " CMP %[count], #64 \n"
+ " BLT 1f @ skip oversized fragments \n"
+ "0: @ start with eight quadwords at a time \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " SUB %[count], %[count], #64 \n"
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VLD1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d24,d25,d26,d27}, [%[src]]! \n"
+ " VLD1.16 {d28,d29,d30,d31}, [%[src]]! \n"
+ " CMP %[count], #64 \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " VST1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " VST1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
+ " VST1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " BGE 0b \n"
+ " CMP %[count], #0 \n"
+ " BEQ 7f @ aligned fastpath \n"
+ "1: @ four quadwords \n"
+ " TST %[count], #32 \n"
+ " BEQ 2f @ skip oversized fragment \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VLD1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " VST1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ "2: @ two quadwords \n"
+ " TST %[count], #16 \n"
+ " BEQ 3f @ skip oversized fragment \n"
+ " PLD [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " VLD1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ "3: @ one quadword \n"
+ " TST %[count], #8 \n"
+ " BEQ 4f @ skip oversized fragment \n"
+ " VLD1.16 {d16,d17}, [%[src]]! \n"
+ " VST1.16 {d16,d17}, [%[dst]]! \n"
+ "4: @ one doubleword \n"
+ " TST %[count], #4 \n"
+ " BEQ 5f @ skip oversized fragment \n"
+ " VLD1.16 {d16}, [%[src]]! \n"
+ " VST1.16 {d16}, [%[dst]]! \n"
+ "5: @ one word \n"
+ " TST %[count], #2 \n"
+ " LDRNE %[tmp], [%[src]], #4 \n"
+ " STRNE %[tmp], [%[dst]], #4 \n"
+ " @ one halfword \n"
+ " TST %[count], #1 \n"
+ " BEQ 7f @ skip oversized fragment \n"
+ " LDRH %[tmp], [%[src]] \n"
+ " STRH %[tmp], [%[dst]] \n"
+ "7: @ end \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
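
(Aside: stripped of the NEON batching and the PLD scheduling, the blitter
above is just a per-scanline 16-bit copy. A rough, untested C sketch of the
same operation for reference -- the helper name is for illustration only;
strides are in uint16_t units, as set up by fbComposeGetStart:)

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical scalar equivalent of fbCompositeSrc_16x16neon.
     * Hints the prefetcher at the next scanline one 64-byte
     * cacheline (32 pixels) at a time, as the PLDs in the asm do. */
    static void
    copy_scanlines_0565 (uint16_t *dst, const uint16_t *src,
                         uint32_t dstStride, uint32_t srcStride,
                         uint16_t width, uint16_t height)
    {
        while (height--)
        {
            uint32_t i;
            for (i = 0; i < width; i += 32)
                __builtin_prefetch (src + srcStride + i);
            memcpy (dst, src, width * sizeof (uint16_t));
            src += srcStride;
            dst += dstStride;
        }
    }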
+
+
+void
+fbCompositeSrc_24x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint16_t *dstLine;
+ uint32_t *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!width || !height)
+ return;
+
+ /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ /* Preload the first input scanline */
+ {
+ uint32_t *srcPtr = srcLine;
+ uint32_t count = width;
+ asm volatile (
+ "0: @ loop \n"
+ " SUBS %[count], %[count], #16 \n"
+ " PLD [%[src]] \n"
+ " ADD %[src], %[src], #64 \n"
+ " BGT 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ );
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint32_t *srcPtr = srcLine;
+ uint32_t count = width;
+ uint32_t tmp1 = 0;
+ uint32_t tmp2 = 0;
+
+ // This is not as aggressive as the RGB565-source case.
+ // When the formats differ, the source is generally an image in cached RAM, so preloading it pays off.
+ // We don't need to blend, so we never read back from the (uncached) destination framebuffer.
+ asm volatile (
+ " CMP %[count], #16 \n"
+ " BLT 1f @ skip oversized fragments \n"
+ "0: @ start with sixteen pixels at a time \n"
+ " SUB %[count], %[count], #16 \n"
+ " PLD [%[src], %[srcStride], LSL #2] @ preload from next scanline \n"
+ " VLD4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are RGB. \n"
+ " VLD4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are RGB. \n"
+ " VSHLL.U8 q8, d2, #8 @ expand first red for repacking \n"
+ " VSHLL.U8 q10, d1, #8 @ expand first green for repacking \n"
+ " VSHLL.U8 q11, d0, #8 @ expand first blue for repacking \n"
+ " VSHLL.U8 q9, d6, #8 @ expand second red for repacking \n"
+ " VSRI.U16 q8, q10, #5 @ insert first green after red \n"
+ " VSHLL.U8 q10, d5, #8 @ expand second green for repacking \n"
+ " VSRI.U16 q8, q11, #11 @ insert first blue after green \n"
+ " VSHLL.U8 q11, d4, #8 @ expand second blue for repacking \n"
+ " VSRI.U16 q9, q10, #5 @ insert second green after red \n"
+ " VSRI.U16 q9, q11, #11 @ insert second blue after green \n"
+ " CMP %[count], #16 \n"
+ " VST1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
+ " BGE 0b \n"
+ "1: @ finish with individual pixels \n"
+ " CMP %[count], #0 @ if we are aligned, don't overrun \n"
+ " BEQ 3f \n"
+ " PLD [%[src], %[srcStride], LSL #2] @ preload from next scanline \n"
+ "2: @ small loop \n"
+ " LDR %[tmp1], [%[src]], #4 @ load whole pixel and post-increment \n"
+ " LSR %[tmp2], %[tmp1], #3 @ push blue to bottom \n"
+ " LSR %[tmp1], %[tmp1], #10 @ truncate to green field \n"
+ " BFI %[tmp2], %[tmp1], #5, #6 @ insert green in middle \n"
+ " LSR %[tmp1], %[tmp1], #9 @ truncate to red field \n"
+ " BFI %[tmp2], %[tmp1], #11, #5 @ insert red at top \n"
+ " STRH %[tmp2], [%[dst]], #2 @ store pixel and post-increment \n"
+ " SUBS %[count], %[count], #1 \n"
+ " BNE 1b \n"
+ "3: @end \n"
+
+ // Clobbered input and working registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count),
+ [tmp1] "+r" (tmp1), [tmp2] "+r" (tmp2)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
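
(The scalar tail above packs one x8r8g8b8 pixel into r5g6b5 with shifts and
BFI; the same packing in plain C, as a hypothetical helper for comparison:)

    /* Keep the top 5/6/5 bits of each channel of p = x8r8g8b8. */
    static inline uint16_t
    pack_8888_to_0565 (uint32_t p)
    {
        return (uint16_t) (((p >> 8) & 0xf800)    /* R bits 23:19 -> 15:11 */
                         | ((p >> 5) & 0x07e0)    /* G bits 15:10 -> 10:5  */
                         | ((p >> 3) & 0x001f));  /* B bits  7:3  ->  4:0  */
    }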
+
+#endif /* USE_GCC_INLINE_ASM */
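
(The vectorised repack in the main loop can also be written with the
arm_neon.h intrinsics; here is a rough, untested sketch of a single 8-pixel
step -- the asm above keeps 16 pixels in flight to hide latency:)

    #include <arm_neon.h>

    /* Hypothetical intrinsics rendition of one VLD4/VSHLL/VSRI step:
     * de-interleave B,G,R,A, widen each channel into the top byte of
     * a halfword, then insert green and blue below red with VSRI. */
    static inline void
    repack_8px_8888_to_0565 (uint16_t *dst, const uint32_t *src)
    {
        uint8x8x4_t px = vld4_u8 ((const uint8_t *) src); /* val[0]=B ... val[3]=A */
        uint16x8_t r = vshll_n_u8 (px.val[2], 8);   /* R in bits 15:8 */
        uint16x8_t g = vshll_n_u8 (px.val[1], 8);   /* G in bits 15:8 */
        uint16x8_t b = vshll_n_u8 (px.val[0], 8);   /* B in bits 15:8 */
        r = vsriq_n_u16 (r, g, 5);    /* now R[15:11] G[10:5] 0[4:0]  */
        r = vsriq_n_u16 (r, b, 11);   /* now R[15:11] G[10:5] B[4:0]  */
        vst1q_u16 (dst, r);
    }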
diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h
index acfe8a4..b0c2036 100644
--- a/pixman/pixman-arm-neon.h
+++ b/pixman/pixman-arm-neon.h
@@ -134,4 +134,34 @@ fbCompositeSrcAdd_8888x8x8neon (pixman_op_t op,
uint16_t width,
uint16_t height);
+void
+fbCompositeSrc_16x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
+void
+fbCompositeSrc_24x16neon (
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height);
+
#endif /* USE_ARM_NEON */
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index 548d38d..8524b81 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1641,6 +1641,18 @@ static const FastPathInfo arm_neon_fast_paths[] =
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
+
+#ifdef USE_GCC_INLINE_ASM
+ { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_16x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_24x16neon, 0 },
+ { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_24x16neon, 0 },
+#endif
+
{ PIXMAN_OP_NONE },
};
#endif
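
(A note on the fast-path table above: r5g6b5 and b5g6r5 carry no alpha
channel, so a 565 source is implicitly opaque and OVER degenerates to a
plain copy, which is why the 16x16 blitter is registered for both
PIXMAN_OP_SRC and PIXMAN_OP_OVER:)

    /* OVER:  dst = src + (1 - src.alpha) * dst
     * A 565 source has no alpha channel, so src.alpha == 1, giving
     *         dst = src   -- exactly the SRC operator.
     * The 8888 sources are registered for SRC only, since OVER with
     * a real alpha channel would require actual blending. */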
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index a1b7492..c5fb3c6 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -30,6 +30,7 @@
#include "pixman-private.h"
#include "pixman-mmx.h"
#include "pixman-sse2.h"
+#include "pixman-arm-neon.h"
#if defined(USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
--
1.5.6.3