[PATCH] Delete now-unused old NEON format-converting blitter.
Jonathan Morton
jmorton at sd070.hel.movial.fi
Tue Jun 2 05:21:37 PDT 2009
---
pixman/pixman-arm-neon.c | 107 ----------------------------------------------
1 files changed, 0 insertions(+), 107 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index c2a0e03..46b872f 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -438,113 +438,6 @@ fbCompositeSrc_8888x8888neon (
void
-fbCompositeSrc_x888x0565neon (
- pixman_implementation_t * impl,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint16_t *dstLine, *dst;
- uint32_t *srcLine, *src;
- int dstStride, srcStride;
- uint32_t w;
-
- fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
-
- if (width>=8)
- {
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- do {
- while (w>=8)
- {
-#ifndef USE_GCC_INLINE_ASM
- vst1q_u16(dst, pack0565(vld4_u8((void*)src)));
-#else
- asm volatile (
- "vld4.8 {d4-d7}, [%[src]]\n\t"
- "vshll.u8 q0, d6, #8\n\t"
- "vshll.u8 q1, d5, #8\n\t"
- "vsriq.u16 q0, q1, #5\t\n"
- "vshll.u8 q1, d4, #8\n\t"
- "vsriq.u16 q0, q1, #11\t\n"
- "vst1.16 {q0}, [%[dst]]\n\t"
- :
- : [dst] "r" (dst), [src] "r" (src)
- : "memory", "d0","d1","d2","d3","d4","d5","d6","d7"
- );
-#endif
- src+=8;
- dst+=8;
- w-=8;
- }
- if (w != 0)
- {
- src -= (8-w);
- dst -= (8-w);
- w = 8; // do another vector
- }
- } while (w!=0);
- }
- }
- else
- {
- // Handle width<8
- while (height--)
- {
- dst = dstLine;
- dstLine += dstStride;
- src = srcLine;
- srcLine += srcStride;
- w = width;
-
- while (w>=2)
- {
- uint32x2_t sval, rgb, g, b;
- sval = vld1_u32(src);
- rgb = vshr_n_u32(sval,8-5); // r (5 bits)
- g = vshr_n_u32(sval,8+8-6); // g to bottom byte
- rgb = vsli_n_u32(rgb, g, 5);
- b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte
- rgb = vsli_n_u32(rgb, b, 11);
- vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0);
- vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),2);
- src+=2;
- w-=2;
- }
- if (w)
- {
- uint32x2_t sval, rgb, g, b;
- sval = vld1_dup_u32(src);
- rgb = vshr_n_u32(sval,8-5); // r (5 bits)
- g = vshr_n_u32(sval,8+8-6); // g to bottom byte
- rgb = vsli_n_u32(rgb, g, 5);
- b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte
- rgb = vsli_n_u32(rgb, b, 11);
- vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0);
- }
- }
- }
-}
-
-
-void
fbCompositeSrc_8888x8x8888neon (
pixman_implementation_t * impl,
pixman_op_t op,
--
1.5.6.3
-------------- next part --------------
Attachment: 0001-First-patch-to-send-upstream.patch (text/x-patch)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5453dbb..6158925 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2009 ARM Ltd
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
@@ -21,6 +21,8 @@
* SOFTWARE.
*
* Author: Ian Rickards (ian.rickards at arm.com)
+ * Author: Jonathan Morton (jonathan.morton at movial.com)
+ * Author: Markku Vire (markku.vire at movial.com)
*
*/
@@ -31,7 +33,12 @@
#include "pixman-arm-neon.h"
#include <arm_neon.h>
+#include <string.h>
+// Deal with an intrinsic that is defined differently in GCC
+#if !defined(__ARMCC_VERSION) && !defined(__pld)
+#define __pld(_x) __builtin_prefetch(_x)
+#endif
static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
{
@@ -1391,22 +1398,659 @@ fbCompositeSrcAdd_8888x8x8neon (
}
}
+#ifdef USE_GCC_INLINE_ASM
+
+void
+fbCompositeSrc_16x16neon (
+ pixman_implementation_t * impl,
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int32_t xSrc,
+ int32_t ySrc,
+ int32_t xMask,
+ int32_t yMask,
+ int32_t xDst,
+ int32_t yDst,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dstLine, *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!height || !width)
+ return;
+
+ /* We simply copy 16-bit-aligned pixels from one place to another. */
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
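+ /* Note: srcStride and dstStride are in halfword (uint16_t) units,
+    which is why the preloads below use an offset of srcStride, LSL #1. */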
+
+ /* Preload the first input scanline */
+ {
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+
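+ /* Each pld fetches one 64-byte cacheline, i.e. 32 of these 16bpp pixels. */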
+ asm volatile (
+ "0: @ loop \n"
+ " subs %[count], %[count], #32 \n"
+ " pld [%[src]] \n"
+ " add %[src], %[src], #64 \n"
+ " bgt 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ : // no unclobbered inputs
+ : "cc"
+ );
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint16_t *srcPtr = srcLine;
+ uint32_t count = width;
+ uint32_t tmp = 0;
+
+ // Uses multi-register access and preloading to maximise bandwidth.
+ // Each pixel is one halfword, so a quadword contains 8px.
+ // The preload frequency assumes a 64-byte cacheline.
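+ // The remaining pixel count is consumed in power-of-two fragments:
+ // 64-pixel blocks in the main loop, then 32/16/8/4/2/1-pixel tails
+ // selected by testing the low bits of the count.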
+ asm volatile (
+ " cmp %[count], #64 \n"
+ " blt 1f @ skip oversized fragments \n"
+ "0: @ start with eight quadwords at a time \n"
+ " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " sub %[count], %[count], #64 \n"
+ " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " vld1.16 {d24,d25,d26,d27}, [%[src]]! \n"
+ " vld1.16 {d28,d29,d30,d31}, [%[src]]! \n"
+ " cmp %[count], #64 \n"
+ " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " vst1.16 {d24,d25,d26,d27}, [%[dst]]! \n"
+ " vst1.16 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " bge 0b \n"
+ " cmp %[count], #0 \n"
+ " beq 7f @ aligned fastpath \n"
+ "1: @ four quadwords \n"
+ " tst %[count], #32 \n"
+ " beq 2f @ skip oversized fragment \n"
+ " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vld1.16 {d20,d21,d22,d23}, [%[src]]! \n"
+ " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vst1.16 {d20,d21,d22,d23}, [%[dst]]! \n"
+ "2: @ two quadwords \n"
+ " tst %[count], #16 \n"
+ " beq 3f @ skip oversized fragment \n"
+ " pld [%[src], %[srcStride], LSL #1] \n" // preload from next scanline
+ " vld1.16 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vst1.16 {d16,d17,d18,d19}, [%[dst]]! \n"
+ "3: @ one quadword \n"
+ " tst %[count], #8 \n"
+ " beq 4f @ skip oversized fragment \n"
+ " vld1.16 {d16,d17}, [%[src]]! \n"
+ " vst1.16 {d16,d17}, [%[dst]]! \n"
+ "4: @ one doubleword \n"
+ " tst %[count], #4 \n"
+ " beq 5f @ skip oversized fragment \n"
+ " vld1.16 {d16}, [%[src]]! \n"
+ " vst1.16 {d16}, [%[dst]]! \n"
+ "5: @ one word \n"
+ " tst %[count], #2 \n"
+ " beq 6f @ skip oversized fragment \n"
+ " ldr %[tmp], [%[src]], #4 \n"
+ " str %[tmp], [%[dst]], #4 \n"
+ "6: @ one halfword \n"
+ " tst %[count], #1 \n"
+ " beq 7f @ skip oversized fragment \n"
+ " ldrh %[tmp], [%[src]] \n"
+ " strh %[tmp], [%[dst]] \n"
+ "7: @ end \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+ );
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
+
+#endif /* USE_GCC_INLINE_ASM */
+
+void
+fbCompositeSrc_24x16neon (
+ pixman_implementation_t * impl,
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int32_t xSrc,
+ int32_t ySrc,
+ int32_t xMask,
+ int32_t yMask,
+ int32_t xDst,
+ int32_t yDst,
+ int32_t width,
+ int32_t height)
+{
+ uint16_t *dstLine;
+ uint32_t *srcLine;
+ uint32_t dstStride, srcStride;
+
+ if(!width || !height)
+ return;
+
+ /* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
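+ /* Only the low 24 bits (r, g, b) of each source pixel are used; the top (x/a) byte is ignored. */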
+ fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ /* Preload the first input scanline */
+ {
+ uint8_t *srcPtr = (uint8_t*) srcLine;
+ uint32_t count = (width + 15) / 16;
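+ /* The source is 32bpp, so a scanline is width*4 bytes; one pld per
+    64-byte cacheline gives (width + 15) / 16 preloads. */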
+
+#ifdef USE_GCC_INLINE_ASM
+ asm volatile (
+ "0: @ loop \n"
+ " subs %[count], %[count], #1 \n"
+ " pld [%[src]] \n"
+ " add %[src], %[src], #64 \n"
+ " bgt 0b \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [src] "+r" (srcPtr), [count] "+r" (count)
+ : // no unclobbered inputs
+ : "cc"
+ );
+#else
+ do {
+ __pld(srcPtr);
+ srcPtr += 64;
+ } while(--count);
+#endif
+ }
+
+ while(height--) {
+ uint16_t *dstPtr = dstLine;
+ uint32_t *srcPtr = srcLine;
+ uint32_t count = width;
+ const uint32_t RBmask = 0x1F;
+ const uint32_t Gmask = 0x3F;
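+ /* r5g6b5 field widths: 5 bits for red/blue (RBmask), 6 bits for green (Gmask). */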
+
+ // If you're going to complain about a goto, take a long hard look
+ // at the massive blocks of assembler this skips over. ;-)
+ if(count < 8)
+ goto smallStuff;
+
+#ifdef USE_GCC_INLINE_ASM
+
+ // This is not as aggressive as the RGB565-source case.
+ // Generally the source is in cached RAM when the formats are different, so we use preload.
+ // We don't need to blend, so we are not reading from the uncached framebuffer.
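+ // vld4.8 deinterleaves the b, g, r, a bytes of eight pixels into separate
+ // D registers; vshll.u8 then widens each channel into the top of a 16-bit
+ // lane and vsri.u16 merges them into r5g6b5 order.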
+ asm volatile (
+ " cmp %[count], #16 \n"
+ " blt 1f @ skip oversized fragments \n"
+ "0: @ start with sixteen pixels at a time \n"
+ " sub %[count], %[count], #16 \n"
+ " pld [%[src], %[srcStride], lsl #2] @ preload from next scanline \n"
+ " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
+ " vld4.8 {d4,d5,d6,d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n"
+ " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
+ " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
+ " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
+ " vshll.u8 q9, d6, #8 @ expand second red for repacking \n"
+ " vsri.u16 q8, q10, #5 @ insert first green after red \n"
+ " vshll.u8 q10, d5, #8 @ expand second green for repacking \n"
+ " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
+ " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n"
+ " vsri.u16 q9, q10, #5 @ insert second green after red \n"
+ " vsri.u16 q9, q11, #11 @ insert second blue after green \n"
+ " cmp %[count], #16 \n"
+ " vst1.16 {d16,d17,d18,d19}, [%[dst]]! @ store 16 pixels \n"
+ " bge 0b \n"
+ "1: @ end of main loop \n"
+ " cmp %[count], #8 @ can we still do an 8-pixel block? \n"
+ " blt 2f \n"
+ " sub %[count], %[count], #8 \n"
+ " pld [%[src], %[srcStride], lsl #2] @ preload from next scanline \n"
+ " vld4.8 {d0,d1,d2,d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n"
+ " vshll.u8 q8, d2, #8 @ expand first red for repacking \n"
+ " vshll.u8 q10, d1, #8 @ expand first green for repacking \n"
+ " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n"
+ " vsri.u16 q8, q10, #5 @ insert first green after red \n"
+ " vsri.u16 q8, q11, #11 @ insert first blue after green \n"
+ " vst1.16 {d16,d17}, [%[dst]]! @ store 8 pixels \n"
+ "2: @ end \n"
+
+ // Clobbered input and working registers marked as input/outputs
+ : [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count)
+
+ // Unclobbered input
+ : [srcStride] "r" (srcStride)
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
+ );
+#else
+ // A copy of the above code, in intrinsics-form.
+ // This should be pretty self-documenting...
+ while(count >= 16) {
+ uint8x8x4_t pixelSetA, pixelSetB;
+ uint16x8_t redA, greenA, blueA;
+ uint16x8_t redB, greenB, blueB;
+ uint16x8_t destPixelsA, destPixelsB;
+
+ count -= 16;
+ __pld(srcPtr + srcStride);
+ pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+ pixelSetB = vld4_u8((uint8_t*)(srcPtr+8));
+ srcPtr += 16;
+
+ redA = vshll_n_u8(pixelSetA.val[2], 8);
+ greenA = vshll_n_u8(pixelSetA.val[1], 8);
+ blueA = vshll_n_u8(pixelSetA.val[0], 8);
+ redB = vshll_n_u8(pixelSetB.val[2], 8);
+ greenB = vshll_n_u8(pixelSetB.val[1], 8);
+ blueB = vshll_n_u8(pixelSetB.val[0], 8);
+ destPixelsA = vsriq_n_u16(redA, greenA, 5);
+ destPixelsB = vsriq_n_u16(redB, greenB, 5);
+ destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+ destPixelsB = vsriq_n_u16(destPixelsB, blueB, 11);
+
+ // There doesn't seem to be an intrinsic for the double-quadword variant
+ vst1q_u16(dstPtr , destPixelsA);
+ vst1q_u16(dstPtr+8, destPixelsB);
+ dstPtr += 16;
+ }
+
+ // 8-pixel loop
+ if(count >= 8) {
+ uint8x8x4_t pixelSetA;
+ uint16x8_t redA, greenA, blueA;
+ uint16x8_t destPixelsA;
+
+ __pld(srcPtr + srcStride);
+ count -= 8;
+ pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+ srcPtr += 8;
+
+ redA = vshll_n_u8(pixelSetA.val[2], 8);
+ greenA = vshll_n_u8(pixelSetA.val[1], 8);
+ blueA = vshll_n_u8(pixelSetA.val[0], 8);
+ destPixelsA = vsriq_n_u16(redA, greenA, 5);
+ destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+
+ vst1q_u16(dstPtr , destPixelsA);
+ dstPtr += 8;
+ }
+
+#endif // USE_GCC_INLINE_ASM
+
+ smallStuff:
+
+ if(count)
+ __pld(srcPtr + srcStride);
+
+ while(count >= 2) {
+ uint32_t srcPixelA = *srcPtr++;
+ uint32_t srcPixelB = *srcPtr++;
+
+ // ARM is really good at shift-then-ALU ops.
+ // This should be a total of six shift-ANDs and five shift-ORs.
+ uint32_t dstPixelsA;
+ uint32_t dstPixelsB;
+
+ dstPixelsA = ((srcPixelA >> 3) & RBmask);
+ dstPixelsA |= ((srcPixelA >> 10) & Gmask) << 5;
+ dstPixelsA |= ((srcPixelA >> 19) & RBmask) << 11;
+
+ dstPixelsB = ((srcPixelB >> 3) & RBmask);
+ dstPixelsB |= ((srcPixelB >> 10) & Gmask) << 5;
+ dstPixelsB |= ((srcPixelB >> 19) & RBmask) << 11;
+
+ // little-endian mode only
+ *((uint32_t*) dstPtr) = dstPixelsA | (dstPixelsB << 16);
+ dstPtr += 2;
+ count -= 2;
+ }
+
+ if(count) {
+ uint32_t srcPixel = *srcPtr++;
+
+ // ARM is really good at shift-then-ALU ops.
+ // This block should end up as three shift-ANDs and two shift-ORs.
+ uint32_t tmpBlue = (srcPixel >> 3) & RBmask;
+ uint32_t tmpGreen = (srcPixel >> 10) & Gmask;
+ uint32_t tmpRed = (srcPixel >> 19) & RBmask;
+ uint16_t dstPixel = (tmpRed << 11) | (tmpGreen << 5) | tmpBlue;
+
+ *dstPtr++ = dstPixel;
+ count--;
+ }
+
+ srcLine += srcStride;
+ dstLine += dstStride;
+ }
+}
+
+
+pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t _xor)
+{
+ uint32_t byte_stride, color;
+ char *dst;
+
+ /* stride is always multiple of 32bit units in pixman */
+ byte_stride = stride * sizeof(uint32_t);
+
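+ /* Replicate the fill value across a 32-bit word (for 8 and 16 bpp) so that
+    the byte, halfword, word and NEON stores below all write the same pattern. */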
+ switch (bpp)
+ {
+ case 8:
+ dst = ((char *) bits) + y * byte_stride + x;
+ _xor &= 0xff;
+ color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
+ break;
+ case 16:
+ dst = ((char *) bits) + y * byte_stride + x * 2;
+ _xor &= 0xffff;
+ color = _xor << 16 | _xor;
+ width *= 2; /* width to bytes */
+ break;
+ case 32:
+ dst = ((char *) bits) + y * byte_stride + x * 4;
+ color = _xor;
+ width *= 4; /* width to bytes */
+ break;
+ default:
+ return FALSE;
+ }
+
+#ifdef USE_GCC_INLINE_ASM
+ if (width < 16)
+ /* We have a special case for small widths that don't allow us to use
+ wide 128-bit stores. We don't waste time trying to align writes,
+ since there are only very few of them anyway */
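+ /* The byte width (< 16) is decomposed into at most one 8-, 4-, 2- and
+    1-byte store per scanline. */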
+ asm volatile (
+ "cmp %[height], #0\n" /* Check if empty fill */
+ "beq 3f\n"
+ "vdup.32 d0, %[color]\n" /* Fill the color to neon req */
+
+ /* Check if the width can easily be handled by a single operation per
+ scanline. This significantly reduces the number of test/branch
+ instructions for each scanline */
+ "cmp %[width], #8\n"
+ "beq 4f\n"
+ "cmp %[width], #4\n"
+ "beq 5f\n"
+ "cmp %[width], #2\n"
+ "beq 6f\n"
+
+ /* Loop starts here for each scanline */
+ "1:\n"
+ "mov r4, %[dst]\n" /* Starting address of the current line */
+ "tst %[width], #8\n"
+ "beq 2f\n"
+ "vst1.8 {d0}, [r4]!\n"
+ "2:\n"
+ "tst %[width], #4\n"
+ "beq 2f\n"
+ "str %[color], [r4]!\n"
+ "2:\n"
+ "tst %[width], #2\n"
+ "beq 2f\n"
+ "strh %[color], [r4]!\n"
+ "2:\n"
+ "tst r5, #1\n"
+ "beq 2f\n"
+ "strb %[color], [r4]!\n"
+ "2:\n"
+
+ "subs %[height], %[height], #1\n"
+ "add %[dst], %[dst], %[byte_stride]\n"
+ "bne 1b\n"
+ "b 3f\n"
+
+ /* Special fillers for those widths that we can do with a single operation */
+ "4:\n"
+ "subs %[height], %[height], #1\n"
+ "vst1.8 {d0}, [%[dst]]\n"
+ "add %[dst], %[dst], %[byte_stride]\n"
+ "bne 4b\n"
+ "b 3f\n"
+
+ "5:\n"
+ "subs %[height], %[height], #1\n"
+ "str %[color], [%[dst]]\n"
+ "add %[dst], %[dst], %[byte_stride]\n"
+ "bne 5b\n"
+ "b 3f\n"
+
+ "6:\n"
+ "subs %[height], %[height], #1\n"
+ "strh %[color], [%[dst]]\n"
+ "add %[dst], %[dst], %[byte_stride]\n"
+ "bne 6b\n"
+
+ "3:\n"
+ : /* No output members */
+ : [color] "r" (color), [height] "r" (height), [width] "r" (width),
+ [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
+ : "memory", "cc", "d0", "r4", "r5");
+ else
+ asm volatile (
+ "cmp %[height], #0\n" /* Check if empty fill */
+ "beq 5f\n"
+ "vdup.32 q0, %[color]\n" /* Fill the color to neon req */
+
+ /* Loop starts here for each scanline */
+ "1:\n"
+ "mov r4, %[dst]\n" /* Starting address of the current line */
+ "mov r5, %[width]\n" /* We're going to write this many bytes */
+ "ands r6, r4, #15\n" /* Are we at the 128-bit aligned address? */
+ "beq 2f\n" /* Jump to the best case */
+
+ /* We're not 128-bit aligned: However, we know that we can get to the
+ next aligned location, since the fill is at least 16 bytes wide */
+ "rsb r6, r6, #16\n" /* We would need to go forward this much */
+ "sub r5, r5, r6\n" /* Update bytes left */
+ "tst r6, #1\n"
+ "beq 6f\n"
+ "vst1.8 {d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
+ "6:\n"
+ "tst r6, #2\n"
+ "beq 6f\n"
+ "vst1.16 {d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
+ "6:\n"
+ "tst r6, #4\n"
+ "beq 6f\n"
+ "vst1.32 {d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
+ "6:\n"
+ "tst r6, #8\n"
+ "beq 2f\n"
+ "vst1.64 {d0}, [r4, :64]!\n" /* Store qword now we're 64-bit aligned */
+
+ /* The good case: We're 128-bit aligned for this scanline */
+ "2:\n"
+ "and r6, r5, #15\n" /* Number of tailing bytes */
+ "cmp r5, r6\n" /* Do we have at least one qword to write? */
+ "beq 6f\n" /* No, we just write the tail */
+ "lsr r5, r5, #4\n" /* This many full qwords to write */
+
+ /* The main block: Do 128-bit aligned writes */
+ "3:\n"
+ "subs r5, r5, #1\n"
+ "vst1.64 {d0,d1}, [r4, :128]!\n"
+ "bne 3b\n"
+
+ /* Handle the trailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
+ We know that we're currently at a 128-bit aligned address, so we can just
+ pick the biggest operations that the remaining write width allows */
+ "6:\n"
+ "cmp r6, #0\n"
+ "beq 4f\n"
+ "tst r6, #8\n"
+ "beq 6f\n"
+ "vst1.64 {d0}, [r4, :64]!\n"
+ "6:\n"
+ "tst r6, #4\n"
+ "beq 6f\n"
+ "vst1.32 {d0[0]}, [r4, :32]!\n"
+ "6:\n"
+ "tst r6, #2\n"
+ "beq 6f\n"
+ "vst1.16 {d0[0]}, [r4, :16]!\n"
+ "6:\n"
+ "tst r6, #1\n"
+ "beq 4f\n"
+ "vst1.8 {d0[0]}, [r4]!\n"
+ "4:\n"
+
+ /* Handle the next scanline */
+ "subs %[height], %[height], #1\n"
+ "add %[dst], %[dst], %[byte_stride]\n"
+ "bne 1b\n"
+ "5:\n"
+ : /* No output members */
+ : [color] "r" (color), [height] "r" (height), [width] "r" (width),
+ [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
+ : "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
+
+ return TRUE;
+
+#else
+
+ // TODO: intrinsic version for armcc
+ return FALSE;
+
+#endif
+}
+
+#ifdef USE_GCC_INLINE_ASM
+
+// TODO: is a more generic way of doing this being introduced?
+#define NEON_SCANLINE_BUFFER_PIXELS (1024)
+
+static inline void QuadwordCopy_neon(
+ void* dst,
+ void* src,
+ uint32_t count, // of quadwords
+ uint32_t trailerCount // of bytes
+)
+{
+ // Uses aligned multi-register loads to maximise read bandwidth
+ // on uncached memory such as framebuffers
+ // The accesses do not have the aligned qualifiers, so that the copy
+ // may convert between aligned-uncached and unaligned-cached memory.
+ // It is assumed that the CPU can infer alignedness from the address.
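+ // count is in 16-byte quadwords: the main loop moves eight at a time and
+ // the bit tests below mop up the remaining 0-7; trailerCount (0-15 bytes)
+ // is handled by the plain C stores at the end.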
+ asm volatile (
+ " cmp %[count], #8 \n"
+ " blt 1f @ skip oversized fragments \n"
+ "0: @ start with eight quadwords at a time \n"
+ " sub %[count], %[count], #8 \n"
+ " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
+ " vld1.8 {d24,d25,d26,d27}, [%[src]]! \n"
+ " vld1.8 {d28,d29,d30,d31}, [%[src]]! \n"
+ " cmp %[count], #8 \n"
+ " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
+ " vst1.8 {d24,d25,d26,d27}, [%[dst]]! \n"
+ " vst1.8 {d28,d29,d30,d31}, [%[dst]]! \n"
+ " bge 0b \n"
+ "1: @ four quadwords \n"
+ " tst %[count], #4 \n"
+ " beq 2f @ skip oversized fragment \n"
+ " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vld1.8 {d20,d21,d22,d23}, [%[src]]! \n"
+ " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
+ " vst1.8 {d20,d21,d22,d23}, [%[dst]]! \n"
+ "2: @ two quadwords \n"
+ " tst %[count], #2 \n"
+ " beq 3f @ skip oversized fragment \n"
+ " vld1.8 {d16,d17,d18,d19}, [%[src]]! \n"
+ " vst1.8 {d16,d17,d18,d19}, [%[dst]]! \n"
+ "3: @ one quadword \n"
+ " tst %[count], #1 \n"
+ " beq 4f @ skip oversized fragment \n"
+ " vld1.8 {d16,d17}, [%[src]]! \n"
+ " vst1.8 {d16,d17}, [%[dst]]! \n"
+ "4: @ end \n"
+
+ // Clobbered input registers marked as input/outputs
+ : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+
+ // No unclobbered inputs
+ :
+
+ // Clobbered vector registers
+ // NB: these are the quad aliases of the double registers used in the asm
+ : "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+ );
+
+ if(trailerCount) {
+ uint8_t *tDst = dst, *tSrc = src;
+
+ while(trailerCount >= 4) {
+ *((uint32_t*) tDst) = *((uint32_t*) tSrc);
+ tDst += 4;
+ tSrc += 4;
+ trailerCount -= 4;
+ }
+
+ if(trailerCount >= 2) {
+ *((uint16_t*) tDst) = *((uint16_t*) tSrc);
+ tDst += 2;
+ tSrc += 2;
+ trailerCount -= 2;
+ }
+
+ if(trailerCount) {
+ *tDst++ = *tSrc++;
+ trailerCount--;
+ }
+ }
+}
+
+#endif // USE_GCC_INLINE_ASM
+
static const FastPathInfo arm_neon_fast_path_array[] =
{
{ PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 },
{ PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565neon, 0 },
+ { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565neon, 0 },
{ PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 },
{ PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 },
{ PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 },
{ PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 },
+#ifdef USE_GCC_INLINE_ASM
+// { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_16x16neon, 0 },
+// { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_16x16neon, 0 },
+#endif
{ PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK },
{ PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565neon, 0 },
- { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 },
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 },
@@ -1462,10 +2106,13 @@ pixman_blt_neon (
int width, int height)
{
-#if 0 // Relies on code which isn't upstreamed yet
+ if(!width || !height)
+ return TRUE;
+
+#ifdef USE_GCC_INLINE_ASM
- // accelerate only straight copies
- if(src_bpp != dst_bpp || (src_bpp & 7) || !width || !height)
+ // accelerate only straight copies involving complete bytes
+ if(src_bpp != dst_bpp || (src_bpp & 7))
return FALSE;
{
diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h
index aed7a4d..2442ea6 100644
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 42503fc..1204a36 100644