[PATCH] Delete now-unused old NEON format-converting blitter.

Jonathan Morton jmorton at sd070.hel.movial.fi
Tue Jun 2 05:21:37 PDT 2009


---
 pixman/pixman-arm-neon.c |  107 ----------------------------------------------
 1 files changed, 0 insertions(+), 107 deletions(-)

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index c2a0e03..46b872f 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -438,113 +438,6 @@ fbCompositeSrc_8888x8888neon (
 
 
 void
-fbCompositeSrc_x888x0565neon (
-                          pixman_implementation_t * impl,
-                          pixman_op_t op,
-                          pixman_image_t * pSrc,
-                          pixman_image_t * pMask,
-                          pixman_image_t * pDst,
-                          int32_t      xSrc,
-                          int32_t      ySrc,
-                          int32_t      xMask,
-                          int32_t      yMask,
-                          int32_t      xDst,
-                          int32_t      yDst,
-                          int32_t     width,
-                          int32_t     height)
-{
-    uint16_t    *dstLine, *dst;
-    uint32_t    *srcLine, *src;
-    int dstStride, srcStride;
-    uint32_t    w;
-
-    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
-    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
-
-    if (width>=8)
-    {
-        while (height--)
-        {
-            dst = dstLine;
-            dstLine += dstStride;
-            src = srcLine;
-            srcLine += srcStride;
-            w = width;
-
-	    do {
-	        while (w>=8)
-	        {
-#ifndef USE_GCC_INLINE_ASM
-	            vst1q_u16(dst, pack0565(vld4_u8((void*)src)));
-#else
-                    asm volatile (
-                        "vld4.8       {d4-d7}, [%[src]]\n\t"
-                        "vshll.u8     q0, d6, #8\n\t"
-                        "vshll.u8     q1, d5, #8\n\t"
-                        "vsriq.u16    q0, q1, #5\t\n"
-                        "vshll.u8     q1, d4, #8\n\t"
-                        "vsriq.u16    q0, q1, #11\t\n"
-                        "vst1.16      {q0}, [%[dst]]\n\t"
-                        :
-                        : [dst] "r" (dst), [src] "r" (src)
-                        : "memory", "d0","d1","d2","d3","d4","d5","d6","d7"
-                        );
-#endif
-	            src+=8;
-	            dst+=8;
-	            w-=8;
-          	}
-                if (w != 0)
-                {
-                    src -= (8-w);
-                    dst -= (8-w);
-                    w = 8;  // do another vector
-                }
-            } while (w!=0);
-        }
-    }
-    else
-    {
-        // Handle width<8
-        while (height--)
-        {
-            dst = dstLine;
-            dstLine += dstStride;
-            src = srcLine;
-            srcLine += srcStride;
-            w = width;
-
-	    while (w>=2)
-	    {
-	        uint32x2_t sval, rgb, g, b;
-	        sval = vld1_u32(src);
-	        rgb = vshr_n_u32(sval,8-5); // r (5 bits)
-	        g = vshr_n_u32(sval,8+8-6);  // g to bottom byte
-	        rgb = vsli_n_u32(rgb, g, 5);
-	        b = vshr_n_u32(sval,8+8+8-5);  // b to bottom byte
-                rgb = vsli_n_u32(rgb, b, 11);
-	        vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0);
-	        vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),2);
-	        src+=2;
-	        w-=2;
-	    }
-            if (w)
-            {
-                uint32x2_t sval, rgb, g, b;
-                sval = vld1_dup_u32(src);
-                rgb = vshr_n_u32(sval,8-5); // r (5 bits)
-                g = vshr_n_u32(sval,8+8-6);  // g to bottom byte
-                rgb = vsli_n_u32(rgb, g, 5);
-                b = vshr_n_u32(sval,8+8+8-5);  // b to bottom byte
-                rgb = vsli_n_u32(rgb, b, 11);
-                vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0);
-            }
-	}
-    }
-}
-
-
-void
 fbCompositeSrc_8888x8x8888neon (
                                pixman_implementation_t * impl,
                                pixman_op_t op,
-- 
1.5.6.3
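
For readers less familiar with the NEON idiom involved: the blitter deleted above and its replacement in the attached patch (fbCompositeSrc_24x16neon) both pack x8r8g8b8 pixels into r5g6b5 with the same widen-then-shift-right-insert sequence. A minimal standalone sketch of that packing step, using illustrative names rather than pixman's own helpers:

#include <arm_neon.h>
#include <stdint.h>

/* Pack 8 x8r8g8b8 pixels into 8 r5g6b5 pixels.  vld4 de-interleaves the
 * source into separate b, g, r, a planes; vshll puts each 8-bit channel
 * into the top byte of a 16-bit lane; vsri then shifts the lower channels
 * right and inserts them beneath the bits already in place. */
static inline void pack8_x888_to_0565 (const uint32_t *src, uint16_t *dst)
{
    uint8x8x4_t px  = vld4_u8 ((const uint8_t *) src);  /* val[0]=b, val[1]=g, val[2]=r, val[3]=a */
    uint16x8_t  r   = vshll_n_u8 (px.val[2], 8);        /* rrrrrrrr 00000000 */
    uint16x8_t  g   = vshll_n_u8 (px.val[1], 8);
    uint16x8_t  b   = vshll_n_u8 (px.val[0], 8);
    uint16x8_t  out = vsriq_n_u16 (r, g, 5);            /* rrrrr gggggggg 000 */
    out = vsriq_n_u16 (out, b, 11);                     /* rrrrr gggggg bbbbb */
    vst1q_u16 (dst, out);
}

The attached assembler version processes sixteen pixels per iteration and interleaves the vshll/vsri sequences of the two 8-pixel halves to hide instruction latency; the sketch above shows only the data transformation itself.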


[Attachment: 0001-First-patch-to-send-upstream.patch (text/x-patch, UTF-8)]

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5453dbb..6158925 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2009 ARM Ltd
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -21,6 +21,8 @@
  * SOFTWARE.
  *
  * Author:  Ian Rickards (ian.rickards at arm.com) 
+ * Author:  Jonathan Morton (jonathan.morton at movial.com)
+ * Author:  Markku Vire (markku.vire at movial.com)
  *
  */
 
@@ -31,7 +33,12 @@
 #include "pixman-arm-neon.h"
 
 #include <arm_neon.h>
+#include <string.h>
 
+// Deal with an intrinsic that is defined differently in GCC
+#if !defined(__ARMCC_VERSION) && !defined(__pld)
+#define __pld(_x) __builtin_prefetch(_x)
+#endif
 
 static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb)
 {
@@ -1391,22 +1398,659 @@ fbCompositeSrcAdd_8888x8x8neon (
     }
 }
 
+#ifdef USE_GCC_INLINE_ASM
+
+void
+fbCompositeSrc_16x16neon (
+	pixman_implementation_t * impl,
+	pixman_op_t op,
+	pixman_image_t * pSrc,
+	pixman_image_t * pMask,
+	pixman_image_t * pDst,
+	int32_t      xSrc,
+	int32_t      ySrc,
+	int32_t      xMask,
+	int32_t      yMask,
+	int32_t      xDst,
+	int32_t      yDst,
+	int32_t      width,
+	int32_t      height)
+{
+	uint16_t    *dstLine, *srcLine;
+	uint32_t     dstStride, srcStride;
+
+	if(!height || !width)
+		return;
+
+	/* We simply copy 16-bit-aligned pixels from one place to another. */
+	fbComposeGetStart (pSrc, xSrc, ySrc, uint16_t, srcStride, srcLine, 1);
+	fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+	/* Preload the first input scanline */
+	{
+		uint16_t *srcPtr = srcLine;
+		uint32_t count = width;
+
+		asm volatile (
+		"0: @ loop							\n"
+		"	subs    %[count], %[count], #32				\n"
+		"	pld     [%[src]]					\n"
+		"	add     %[src], %[src], #64				\n"
+		"	bgt 0b							\n"
+
+		// Clobbered input registers marked as input/outputs
+		: [src] "+r" (srcPtr), [count] "+r" (count)
+		: // no unclobbered inputs
+		: "cc"
+		);
+	}
+
+	while(height--) {
+		uint16_t *dstPtr = dstLine;
+		uint16_t *srcPtr = srcLine;
+		uint32_t count = width;
+		uint32_t tmp = 0;
+
+		// Uses multi-register access and preloading to maximise bandwidth.
+		// Each pixel is one halfword, so a quadword contains 8px.
+		// Preload frequency assumes a 64-byte cache line.
+		asm volatile (
+		"	cmp       %[count], #64				\n"
+		"	blt 1f    @ skip oversized fragments		\n"
+		"0: @ start with eight quadwords at a time		\n"
+		"	pld       [%[src], %[srcStride], LSL #1]	\n" // preload from next scanline
+		"	sub       %[count], %[count], #64		\n"
+		"	vld1.16   {d16,d17,d18,d19}, [%[src]]!		\n"
+		"	vld1.16   {d20,d21,d22,d23}, [%[src]]!		\n"
+		"	pld       [%[src], %[srcStride], LSL #1]	\n" // preload from next scanline
+		"	vld1.16   {d24,d25,d26,d27}, [%[src]]!		\n"
+		"	vld1.16   {d28,d29,d30,d31}, [%[src]]!		\n"
+		"	cmp       %[count], #64				\n"
+		"	vst1.16   {d16,d17,d18,d19}, [%[dst]]!		\n"
+		"	vst1.16   {d20,d21,d22,d23}, [%[dst]]!		\n"
+		"	vst1.16   {d24,d25,d26,d27}, [%[dst]]!		\n"
+		"	vst1.16   {d28,d29,d30,d31}, [%[dst]]!		\n"
+		"	bge 0b						\n"
+		"	cmp       %[count], #0				\n"
+		"	beq 7f    @ aligned fastpath			\n"
+		"1: @ four quadwords					\n"
+		"	tst       %[count], #32				\n"
+		"	beq 2f    @ skip oversized fragment		\n"
+		"	pld       [%[src], %[srcStride], LSL #1]	\n" // preload from next scanline
+		"	vld1.16   {d16,d17,d18,d19}, [%[src]]!		\n"
+		"	vld1.16   {d20,d21,d22,d23}, [%[src]]!		\n"
+		"	vst1.16   {d16,d17,d18,d19}, [%[dst]]!		\n"
+		"	vst1.16   {d20,d21,d22,d23}, [%[dst]]!		\n"
+		"2: @ two quadwords					\n"
+		"	tst       %[count], #16				\n"
+		"	beq 3f    @ skip oversized fragment		\n"
+		"	pld       [%[src], %[srcStride], LSL #1]	\n" // preload from next scanline
+		"	vld1.16   {d16,d17,d18,d19}, [%[src]]!		\n"
+		"	vst1.16   {d16,d17,d18,d19}, [%[dst]]!		\n"
+		"3: @ one quadword					\n"
+		"	tst       %[count], #8				\n"
+		"	beq 4f    @ skip oversized fragment		\n"
+		"	vld1.16   {d16,d17}, [%[src]]!			\n"
+		"	vst1.16   {d16,d17}, [%[dst]]!			\n"
+		"4: @ one doubleword					\n"
+		"	tst       %[count], #4				\n"
+		"	beq 5f    @ skip oversized fragment		\n"
+		"	vld1.16   {d16}, [%[src]]!			\n"
+		"	vst1.16   {d16}, [%[dst]]!			\n"
+		"5: @ one word						\n"
+		"	tst       %[count], #2				\n"
+		"	beq 6f    @ skip oversized fragment		\n"
+		"	ldr       %[tmp], [%[src]], #4			\n"
+		"	str       %[tmp], [%[dst]], #4			\n"
+		"6: @ one halfword					\n"
+		"	tst       %[count], #1				\n"
+		"	beq 7f    @ skip oversized fragment		\n"
+		"	ldrh      %[tmp], [%[src]]			\n"
+		"	strh      %[tmp], [%[dst]]			\n"
+		"7: @ end						\n"
+
+		// Clobbered input registers marked as input/outputs
+		: [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count), [tmp] "+r" (tmp)
+
+		// Unclobbered input
+		: [srcStride] "r" (srcStride)
+
+		// Clobbered vector registers
+		// NB: these are the quad aliases of the double registers used in the asm
+		: "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+		);
+
+		srcLine += srcStride;
+		dstLine += dstStride;
+	}
+}
+
+#endif /* USE_GCC_INLINE_ASM */
+
+void
+fbCompositeSrc_24x16neon (
+	pixman_implementation_t * impl,
+	pixman_op_t op,
+	pixman_image_t * pSrc,
+	pixman_image_t * pMask,
+	pixman_image_t * pDst,
+	int32_t      xSrc,
+	int32_t      ySrc,
+	int32_t      xMask,
+	int32_t      yMask,
+	int32_t      xDst,
+	int32_t      yDst,
+	int32_t      width,
+	int32_t      height)
+{
+	uint16_t    *dstLine;
+	uint32_t    *srcLine;
+	uint32_t     dstStride, srcStride;
+
+	if(!width || !height)
+		return;
+
+	/* We simply copy pixels from one place to another, assuming that the source's alpha is opaque. */
+	fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+	fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+	/* Preload the first input scanline */
+	{
+		uint8_t *srcPtr = (uint8_t*) srcLine;
+		uint32_t count = (width + 15) / 16;
+
+#ifdef USE_GCC_INLINE_ASM
+		asm volatile (
+		"0: @ loop						\n"
+		"	subs    %[count], %[count], #1			\n"
+		"	pld     [%[src]]				\n"
+		"	add     %[src], %[src], #64			\n"
+		"	bgt 0b						\n"
+
+		// Clobbered input registers marked as input/outputs
+		: [src] "+r" (srcPtr), [count] "+r" (count)
+		: // no unclobbered inputs
+		: "cc"
+		);
+#else
+		do {
+			__pld(srcPtr);
+			srcPtr += 64;
+		} while(--count);
+#endif
+	}
+
+	while(height--) {
+		uint16_t *dstPtr = dstLine;
+		uint32_t *srcPtr = srcLine;
+		uint32_t count = width;
+		const uint32_t RBmask = 0x1F;
+		const uint32_t Gmask = 0x3F;
+
+		// If you're going to complain about a goto, take a long hard look
+		// at the massive blocks of assembler this skips over.  ;-)
+		if(count < 8)
+			goto smallStuff;
+
+#ifdef USE_GCC_INLINE_ASM
+
+		// This is not as aggressive as the RGB565-source case.
+		// Generally the source is in cached RAM when the formats are different, so we use preload.
+		// We don't need to blend, so we are not reading from the uncached framebuffer.
+		asm volatile (
+		"	cmp       %[count], #16										\n"
+		"	blt 1f    @ skip oversized fragments								\n"
+		"0: @ start with sixteen pixels at a time								\n"
+		"	sub       %[count], %[count], #16								\n"
+		"	pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline			\n"
+		"	vld4.8    {d0,d1,d2,d3}, [%[src]]!		@ d3 is alpha and ignored, d2-0 are rgb.	\n"
+		"	vld4.8    {d4,d5,d6,d7}, [%[src]]!		@ d7 is alpha and ignored, d6-4 are rgb.	\n"
+		"	vshll.u8  q8, d2, #8				@ expand first red for repacking		\n"
+		"	vshll.u8  q10, d1, #8				@ expand first green for repacking		\n"
+		"	vshll.u8  q11, d0, #8				@ expand first blue for repacking		\n"
+		"	vshll.u8  q9, d6, #8				@ expand second red for repacking		\n"
+		"	vsri.u16  q8, q10, #5				@ insert first green after red			\n"
+		"	vshll.u8  q10, d5, #8				@ expand second green for repacking		\n"
+		"	vsri.u16  q8, q11, #11				@ insert first blue after green			\n"
+		"	vshll.u8  q11, d4, #8				@ expand second blue for repacking		\n"
+		"	vsri.u16  q9, q10, #5				@ insert second green after red			\n"
+		"	vsri.u16  q9, q11, #11				@ insert second blue after green		\n"
+		"	cmp       %[count], #16										\n"
+		"	vst1.16   {d16,d17,d18,d19}, [%[dst]]!          @ store 16 pixels				\n"
+		"	bge 0b												\n"
+		"1: @ end of main loop	\n"
+		"	cmp       %[count], #8				@ can we still do an 8-pixel block?		\n"
+		"	blt 2f												\n"
+		"	sub       %[count], %[count], #8	\n"
+		"	pld      [%[src], %[srcStride], lsl #2]         @ preload from next scanline			\n"
+		"	vld4.8    {d0,d1,d2,d3}, [%[src]]!		@ d3 is alpha and ignored, d2-0 are rgb.	\n"
+		"	vshll.u8  q8, d2, #8				@ expand first red for repacking		\n"
+		"	vshll.u8  q10, d1, #8				@ expand first green for repacking		\n"
+		"	vshll.u8  q11, d0, #8				@ expand first blue for repacking		\n"
+		"	vsri.u16  q8, q10, #5				@ insert first green after red			\n"
+		"	vsri.u16  q8, q11, #11				@ insert first blue after green			\n"
+		"	vst1.16   {d16,d17}, [%[dst]]!          @ store 8 pixels				\n"
+		"2: @ end												\n"
+
+		// Clobbered input and working registers marked as input/outputs
+		: [dst] "+r" (dstPtr), [src] "+r" (srcPtr), [count] "+r" (count)
+
+		// Unclobbered input
+		: [srcStride] "r" (srcStride)
+
+		// Clobbered vector registers
+		// NB: these are the quad aliases of the double registers used in the asm
+		: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "cc", "memory"
+		);
+#else
+		// A copy of the above code, in intrinsics-form.
+		// This should be pretty self-documenting...
+		while(count >= 16) {
+			uint8x8x4_t pixelSetA, pixelSetB;
+			uint16x8_t redA, greenA, blueA;
+			uint16x8_t redB, greenB, blueB;
+			uint16x8_t destPixelsA, destPixelsB;
+
+			count -= 16;
+			__pld(srcPtr + srcStride);
+			pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+			pixelSetB = vld4_u8((uint8_t*)(srcPtr+8));
+			srcPtr += 16;
+
+			redA   = vshll_n_u8(pixelSetA.val[2], 8);
+			greenA = vshll_n_u8(pixelSetA.val[1], 8);
+			blueA  = vshll_n_u8(pixelSetA.val[0], 8);
+			redB   = vshll_n_u8(pixelSetB.val[2], 8);
+			greenB = vshll_n_u8(pixelSetB.val[1], 8);
+			blueB  = vshll_n_u8(pixelSetB.val[0], 8);
+			destPixelsA = vsriq_n_u16(redA, greenA, 5);
+			destPixelsB = vsriq_n_u16(redB, greenB, 5);
+			destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+			destPixelsB = vsriq_n_u16(destPixelsB, blueB, 11);
+
+			// There doesn't seem to be an intrinsic for the double-quadword variant
+			vst1q_u16(dstPtr  , destPixelsA);
+			vst1q_u16(dstPtr+8, destPixelsB);
+			dstPtr += 16;
+		}
+
+		// 8-pixel loop
+		if(count >= 8) {
+			uint8x8x4_t pixelSetA;
+			uint16x8_t redA, greenA, blueA;
+			uint16x8_t destPixelsA;
+
+			__pld(srcPtr + srcStride);
+			count -= 8;
+			pixelSetA = vld4_u8((uint8_t*)(srcPtr));
+			srcPtr += 8;
+
+			redA   = vshll_n_u8(pixelSetA.val[2], 8);
+			greenA = vshll_n_u8(pixelSetA.val[1], 8);
+			blueA  = vshll_n_u8(pixelSetA.val[0], 8);
+			destPixelsA = vsriq_n_u16(redA, greenA, 5);
+			destPixelsA = vsriq_n_u16(destPixelsA, blueA, 11);
+
+			vst1q_u16(dstPtr  , destPixelsA);
+			dstPtr += 8;
+		}
+
+#endif	// USE_GCC_INLINE_ASM
+
+	smallStuff:
+
+		if(count)
+			__pld(srcPtr + srcStride);
+
+		while(count >= 2) {
+			uint32_t srcPixelA = *srcPtr++;
+			uint32_t srcPixelB = *srcPtr++;
+
+			// ARM is really good at shift-then-ALU ops.
+			// This should be a total of six shift-ANDs and five shift-ORs.
+			uint32_t dstPixelsA;
+			uint32_t dstPixelsB;
+
+			dstPixelsA  = ((srcPixelA >>  3) & RBmask);
+			dstPixelsA |= ((srcPixelA >> 10) &  Gmask) << 5;
+			dstPixelsA |= ((srcPixelA >> 19) & RBmask) << 11;
+
+			dstPixelsB  = ((srcPixelB >>  3) & RBmask);
+			dstPixelsB |= ((srcPixelB >> 10) &  Gmask) << 5;
+			dstPixelsB |= ((srcPixelB >> 19) & RBmask) << 11;
+
+			// little-endian mode only
+			*((uint32_t*) dstPtr) = dstPixelsA | (dstPixelsB << 16);
+			dstPtr += 2;
+			count -= 2;
+		}
+
+		if(count) {
+			uint32_t srcPixel = *srcPtr++;
+
+			// ARM is really good at shift-then-ALU ops.
+			// This block should end up as three shift-ANDs and two shift-ORs.
+			uint32_t tmpBlue  = (srcPixel >>  3) & RBmask;
+			uint32_t tmpGreen = (srcPixel >> 10) & Gmask;
+			uint32_t tmpRed   = (srcPixel >> 19) & RBmask;
+			uint16_t dstPixel = (tmpRed << 11) | (tmpGreen << 5) | tmpBlue;
+
+			*dstPtr++ = dstPixel;
+			count--;
+		}
+
+		srcLine += srcStride;
+		dstLine += dstStride;
+	}
+}
+
+
+pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+		  int stride,
+		  int bpp,
+		  int x,
+		  int y,
+		  int width,
+		  int height,
+		  uint32_t _xor)
+{
+    uint32_t byte_stride, color;
+    char *dst;
+
+    /* stride is always a multiple of 32-bit units in pixman */
+    byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+	case 8:
+	    dst = ((char *) bits) + y * byte_stride + x;
+	    _xor &= 0xff;
+	    color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
+	    break;
+	case 16:
+	    dst = ((char *) bits) + y * byte_stride + x * 2;
+	    _xor &= 0xffff;
+	    color = _xor << 16 | _xor;
+	    width *= 2;     /* width to bytes */
+	    break;
+	case 32:
+	    dst = ((char *) bits) + y * byte_stride + x * 4;
+	    color = _xor;
+	    width *= 4;     /* width to bytes */
+	    break;
+	default:
+	    return FALSE;
+    }
+
+#ifdef USE_GCC_INLINE_ASM
+    if (width < 16)
+	/* We have a special case for such small widths that don't allow
+	   us to use wide 128-bit stores anyway. We don't waste time
+	   trying to align writes, since there are only very few of them anyway */
+	asm volatile (
+	"cmp		%[height], #0\n" /* Check if empty fill */
+	"beq		3f\n"
+	"vdup.32	d0, %[color]\n"  /* Fill the color to neon req */
+
+	/* Check whether the width can be handled by a single store
+	   operation per scanline. This significantly reduces the number
+	   of test/branch instructions per scanline */
+	"cmp		%[width], #8\n"
+	"beq		4f\n"
+	"cmp		%[width], #4\n"
+	"beq		5f\n"
+	"cmp		%[width], #2\n"
+	"beq		6f\n"
+
+	/* Loop starts here for each scanline */
+	"1:\n"
+	"mov		r4, %[dst]\n"    /* Starting address of the current line */
+	"tst		%[width], #8\n"
+	"beq		2f\n"
+	"vst1.8		{d0}, [r4]!\n"
+	"2:\n"
+	"tst		%[width], #4\n"
+	"beq		2f\n"
+	"str		%[color], [r4]!\n"
+	"2:\n"
+	"tst		%[width], #2\n"
+	"beq		2f\n"
+	"strh		%[color], [r4]!\n"
+	"2:\n"
+	"tst		%[width], #1\n"
+	"beq		2f\n"
+	"strb		%[color], [r4]!\n"
+	"2:\n"
+
+	"subs		%[height], %[height], #1\n"
+	"add		%[dst], %[dst], %[byte_stride]\n"
+	"bne		1b\n"
+	"b		3f\n"
+
+	/* Special fillers for those widths that we can do with single operation */
+	"4:\n"
+	"subs		%[height], %[height], #1\n"
+	"vst1.8		{d0}, [%[dst]]\n"
+	"add		%[dst], %[dst], %[byte_stride]\n"
+	"bne		4b\n"
+	"b		3f\n"
+
+	"5:\n"
+	"subs		%[height], %[height], #1\n"
+	"str		%[color], [%[dst]]\n"
+	"add		%[dst], %[dst], %[byte_stride]\n"
+	"bne		5b\n"
+	"b		3f\n"
+
+	"6:\n"
+	"subs		%[height], %[height], #1\n"
+	"strh		%[color], [%[dst]]\n"
+	"add		%[dst], %[dst], %[byte_stride]\n"
+	"bne		6b\n"
+
+	"3:\n"
+	: /* No output members */
+	: [color] "r" (color), [height] "r" (height), [width] "r" (width),
+	  [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
+	: "memory", "cc", "d0", "r4", "r5");
+    else
+	asm volatile (
+	"cmp		%[height], #0\n" /* Check if empty fill */
+	"beq		5f\n"
+	"vdup.32	q0, %[color]\n"  /* Fill the color to neon req */
+
+	/* Loop starts here for each scanline */
+	"1:\n"
+	"mov		r4, %[dst]\n"    /* Starting address of the current line */
+	"mov		r5, %[width]\n"  /* We're going to write this many bytes */
+	"ands		r6, r4, #15\n"   /* Are we at the 128-bit aligned address? */
+	"beq		2f\n"            /* Jump to the best case */
+
+	/* We're not 128-bit aligned: However, we know that we can get to the
+	   next aligned location, since the fill is at least 16 bytes wide */
+	"rsb 		r6, r6, #16\n"   /* We would need to go forward this much */
+	"sub		r5, r5, r6\n"    /* Update bytes left */
+	"tst		r6, #1\n"
+	"beq		6f\n"
+	"vst1.8		{d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
+	"6:\n"
+	"tst		r6, #2\n"
+	"beq		6f\n"
+	"vst1.16	{d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
+	"6:\n"
+	"tst		r6, #4\n"
+	"beq		6f\n"
+	"vst1.32	{d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
+	"6:\n"
+	"tst		r6, #8\n"
+	"beq		2f\n"
+	"vst1.64	{d0}, [r4, :64]!\n"    /* Store qword now we're 64-bit aligned */
+
+	/* The good case: We're 128-bit aligned for this scanline */
+	"2:\n"
+	"and		r6, r5, #15\n"        /* Number of tailing bytes */
+	"cmp		r5, r6\n"             /* Do we have at least one qword to write? */
+	"beq		6f\n"                 /* No, we just write the tail */
+	"lsr		r5, r5, #4\n"         /* This many full qwords to write */
+
+	/* The main block: Do 128-bit aligned writes */
+	"3:\n"
+	"subs		r5, r5, #1\n"
+	"vst1.64	{d0,d1}, [r4, :128]!\n"
+	"bne		3b\n"
+
+	/* Handle the tailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
+	    We know that we're currently at 128-bit aligned address, so we can just
+	    pick the biggest operations that the remaining write width allows */
+	"6:\n"
+	"cmp		r6, #0\n"
+	"beq		4f\n"
+	"tst		r6, #8\n"
+	"beq		6f\n"
+	"vst1.64	{d0}, [r4, :64]!\n"
+	"6:\n"
+	"tst		r6, #4\n"
+	"beq		6f\n"
+	"vst1.32	{d0[0]}, [r4, :32]!\n"
+	"6:\n"
+	"tst		r6, #2\n"
+	"beq		6f\n"
+	"vst1.16	{d0[0]}, [r4, :16]!\n"
+	"6:\n"
+	"tst		r6, #1\n"
+	"beq		4f\n"
+	"vst1.8		{d0[0]}, [r4]!\n"
+	"4:\n"
+
+	/* Handle the next scanline */
+	"subs		%[height], %[height], #1\n"
+	"add		%[dst], %[dst], %[byte_stride]\n"
+	"bne		1b\n"
+	"5:\n"
+	: /* No output members */
+	: [color] "r" (color), [height] "r" (height), [width] "r" (width),
+	  [dst] "r" (dst) , [byte_stride] "r" (byte_stride)
+	: "memory", "cc", "q0", "d0", "d1", "r4", "r5", "r6");
+
+    return TRUE;
+
+#else
+
+    // TODO: intrinsic version for armcc
+    return FALSE;
+
+#endif
+}
+
+#ifdef USE_GCC_INLINE_ASM
+
+// TODO: is there a more generic way of doing this being introduced?
+#define NEON_SCANLINE_BUFFER_PIXELS (1024)
+
+static inline void QuadwordCopy_neon(
+	void* dst,
+	void* src,
+	uint32_t count,       // of quadwords
+	uint32_t trailerCount // of bytes
+)
+{
+	// Uses aligned multi-register loads to maximise read bandwidth
+	// on uncached memory such as framebuffers
+	// The accesses do not have the aligned qualifiers, so that the copy
+	// may convert between aligned-uncached and unaligned-cached memory.
+	// It is assumed that the CPU can infer alignedness from the address.
+	asm volatile (
+	"	cmp       %[count], #8						\n"
+	"	blt 1f    @ skip oversized fragments		\n"
+	"0: @ start with eight quadwords at a time		\n"
+	"	sub       %[count], %[count], #8			\n"
+	"	vld1.8    {d16,d17,d18,d19}, [%[src]]!		\n"
+	"	vld1.8    {d20,d21,d22,d23}, [%[src]]!		\n"
+	"	vld1.8    {d24,d25,d26,d27}, [%[src]]!		\n"
+	"	vld1.8    {d28,d29,d30,d31}, [%[src]]!		\n"
+	"	cmp       %[count], #8						\n"
+	"	vst1.8    {d16,d17,d18,d19}, [%[dst]]!		\n"
+	"	vst1.8    {d20,d21,d22,d23}, [%[dst]]!		\n"
+	"	vst1.8    {d24,d25,d26,d27}, [%[dst]]!		\n"
+	"	vst1.8    {d28,d29,d30,d31}, [%[dst]]!		\n"
+	"	bge 0b										\n"
+	"1: @ four quadwords							\n"
+	"	tst       %[count], #4						\n"
+	"	beq 2f    @ skip oversized fragment			\n"
+	"	vld1.8    {d16,d17,d18,d19}, [%[src]]!		\n"
+	"	vld1.8    {d20,d21,d22,d23}, [%[src]]!		\n"
+	"	vst1.8    {d16,d17,d18,d19}, [%[dst]]!		\n"
+	"	vst1.8    {d20,d21,d22,d23}, [%[dst]]!		\n"
+	"2: @ two quadwords								\n"
+	"	tst       %[count], #2						\n"
+	"	beq 3f    @ skip oversized fragment			\n"
+	"	vld1.8    {d16,d17,d18,d19}, [%[src]]!		\n"
+	"	vst1.8    {d16,d17,d18,d19}, [%[dst]]!		\n"
+	"3: @ one quadword								\n"
+	"	tst       %[count], #1						\n"
+	"	beq 4f    @ skip oversized fragment			\n"
+	"	vld1.8    {d16,d17}, [%[src]]!				\n"
+	"	vst1.8    {d16,d17}, [%[dst]]!				\n"
+	"4: @ end										\n"
+
+	// Clobbered input registers marked as input/outputs
+	: [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+
+	// No unclobbered inputs
+	:
+
+	// Clobbered vector registers
+	// NB: these are the quad aliases of the double registers used in the asm
+	: "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"
+	);
+
+	if(trailerCount) {
+		uint8_t *tDst = dst, *tSrc = src;
+
+		while(trailerCount >= 4) {
+			*((uint32_t*) tDst) = *((uint32_t*) tSrc);
+			tDst += 4;
+			tSrc += 4;
+			trailerCount -= 4;
+		}
+
+		if(trailerCount >= 2) {
+			*((uint16_t*) tDst) = *((uint16_t*) tSrc);
+			tDst += 2;
+			tSrc += 2;
+			trailerCount -= 2;
+		}
+
+		if(trailerCount) {
+			*tDst++ = *tSrc++;
+			trailerCount--;
+		}
+	}
+}
+
+#endif  // USE_GCC_INLINE_ASM
+
 static const FastPathInfo arm_neon_fast_path_array[] = 
 {
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       fbCompositeSrcAdd_8000x8000neon,       0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon,     0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon,     0 },
     { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_x888x0565neon,          0 },
     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_x888x0565neon,          0 },
+#ifdef USE_GCC_INLINE_ASM
+//  { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_16x16neon,              0 },
+//  { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_16x16neon,              0 },
+#endif
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon,        NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fbCompositeSolidMask_nx8x0565neon,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fbCompositeSolidMask_nx8x0565neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon,     0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon,     0 },
@@ -1462,10 +2106,13 @@ pixman_blt_neon (
 	int width, int height)
 {
 
-#if 0  // Relies on code which isn't upstreamed yet
+	if(!width || !height)
+		return TRUE;
+
+#ifdef USE_GCC_INLINE_ASM
 
-	// accelerate only straight copies
-	if(src_bpp != dst_bpp || (src_bpp & 7) || !width || !height)
+	// accelerate only straight copies involving complete bytes
+	if(src_bpp != dst_bpp || (src_bpp & 7))
 		return FALSE;
 
 	{
diff --git a/pixman/pixman-arm-neon.h b/pixman/pixman-arm-neon.h
index aed7a4d..2442ea6 100644
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 42503fc..1204a36 100644
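
As a quick cross-check of the scalar tail in fbCompositeSrc_24x16neon above (the "smallStuff" path), the per-pixel conversion it performs can be written as a standalone helper; the name here is illustrative and not part of pixman:

#include <stdint.h>

/* Convert one x8r8g8b8 pixel to r5g6b5 by keeping the top 5/6/5 bits of
 * red, green and blue -- three shift-ANDs and two shift-ORs, matching the
 * comment in the patch. */
static inline uint16_t x888_to_0565 (uint32_t s)
{
    uint32_t r = (s >> 19) & 0x1f;   /* source bits 23..19 */
    uint32_t g = (s >> 10) & 0x3f;   /* source bits 15..10 */
    uint32_t b = (s >>  3) & 0x1f;   /* source bits  7..3  */
    return (uint16_t) ((r << 11) | (g << 5) | b);
}

For example, x888_to_0565 (0x00ff0000) yields 0xf800 (pure red); the two-pixel loop in the patch simply does this twice and merges the results into one 32-bit little-endian store.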
