[PATCH] Replace Ian's glyph-blitter with a better one.
Jonathan Morton
jmorton at sd070.hel.movial.fi
Mon Jun 15 06:14:00 PDT 2009
---
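Notes for reviewers, not meant for the commit log: the new SolidOver565_8pix_neon kernel composites a solid ARGB8888 colour over an RGB565 destination through an 8-bit glyph mask, eight pixels per iteration. For anyone reading the NEON below without an ARM reference handy, the per-pixel math is roughly the scalar sketch that follows; the function name is invented for illustration, and like the kernel it approximates division by 255 with >>8 shifts rather than exact rounding.

#include <stdint.h>

/* Illustrative scalar equivalent of one pixel of SolidOver565_8pix_neon.
 * src_argb is the solid glyph colour (ARGB8888), dst_565 the framebuffer
 * pixel, mask the 8-bit glyph coverage at this pixel. */
static uint16_t
solid_over_565_1pix (uint32_t src_argb, uint16_t dst_565, uint8_t mask)
{
    /* Scale the glyph mask by the colour's alpha (vmull.u8 + vshrn #8). */
    uint32_t m  = (((src_argb >> 24) & 0xFF) * mask) >> 8;
    uint32_t nm = 255 - m;                         /* inverse mask (vmvn) */

    /* Expand the 565 destination to 888 by replicating the top bits
     * (the vshrn/vsli/vsri sequence in the kernel). */
    uint32_t dr = (dst_565 >> 11) & 0x1F;  dr = (dr << 3) | (dr >> 2);
    uint32_t dg = (dst_565 >>  5) & 0x3F;  dg = (dg << 2) | (dg >> 4);
    uint32_t db =  dst_565        & 0x1F;  db = (db << 3) | (db >> 2);

    /* Blend in 8.8 fixed point: dst*(255-m) + src*m (vmull.u8 + vmlal.u8). */
    uint32_t r = dr * nm + ((src_argb >> 16) & 0xFF) * m;
    uint32_t g = dg * nm + ((src_argb >>  8) & 0xFF) * m;
    uint32_t b = db * nm + ( src_argb        & 0xFF) * m;

    /* Keep the top bits of each 8.8 accumulator and pack back to 565
     * (the vsri.16 #5 / #11 pair in the kernel). */
    return (uint16_t) ((r & 0xF800) | ((g >> 5) & 0x07E0) | (b >> 11));
}

The kernel does the same thing eight pixels at a time, inserting the green and blue accumulators into the red one with vsri.16 so that no separate narrowing pass is needed before the 16-bit store.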
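Also for reviewers: the alignment bookkeeping in fbCompositeSolidMask_nx8x0565neon decides whether the 8-pixel kernel runs over the quadword-aligned copy of the scanline or starting from the first real pixel. The same computation, pulled out standalone with worked examples in the comment (addresses are hypothetical, the helper name is invented), looks like this:

#include <stdint.h>

/* The same bookkeeping as in the patch, isolated for illustration.
 * Worked examples:
 *   dstLine = 0x1006, width = 25: alignedLeft = 0x1000, alignedRight = 0x1040,
 *     ceilingLength = 0x40, copyOffset = 3, copyCount = 4; both spans are
 *     equally tight, so the kernel runs aligned (kernelCount = 4, kernelOffset = 0).
 *   dstLine = 0x100E, width = 8: the aligned span is 32 bytes but ceilingLength
 *     is only 16, so the kernel runs unaligned (kernelCount = 1, kernelOffset = 7). */
static void
compute_copy_bounds (uint16_t *dstLine, uint32_t width,
                     uint8_t *copyOffset, uint32_t *copyCount,
                     uint8_t *kernelOffset, uint32_t *kernelCount)
{
    unsigned long alignedLeft   = (unsigned long) dstLine & ~0xFUL;
    unsigned long alignedRight  = ((unsigned long) (dstLine + width) + 0xF) & ~0xFUL;
    unsigned long ceilingLength = ((unsigned long) width * sizeof (*dstLine) + 0xF) & ~0xFUL;

    *copyOffset = (uint8_t) (dstLine - (uint16_t *) alignedLeft);
    *copyCount  = (uint32_t) ((alignedRight - alignedLeft) >> 4);

    if (alignedRight - alignedLeft > ceilingLength) {
        /* the unaligned span is tighter, so run the kernel from the first real pixel */
        *kernelCount  = (uint32_t) (ceilingLength >> 4);
        *kernelOffset = *copyOffset;
    } else {
        /* the aligned span is just as tight, so it is safer to start aligned */
        *kernelCount  = *copyCount;
        *kernelOffset = 0;
    }
}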
pixman/pixman-arm-neon.c | 515 ++++++++++++++++------------------------------
1 files changed, 178 insertions(+), 337 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 467a0dd..1aefb5a 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -632,343 +632,6 @@ fbCompositeSrc_8888x8x8888neon (
}
-
-void
-fbCompositeSolidMask_nx8x0565neon (
- pixman_implementation_t * impl,
- pixman_op_t op,
- pixman_image_t * pSrc,
- pixman_image_t * pMask,
- pixman_image_t * pDst,
- int32_t xSrc,
- int32_t ySrc,
- int32_t xMask,
- int32_t yMask,
- int32_t xDst,
- int32_t yDst,
- int32_t width,
- int32_t height)
-{
- uint32_t src, srca;
- uint16_t *dstLine, *dst;
- uint8_t *maskLine, *mask;
- int dstStride, maskStride;
- uint32_t w;
- uint8x8_t sval2;
- uint8x8x4_t sval8;
-
- fbComposeGetSolid(pSrc, src, pDst->bits.format);
-
- srca = src >> 24;
- if (src == 0)
- return;
-
- sval2=vreinterpret_u8_u32(vdup_n_u32(src));
- sval8.val[0]=vdup_lane_u8(sval2,0);
- sval8.val[1]=vdup_lane_u8(sval2,1);
- sval8.val[2]=vdup_lane_u8(sval2,2);
- sval8.val[3]=vdup_lane_u8(sval2,3);
-
- fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
-
- if (width>=8)
- {
- // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
- while (height--)
- {
- uint16_t *keep_dst=0;
-
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
-#ifndef USE_GCC_INLINE_ASM
- uint8x8_t alpha;
- uint16x8_t dval, temp;
- uint8x8x4_t sval8temp;
-
- alpha = vld1_u8((void*)mask);
- dval = vld1q_u16((void*)dst);
- keep_dst = dst;
-
- sval8temp = neon8mul(sval8,alpha);
- temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
- mask += (w & 7);
- dst += (w & 7);
- w -= (w & 7);
-
- while (w)
- {
- dval = vld1q_u16((void*)dst);
- alpha = vld1_u8((void*)mask);
-
- vst1q_u16((void*)keep_dst,temp);
- keep_dst = dst;
-
- sval8temp = neon8mul(sval8,alpha);
- temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
- mask+=8;
- dst+=8;
- w-=8;
- }
- vst1q_u16((void*)keep_dst,temp);
-#else
- asm volatile (
- "vdup.32 d0, %[src]\n\t"
- "vdup.8 d1, d0[1]\n\t"
- "vdup.8 d2, d0[2]\n\t"
- "vdup.8 d3, d0[3]\n\t"
- "vdup.8 d0, d0[0]\n\t"
-
- "vld1.8 {q12}, [%[dst]]\n\t"
- "vld1.8 {d31}, [%[mask]]\n\t"
- "mov %[keep_dst], %[dst]\n\t"
-
- "and ip, %[w], #7\n\t"
- "add %[mask], %[mask], ip\n\t"
- "add %[dst], %[dst], ip, LSL#1\n\t"
- "subs %[w], %[w], ip\n\t"
- "b 9f\n\t"
-// LOOP
- "2:\n\t"
-
- "vld1.16 {q12}, [%[dst]]!\n\t"
- "vld1.8 {d31}, [%[mask]]!\n\t"
- "vst1.16 {q10}, [%[keep_dst]]\n\t"
- "sub %[keep_dst], %[dst], #8*2\n\t"
- "subs %[w], %[w], #8\n\t"
- "9:\n\t"
-// expand 0565 q12 to 8888 {d4-d7}
- "vmovn.u16 d4, q12\t\n"
- "vshr.u16 q11, q12, #5\t\n"
- "vshr.u16 q10, q12, #6+5\t\n"
- "vmovn.u16 d5, q11\t\n"
- "vmovn.u16 d6, q10\t\n"
- "vshl.u8 d4, d4, #3\t\n"
- "vshl.u8 d5, d5, #2\t\n"
- "vshl.u8 d6, d6, #3\t\n"
- "vsri.u8 d4, d4, #5\t\n"
- "vsri.u8 d5, d5, #6\t\n"
- "vsri.u8 d6, d6, #5\t\n"
-
- "vmull.u8 q10, d31, d0\n\t"
- "vmull.u8 q11, d31, d1\n\t"
- "vmull.u8 q12, d31, d2\n\t"
- "vmull.u8 q13, d31, d3\n\t"
- "vrshr.u16 q8, q10, #8\n\t"
- "vrshr.u16 q9, q11, #8\n\t"
- "vraddhn.u16 d20, q10, q8\n\t"
- "vraddhn.u16 d21, q11, q9\n\t"
- "vrshr.u16 q9, q13, #8\n\t"
- "vrshr.u16 q8, q12, #8\n\t"
- "vraddhn.u16 d23, q13, q9\n\t"
- "vraddhn.u16 d22, q12, q8\n\t"
-
-// duplicate in 4/2/1 & 8pix vsns
- "vmvn.8 d30, d23\n\t"
- "vmull.u8 q14, d30, d6\n\t"
- "vmull.u8 q13, d30, d5\n\t"
- "vmull.u8 q12, d30, d4\n\t"
- "vrshr.u16 q8, q14, #8\n\t"
- "vrshr.u16 q9, q13, #8\n\t"
- "vraddhn.u16 d6, q14, q8\n\t"
- "vrshr.u16 q8, q12, #8\n\t"
- "vraddhn.u16 d5, q13, q9\n\t"
- "vqadd.u8 d6, d6, d22\n\t" // moved up
- "vraddhn.u16 d4, q12, q8\n\t"
-// intentionally don't calculate alpha
-// result in d4-d6
-
-// "vqadd.u8 d6, d6, d22\n\t" ** moved up
- "vqadd.u8 d5, d5, d21\n\t"
- "vqadd.u8 d4, d4, d20\n\t"
-
-// pack 8888 {d20-d23} to 0565 q10
- "vshll.u8 q10, d6, #8\n\t"
- "vshll.u8 q3, d5, #8\n\t"
- "vshll.u8 q2, d4, #8\n\t"
- "vsri.u16 q10, q3, #5\t\n"
- "vsri.u16 q10, q2, #11\t\n"
-
- "bne 2b\n\t"
-
- "1:\n\t"
- "vst1.16 {q10}, [%[keep_dst]]\n\t"
-
- : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
- : [src] "r" (src)
- : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
- "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
- "d30","d31"
- );
-#endif
- }
- }
- else
- {
- while (height--)
- {
- void *dst4=0, *dst2=0;
-
- dst = dstLine;
- dstLine += dstStride;
- mask = maskLine;
- maskLine += maskStride;
- w = width;
-
-
-#ifndef USE_GCC_INLINE_ASM
- uint8x8_t alpha;
- uint16x8_t dval, temp;
- uint8x8x4_t sval8temp;
-
- if (w&4)
- {
- alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1));
- dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1));
- dst4=dst;
- mask+=4;
- dst+=4;
- }
- if (w&2)
- {
- alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1));
- dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1));
- dst2=dst;
- mask+=2;
- dst+=2;
- }
- if (w&1)
- {
- alpha = vld1_lane_u8((void*)mask,alpha,1);
- dval = vld1q_lane_u16((void*)dst,dval,1);
- }
-
- sval8temp = neon8mul(sval8,alpha);
- temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
- if (w&1)
- vst1q_lane_u16((void*)dst,temp,1);
- if (w&2)
- vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1);
- if (w&4)
- vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1);
-#else
- asm volatile (
- "vdup.32 d0, %[src]\n\t"
- "vdup.8 d1, d0[1]\n\t"
- "vdup.8 d2, d0[2]\n\t"
- "vdup.8 d3, d0[3]\n\t"
- "vdup.8 d0, d0[0]\n\t"
-
- "tst %[w], #4\t\n"
- "beq skip_load4\t\n"
-
- "vld1.64 {d25}, [%[dst]]\n\t"
- "vld1.32 {d31[1]}, [%[mask]]\n\t"
- "mov %[dst4], %[dst]\t\n"
- "add %[mask], %[mask], #4\t\n"
- "add %[dst], %[dst], #4*2\t\n"
-
- "skip_load4:\t\n"
- "tst %[w], #2\t\n"
- "beq skip_load2\t\n"
- "vld1.32 {d24[1]}, [%[dst]]\n\t"
- "vld1.16 {d31[1]}, [%[mask]]\n\t"
- "mov %[dst2], %[dst]\t\n"
- "add %[mask], %[mask], #2\t\n"
- "add %[dst], %[dst], #2*2\t\n"
-
- "skip_load2:\t\n"
- "tst %[w], #1\t\n"
- "beq skip_load1\t\n"
- "vld1.16 {d24[1]}, [%[dst]]\n\t"
- "vld1.8 {d31[1]}, [%[mask]]\n\t"
-
- "skip_load1:\t\n"
-// expand 0565 q12 to 8888 {d4-d7}
- "vmovn.u16 d4, q12\t\n"
- "vshr.u16 q11, q12, #5\t\n"
- "vshr.u16 q10, q12, #6+5\t\n"
- "vmovn.u16 d5, q11\t\n"
- "vmovn.u16 d6, q10\t\n"
- "vshl.u8 d4, d4, #3\t\n"
- "vshl.u8 d5, d5, #2\t\n"
- "vshl.u8 d6, d6, #3\t\n"
- "vsri.u8 d4, d4, #5\t\n"
- "vsri.u8 d5, d5, #6\t\n"
- "vsri.u8 d6, d6, #5\t\n"
-
- "vmull.u8 q10, d31, d0\n\t"
- "vmull.u8 q11, d31, d1\n\t"
- "vmull.u8 q12, d31, d2\n\t"
- "vmull.u8 q13, d31, d3\n\t"
- "vrshr.u16 q8, q10, #8\n\t"
- "vrshr.u16 q9, q11, #8\n\t"
- "vraddhn.u16 d20, q10, q8\n\t"
- "vraddhn.u16 d21, q11, q9\n\t"
- "vrshr.u16 q9, q13, #8\n\t"
- "vrshr.u16 q8, q12, #8\n\t"
- "vraddhn.u16 d23, q13, q9\n\t"
- "vraddhn.u16 d22, q12, q8\n\t"
-
-// duplicate in 4/2/1 & 8pix vsns
- "vmvn.8 d30, d23\n\t"
- "vmull.u8 q14, d30, d6\n\t"
- "vmull.u8 q13, d30, d5\n\t"
- "vmull.u8 q12, d30, d4\n\t"
- "vrshr.u16 q8, q14, #8\n\t"
- "vrshr.u16 q9, q13, #8\n\t"
- "vraddhn.u16 d6, q14, q8\n\t"
- "vrshr.u16 q8, q12, #8\n\t"
- "vraddhn.u16 d5, q13, q9\n\t"
- "vqadd.u8 d6, d6, d22\n\t" // moved up
- "vraddhn.u16 d4, q12, q8\n\t"
-// intentionally don't calculate alpha
-// result in d4-d6
-
-// "vqadd.u8 d6, d6, d22\n\t" ** moved up
- "vqadd.u8 d5, d5, d21\n\t"
- "vqadd.u8 d4, d4, d20\n\t"
-
-// pack 8888 {d20-d23} to 0565 q10
- "vshll.u8 q10, d6, #8\n\t"
- "vshll.u8 q3, d5, #8\n\t"
- "vshll.u8 q2, d4, #8\n\t"
- "vsri.u16 q10, q3, #5\t\n"
- "vsri.u16 q10, q2, #11\t\n"
-
- "tst %[w], #1\n\t"
- "beq skip_store1\t\n"
- "vst1.16 {d20[1]}, [%[dst]]\t\n"
- "skip_store1:\t\n"
- "tst %[w], #2\n\t"
- "beq skip_store2\t\n"
- "vst1.32 {d20[1]}, [%[dst2]]\t\n"
- "skip_store2:\t\n"
- "tst %[w], #4\n\t"
- "beq skip_store4\t\n"
- "vst1.16 {d21}, [%[dst4]]\t\n"
- "skip_store4:\t\n"
-
- : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
- : [src] "r" (src)
- : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
- "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
- "d30","d31"
- );
-#endif
- }
- }
-}
-
-
-
void
fbCompositeSolidMask_nx8x8888neon (
pixman_implementation_t * impl,
@@ -1964,6 +1627,184 @@ static inline void QuadwordCopy_neon(
}
}
+static inline void SolidOver565_8pix_neon(
+ uint32_t glyphColour,
+ uint16_t *dest,
+ uint8_t *inMask,
+ uint32_t destStride, // bytes, not elements
+ uint32_t maskStride,
+ uint32_t count // 8-pixel groups
+)
+{
+ // Inner loop of glyph blitter (solid colour, alpha mask)
+
+#ifdef USE_GCC_INLINE_ASM
+
+ asm volatile (
+ " vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]] @ splat solid colour components \n"
+ "0: @ loop \n"
+ " vld1.16 {d0,d1}, [%[dest]] @ load first pixels from framebuffer \n"
+ " vld1.8 {d17}, [%[inMask]] @ load alpha mask of glyph \n"
+ " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n"
+ " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n"
+ " vmvn d18, d17 @ we need the inverse mask for the background \n"
+ " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n"
+ " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n"
+ " vshrn.u16 d4, q0, #3 @ unpack green \n"
+ " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n"
+ " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n"
+ " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n"
+ " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n"
+ " vmull.u8 q2, d4, d18 @ ...green... \n"
+ " vmull.u8 q3, d6, d18 @ ...blue \n"
+ " subs %[count], %[count], #1 @ decrement/test loop counter \n"
+ " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n"
+ " vmlal.u8 q2, d17, d21 @ ...green... \n"
+ " vmlal.u8 q3, d17, d20 @ ...blue \n"
+ " add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait \n"
+ " vsri.16 q1, q2, #5 @ pack green behind red \n"
+ " vsri.16 q1, q3, #11 @ pack blue into pixels \n"
+ " vst1.16 {d2,d3}, [%[dest]] @ store composited pixels \n"
+ " add %[dest], %[dest], %[destStride] @ advance framebuffer pointer \n"
+ " bne 0b @ next please \n"
+
+ // Clobbered registers marked as input/outputs
+ : [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count)
+
+ // Inputs
+ : [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour)
+
+ // Clobbers, including the inputs we modify, and potentially lots of memory
+ : "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
+ );
+
+#else
+
+ uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour);
+
+ while(count--)
+ {
+ uint16x8_t pixels = vld1q_u16(dest);
+ uint8x8_t mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8);
+ uint8x8_t iMask = vmvn_u8(mask);
+
+ uint8x8_t tRed = vshrn_n_u16(pixels, 8);
+ uint8x8_t tGreen = vshrn_n_u16(pixels, 3);
+ uint8x8_t tBlue = vshrn_n_u16(vsliq_n_u16(pixels, pixels, 5), 2); // quadword shift-insert: pixels are 16-bit
+
+ uint16x8_t sRed = vmull_u8(vsri_n_u8(tRed , tRed , 5), iMask);
+ uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask);
+ uint16x8_t sBlue = vmull_u8( tBlue , iMask);
+
+ sRed = vmlal_u8(sRed , mask, solidColour.val[2]);
+ sGreen = vmlal_u8(sGreen, mask, solidColour.val[1]);
+ sBlue = vmlal_u8(sBlue , mask, solidColour.val[0]);
+
+ pixels = vsriq_n_u16(sRed, sGreen, 5);
+ pixels = vsriq_n_u16(pixels, sBlue, 11);
+ vst1q_u16(dest, pixels);
+
+ dest = (uint16_t*)((uint8_t*) dest + destStride); // destStride is in bytes
+ inMask += maskStride;
+ }
+
+#endif
+}
+
+void
+fbCompositeSolidMask_nx8x0565neon (
+ pixman_implementation_t * impl,
+ pixman_op_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int32_t xSrc,
+ int32_t ySrc,
+ int32_t xMask,
+ int32_t yMask,
+ int32_t xDst,
+ int32_t yDst,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t src, srca;
+ uint16_t *dstLine, *alignedLine;
+ uint8_t *maskLine;
+ uint32_t dstStride, maskStride;
+ uint32_t kernelCount, copyCount;
+ uint8_t kernelOffset, copyOffset;
+
+ fbComposeGetSolid(pSrc, src, pDst->bits.format);
+
+ // bail out if fully transparent or degenerate
+ srca = src >> 24;
+ if(srca == 0)
+ return;
+ if(width == 0 || height == 0)
+ return;
+
+ if(width > NEON_SCANLINE_BUFFER_PIXELS) {
+ // split the blit, so we can use a fixed-size scanline buffer
+ // TODO: there must be a more elegant way of doing this.
+ int x;
+ for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
+ fbCompositeSolidMask_nx8x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
+ (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
+ }
+ return;
+ }
+
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+
+ // cover the span with the minimum number of aligned quadwords,
+ // while also keeping the number of columns to process to a minimum
+ {
+ unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
+ unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
+ unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
+
+ // the fast copy must always be quadword aligned
+ copyOffset = dstLine - ((uint16_t*) alignedLeft);
+ alignedLine = dstLine - copyOffset;
+ copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+
+ if(alignedRight - alignedLeft > ceilingLength) {
+ // unaligned routine is tightest, and will not overrun
+ kernelCount = (uint32_t) (ceilingLength >> 4);
+ kernelOffset = copyOffset;
+ } else {
+ // aligned routine is equally tight, so it is safer to align
+ kernelCount = copyCount;
+ kernelOffset = 0;
+ }
+ }
+
+ {
+ uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
+ uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8];
+ int y = height;
+
+ // row-major order
+ // left edge, middle block, right edge
+ for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) {
+ // We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
+ QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF);
+
+ // Uncached framebuffer access is really, really slow if we do it piecemeal.
+ // It should be much faster if we grab it all at once.
+ // One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
+ QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+
+ // Apply the actual filter
+ SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount);
+
+ // Copy the modified scanline back
+ QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
+ }
+ }
+}
+
static const FastPathInfo arm_neon_fast_path_array[] =
{
{ PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 },
--
1.5.6.3