[PATCH] Avoid overrunning scanlines in NEON blitters.
Jonathan Morton
jmorton at sd070.hel.movial.fi
Wed Jul 1 06:29:48 PDT 2009
---
pixman/pixman-arm-neon.c | 63 +++++++++++++++++++++++++++++++++++++--------
1 files changed, 51 insertions(+), 12 deletions(-)
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 681c294..13ad48c 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1731,7 +1731,7 @@ fbCompositeSolidMask_nx8x0565neon (
uint16_t *dstLine, *alignedLine;
uint8_t *maskLine;
uint32_t dstStride, maskStride;
- uint32_t kernelCount, copyCount;
+ uint32_t kernelCount, copyCount, copyTail;
uint8_t kernelOffset, copyOffset;
src = _pixman_image_get_solid(pSrc, pDst->bits.format);
@@ -1764,13 +1764,14 @@ fbCompositeSolidMask_nx8x0565neon (
unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
- // the fast copy must always be quadword aligned
+ // the fast copy should be quadword aligned
copyOffset = dstLine - ((uint16_t*) alignedLeft);
alignedLine = dstLine - copyOffset;
copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+ copyTail = 0;
if(alignedRight - alignedLeft > ceilingLength) {
- // unaligned routine is tightest, and will not overrun
+ // unaligned routine is tightest
kernelCount = (uint32_t) (ceilingLength >> 4);
kernelOffset = copyOffset;
} else {
@@ -1778,6 +1779,18 @@ fbCompositeSolidMask_nx8x0565neon (
kernelCount = copyCount;
kernelOffset = 0;
}
+
+ // We should avoid reading beyond scanline ends for safety
+ if(alignedLine < (dstLine - xDst) ||
+ (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - xDst) + pDst->bits.width))
+ {
+ // switch to precise read
+ copyOffset = kernelOffset = 0;
+ alignedLine = dstLine;
+ kernelCount = (uint32_t) (ceilingLength >> 4);
+ copyCount = (width * sizeof(*dstLine)) >> 4;
+ copyTail = (width * sizeof(*dstLine)) & 0xF;
+ }
}
{
@@ -1794,7 +1807,7 @@ fbCompositeSolidMask_nx8x0565neon (
// Uncached framebuffer access is really, really slow if we do it piecemeal.
// It should be much faster if we grab it all at once.
// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
- QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+ QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
// Apply the actual filter
SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount);
@@ -1872,7 +1885,7 @@ fbCompositeSolid_nx0565neon (
uint32_t src, srca;
uint16_t *dstLine, *alignedLine;
uint32_t dstStride;
- uint32_t kernelCount, copyCount;
+ uint32_t kernelCount, copyCount, copyTail;
uint8_t kernelOffset, copyOffset;
src = _pixman_image_get_solid(pSrc, pDst->bits.format);
@@ -1904,13 +1917,14 @@ fbCompositeSolid_nx0565neon (
unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
- // the fast copy must always be quadword aligned
+ // the fast copy should be quadword aligned
copyOffset = dstLine - ((uint16_t*) alignedLeft);
alignedLine = dstLine - copyOffset;
copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+ copyTail = 0;
if(alignedRight - alignedLeft > ceilingLength) {
- // unaligned routine is tightest, and will not overrun
+ // unaligned routine is tightest
kernelCount = (uint32_t) (ceilingLength >> 4);
kernelOffset = copyOffset;
} else {
@@ -1918,6 +1932,18 @@ fbCompositeSolid_nx0565neon (
kernelCount = copyCount;
kernelOffset = 0;
}
+
+ // We should avoid reading beyond scanline ends for safety
+ if(alignedLine < (dstLine - xDst) ||
+ (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - xDst) + pDst->bits.width))
+ {
+ // switch to precise read
+ copyOffset = kernelOffset = 0;
+ alignedLine = dstLine;
+ kernelCount = (uint32_t) (ceilingLength >> 4);
+ copyCount = (width * sizeof(*dstLine)) >> 4;
+ copyTail = (width * sizeof(*dstLine)) & 0xF;
+ }
}
{
@@ -1930,7 +1956,7 @@ fbCompositeSolid_nx0565neon (
// Uncached framebuffer access is really, really slow if we do it piecemeal.
// It should be much faster if we grab it all at once.
// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
- QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+ QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
// Apply the actual filter
PlainOver565_8pix_neon(src, scanLine + kernelOffset, 8 * sizeof(*dstLine), kernelCount);
@@ -2002,7 +2028,7 @@ fbCompositeOver_8888x0565neon (
uint32_t *srcLine;
uint16_t *dstLine, *alignedLine;
uint32_t dstStride, srcStride;
- uint32_t kernelCount, copyCount;
+ uint32_t kernelCount, copyCount, copyTail;
uint8_t kernelOffset, copyOffset;
// we assume mask is opaque
@@ -2028,13 +2054,14 @@ fbCompositeOver_8888x0565neon (
unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
- // the fast copy must always be quadword aligned
+ // the fast copy should be quadword aligned
copyOffset = dstLine - ((uint16_t*) alignedLeft);
alignedLine = dstLine - copyOffset;
copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+ copyTail = 0;
if(alignedRight - alignedLeft > ceilingLength) {
- // unaligned routine is tightest, and will not overrun
+ // unaligned routine is tightest
kernelCount = (uint32_t) (ceilingLength >> 4);
kernelOffset = copyOffset;
} else {
@@ -2042,6 +2069,18 @@ fbCompositeOver_8888x0565neon (
kernelCount = copyCount;
kernelOffset = 0;
}
+
+ // We should avoid reading beyond scanline ends for safety
+ if(alignedLine < (dstLine - xDst) ||
+ (alignedLine + (copyCount * 16 / sizeof(*dstLine))) > ((dstLine - xDst) + pDst->bits.width))
+ {
+ // switch to precise read
+ copyOffset = kernelOffset = 0;
+ alignedLine = dstLine;
+ kernelCount = (uint32_t) (ceilingLength >> 4);
+ copyCount = (width * sizeof(*dstLine)) >> 4;
+ copyTail = (width * sizeof(*dstLine)) & 0xF;
+ }
}
/* Preload the first input scanline */
@@ -2079,7 +2118,7 @@ fbCompositeOver_8888x0565neon (
// Uncached framebuffer access is really, really slow if we do it piecemeal.
// It should be much faster if we grab it all at once.
// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
- QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+ QuadwordCopy_neon(scanLine, alignedLine, copyCount, copyTail);
// Apply the actual filter
ARGB8_Over565_8pix_neon(srcLine, scanLine + kernelOffset, srcStride * sizeof(*srcLine), kernelCount);
--
1.5.6.3
--=-8eMPe7TZzY3oKCFTjv/u--
More information about the cairo mailing list