[cairo] pixman box filtering code prototype

Thu May 21 13:23:33 PDT 2009

On Mon, Nov 17, 2008 at 06:05:34PM -0500, Jeff Muizelaar wrote:
> On Mon, Nov 17, 2008 at 06:00:03PM -0500, Jeff Muizelaar wrote:
> > I've attached a rough cut of what this could look like.
> 
> Note: The new code is in pixman-rescale-mult.c

Here's a new version. It adds a lanczos downscaler and a bunch
of correctness fixes. It's basically feature complete and I plan on
cleaning it up enough to commit soon.

-Jeff
-------------- next part --------------

diff --git a/configure.ac b/configure.ac
index c555f25..f08fcd3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -106,6 +106,13 @@ if test "x$GCC" = "xyes"; then
   *) CFLAGS="$CFLAGS -Wall" ;;
   esac fi changequote([,])dnl
 
+changequote(,)dnl
+if test "x$GCC" = "xyes"; then
+  case " $CFLAGS " in
+  *[\ \	]-Wdeclaration-after-statement[\ \	]*) ;;
+  *) CFLAGS="$CFLAGS" ;;
+  esac fi changequote([,])dnl
+
 AC_PATH_PROG(PERL, perl, no)
 if test "x$PERL" = xno; then
     AC_MSG_ERROR([Perl is required to build pixman.])
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index b55daa0..19b531c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -33,6 +33,10 @@ libpixman_1_la_SOURCES =			\
 	pixman-trap.c				\
 	pixman-compute-region.c			\
 	pixman-timer.c				\
+	pixman-rescale-box.c			\
+	pixman-rescale-mult.c			\
+	pixman-rescale-mult-old.c		\
+	pixman-rescale-lanczos.c		\
 	pixman-matrix.c
 
 libpixmanincludedir = $(includedir)/pixman-1/
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 9d62f4a..4697130 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -453,6 +453,14 @@ pixman_image_set_source_clipping (pixman_image_t  *image,
     image_property_changed (image);
 }
 
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_has_source_clipping (pixman_image_t *image)
+{
+    image_common_t *common = &image->common;
+    /* TRUE when src_clip points somewhere other than this image's own full_region. */
+    return common->src_clip != &common->full_region;
+}
+
 /* Unlike all the other property setters, this function does not
  * copy the content of indexed. Doing this copying is simply
  * way, way too expensive.
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index b22785b..09a5762 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -37,6 +37,7 @@
 #include "pixman-arm-simd.h"
 #include "pixman-arm-neon.h"
 #include "pixman-combine32.h"
+#include "pixman-rescale.h"
 
 static void
 fbCompositeSrcScaleNearest (pixman_op_t     op,
@@ -412,6 +413,95 @@ pixman_optimize_operator(pixman_op_t op, pixman_image_t *pSrc, pixman_image_t *p
 
 }
 
+/* Build a new a8r8g8b8 image containing img resampled to scaled_width x scaled_height.
+ * The result inherits img's filter and repeat settings, plus a copy of img's transform
+ * passed through pixman_transform_unscale(); the caller owns it.  Returns NULL on failure. */
+pixman_image_t *create_downscaled_image(pixman_image_t *img, int scaled_width, int scaled_height)
+{
+    pixman_image_t *new_src = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+		            scaled_width, scaled_height, NULL, scaled_width * sizeof(uint32_t) /* row stride in bytes: one uint32_t per pixel, not one pointer */
+            );
+    if (!new_src)
+        goto fail;
+    /* A non-zero result from the filter is treated as failure and the new image is dropped. */
+    if (downscale_lanczos_filter(img, img->bits.width, img->bits.height,
+		scaled_width, scaled_height,
+		0, 0,
+		scaled_width, scaled_height,
+		img->bits.bits, img->bits.rowstride*sizeof(uint32_t),
+		new_src->bits.bits, new_src->bits.rowstride*sizeof(uint32_t)))
+        goto resize_fail;
+
+    pixman_image_set_filter (new_src, img->common.filter,
+		    img->common.filter_params,
+		    img->common.n_filter_params);
+    pixman_image_set_repeat (new_src, img->common.repeat);
+    /* NOTE(review): img->common.transform is dereferenced without a NULL check; callers must only pass transformed images. */
+    pixman_transform_t new_transform = *(img->common.transform);
+#ifdef unscaled_debug
+    printf("%x %x %x, %x %x %x\n", 
+             new_transform.matrix[0][0],
+             new_transform.matrix[0][1],
+             new_transform.matrix[0][2],
+             new_transform.matrix[1][0],
+             new_transform.matrix[1][1],
+             new_transform.matrix[1][2]);
+#endif
+    pixman_fixed_t scale_x, scale_y; /* filled in by pixman_extract_scale() but not read afterwards */
+    pixman_fixed_t applied_scale_x, applied_scale_y;
+    applied_scale_x = div_fixed(img->bits.width, scaled_width); /* original/scaled size ratio as 16.16 */
+    applied_scale_y = div_fixed(img->bits.height, scaled_height);
+    pixman_extract_scale (&new_transform, &scale_x, &scale_y);
+    pixman_transform_unscale (&new_transform, applied_scale_x, applied_scale_y); /* presumably removes the scale the resample already applied — confirm */
+#define unscaled_debug 0
+#if unscaled_debug
+    printf("%x %x %x, %x %x %x\n", 
+             new_transform.matrix[0][0],
+             new_transform.matrix[0][1],
+             new_transform.matrix[0][2],
+             new_transform.matrix[1][0],
+             new_transform.matrix[1][1],
+             new_transform.matrix[1][2]);
+#endif
+    pixman_image_set_transform (new_src, &new_transform);
+    return new_src;
+
+resize_fail:
+    pixman_image_unref(new_src);
+fail:
+    return NULL;
+}
+
+static pixman_bool_t
+is_translation_only(pixman_transform_t *transform) { /* TRUE when only matrix[0][2] and matrix[1][2] may differ from the identity; transform must be non-NULL */
+        pixman_fixed_t (*matrix)[3] = transform->matrix;
+        return matrix[0][0] == pixman_fixed_1 && /* diagonal of the upper-left 2x2 is exactly pixman_fixed_1 */
+                matrix[1][1] == pixman_fixed_1 &&
+                matrix[0][1] == 0 && /* off-diagonal terms of the upper-left 2x2 are zero */
+                matrix[1][0] == 0 &&
+                matrix[2][0] == 0 && /* bottom row is (0, 0, pixman_fixed_1) */
+                matrix[2][1] == 0 &&
+                matrix[2][2] == pixman_fixed_1;
+}
+
+static void
+simplify_translation(pixman_image_t *img, int16_t *offset_x, int16_t *offset_y)
+{ /* Folds a whole-pixel, translation-only transform of img into *offset_x / *offset_y and zeroes it in the transform. */
+    if (is_translation_only(img->common.transform)) {
+        pixman_fixed_t (*matrix)[3] = img->common.transform->matrix;
+        if ((matrix[0][2] & 0xffff) == 0 && (matrix[1][2] & 0xffff) == 0) { /* low 16 bits clear: neither offset has a fractional part */
+            int shift_x = pixman_fixed_to_int(matrix[0][2]);
+            int shift_y = pixman_fixed_to_int(matrix[1][2]);
+            *offset_x += shift_x; /* NOTE(review): the sum is stored back into an int16_t; a large shift would not fit */
+            *offset_y += shift_y;
+            pixman_transform_t new = *img->common.transform;
+            new.matrix[0][2] = 0;
+            new.matrix[1][2] = 0;
+            pixman_image_set_transform(img, &new); /* NOTE(review): rewrites the transform stored on img itself, which may be the caller's own image */
+        }
+    }
+}
+
 #if defined(USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 
 /*
@@ -430,6 +520,7 @@ pixman_optimize_operator(pixman_op_t op, pixman_image_t *pSrc, pixman_image_t *p
  * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
  */
 
+
 __attribute__((__force_align_arg_pointer__))
 #endif
 PIXMAN_EXPORT void
@@ -453,6 +544,8 @@ pixman_image_composite (pixman_op_t      op,
     pixman_bool_t srcAlphaMap = pSrc->common.alpha_map != NULL;
     pixman_bool_t maskAlphaMap = FALSE;
     pixman_bool_t dstAlphaMap = pDst->common.alpha_map != NULL;
+    pixman_bool_t downscaled = FALSE;
+    pixman_bool_t downscaledMask = FALSE;
     CompositeFunc func = NULL;
 
 #ifdef USE_MMX
@@ -474,6 +567,24 @@ pixman_image_composite (pixman_op_t      op,
 	srcTransform = FALSE;
     }
 
+
+    if (srcTransform && pSrc->type == BITS) {
+        int scaled_width;
+        int scaled_height;
+        pixman_get_scaled_size(pSrc->common.transform,
+                pSrc->bits.width, pSrc->bits.height,
+                &scaled_width, &scaled_height);
+        //printf("%d %d: %d %d\n", pSrc->bits.width, pSrc->bits.height, scaled_width, scaled_height);
+        if (scaled_width < pSrc->bits.width || scaled_height < pSrc->bits.height) {
+            //printf("downscale\n");
+            pSrc = create_downscaled_image(pSrc, scaled_width, scaled_height);
+            if (!pSrc)
+                return;
+            srcTransform = pSrc->common.transform != NULL;
+            downscaled = TRUE;
+        }
+    }
+
     if (pMask && pMask->type == BITS)
     {
 	maskRepeat = pMask->common.repeat == PIXMAN_REPEAT_NORMAL;
@@ -490,8 +601,78 @@ pixman_image_composite (pixman_op_t      op,
 	{
 	    maskTransform = FALSE;
 	}
+        if (maskTransform) {
+            int scaled_width;
+            int scaled_height;
+            pixman_get_scaled_size(pMask->common.transform,
+                    pMask->bits.width, pMask->bits.height,
+                    &scaled_width, &scaled_height);
+            //printf("%d %d: %d %d\n", pSrc->bits.width, pSrc->bits.height, scaled_width, scaled_height);
+            if (scaled_width < pMask->bits.width || scaled_height < pMask->bits.height) {
+                //printf("downscale\n");
+                pMask = create_downscaled_image(pMask, scaled_width, scaled_height);
+                if (!pMask)
+                    return;
+                maskTransform = pMask->common.transform != NULL;
+                downscaledMask = TRUE;
+            }
+        }
+    }
+
+    if (pSrc->type == BITS && pSrc->common.transform) {
+        simplify_translation(pSrc, &xSrc, &ySrc);
+        srcTransform = pSrc->common.transform != NULL;
     }
 
+    if (pMask && pMask->type == BITS && pMask->common.transform) {
+        simplify_translation(pMask, &xMask, &yMask);
+        maskTransform = pMask->common.transform != NULL;
+    }
+
+#if 0
+    if (((pSrc->type == BITS && pSrc->common.transform) || (pMask && pMask->type == BITS && pMask->common.transform))) {
+        pixman_fixed_t (*matrix)[3];
+        if (pSrc->common.transform) {
+            matrix = pSrc->common.transform->matrix;
+        } else {
+            matrix = pMask->common.transform->matrix;
+        }
+        if (matrix[0][0] == pixman_fixed_1 &&
+                matrix[1][1] == pixman_fixed_1 &&
+                matrix[0][1] == 0 &&
+                matrix[1][0] == 0 &&
+                matrix[2][0] == 0 &&
+                matrix[2][1] == 0 &&
+                matrix[2][2] == pixman_fixed_1) {
+            if ((matrix[0][2] & 0xffff) == 0 && (matrix[1][2] & 0xffff) == 0) {
+                int shift_x = pixman_fixed_to_int(matrix[0][2]);
+                int shift_y = pixman_fixed_to_int(matrix[1][2]);
+                xSrc += shift_x;
+                ySrc += shift_y;
+                xMask += shift_x;
+                yMask += shift_y;
+
+                pixman_transform_t new = *pSrc->common.transform;
+                new.matrix[0][2] = 0;
+                new.matrix[1][2] = 0;
+                if (pMask && pMask->common.transform) {
+                    pixman_image_set_transform(pSrc, &new);
+                    maskTransform = pMask->common.transform != NULL;
+                }
+                pixman_image_set_transform(pSrc, &new);
+                srcTransform = pSrc->common.transform != NULL;
+            } else {
+                printf("non integer trans: %x %x\n", matrix[0][2], matrix[1][2]);
+            }
+        } else {
+            printf("non integer scale\n");
+        }
+    } else {
+        printf("trans: %p\n", pSrc->common.transform);
+    }
+#endif
+    srcRepeat = pSrc->type == BITS && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL;
+
     /*
     * Check if we can replace our operator by a simpler one if the src or dest are opaque
     * The output operator should be mathematically equivalent to the source.
@@ -637,6 +818,10 @@ pixman_image_composite (pixman_op_t      op,
     pixman_walk_composite_region (op, pSrc, pMask, pDst, xSrc, ySrc,
 				  xMask, yMask, xDst, yDst, width, height,
 				  srcRepeat, maskRepeat, func);
+    if (downscaled)
+        pixman_image_unref(pSrc);
+    if (downscaledMask)
+        pixman_image_unref(pMask);
 }
 
 
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 927a1c4..7e0e276 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -810,6 +810,17 @@ pixman_rasterize_edges_accessors (pixman_image_t *image,
 pixman_bool_t
 pixman_image_is_opaque(pixman_image_t *image);
 
+void
+pixman_transform_unscale (pixman_transform_t *transform,
+			 int height, int width); /* NOTE(review): create_downscaled_image() passes (applied_scale_x, applied_scale_y) here — names look wrong; confirm against the definition */
+void
+pixman_get_scaled_size (pixman_transform_t *transform,
+        int height, int width, /* NOTE(review): pixman_image_composite() passes (bits.width, bits.height) in this position */
+        int *scaled_height, int *scaled_width); /* NOTE(review): callers pass (&scaled_width, &scaled_height) — confirm the intended order */
+
+void
+pixman_extract_scale (pixman_transform_t *transform,
+        int *scale_x, int *scale_y); /* the caller in pixman-pict.c passes pointers to pixman_fixed_t */
 pixman_bool_t
 pixman_image_can_get_solid (pixman_image_t *image);
 
@@ -893,4 +904,45 @@ void pixman_timer_register (PixmanTimer *timer);
 
 #endif /* PIXMAN_TIMING */
 
+/**
+ * mul_fixed:
+ * @a: first number to multiply in fixed-point 16.16 format
+ * @b: second number to multiply in fixed-point 16.16 format
+ * Returns the 64-bit product shifted right by 16 bits, narrowed to pixman_fixed_t.
+ * Author : Frederic Plourde 
+ **/
+static inline pixman_fixed_t
+mul_fixed (pixman_fixed_t a, pixman_fixed_t b)
+{
+    int64_t r = (int64_t)a * (int64_t)b; /* widen first so the intermediate product is computed in 64 bits */
+    return ( pixman_fixed_t )( r >> 16 );
+}
+
+/**
+ * div_fixed:
+ * @a: dividend in fixed-point 16.16 format
+ * @b: divisor  in fixed-point 16.16 format; must be non-zero
+ * Returns the quotient a / b in 16.16 format, truncated toward zero.
+ * Author : Frederic Plourde 
+ **/
+static inline pixman_fixed_t
+div_fixed (pixman_fixed_t a, pixman_fixed_t b)
+{
+    /* Scale by multiplying rather than shifting: left-shifting a negative
+     * signed value is undefined behaviour in C. */
+    int64_t a_64 = (int64_t)a * ((int64_t)1 << 32);
+    int64_t b_64 = (int64_t)b * ((int64_t)1 << 16);
+
+    int64_t div = a_64 / b_64;
+
+    return (pixman_fixed_t)div;
+}
+
+static inline pixman_fixed_t
+square_fixed (pixman_fixed_t a)
+{
+    return mul_fixed(a,a); /* a multiplied by itself through mul_fixed() */
+}
+
+
 #endif /* PIXMAN_PRIVATE_H */
diff --git a/pixman/pixman-rescale-box.c b/pixman/pixman-rescale-box.c
new file mode 100644
index 0000000..de45228
--- /dev/null
+++ b/pixman/pixman-rescale-box.c
@@ -0,0 +1,409 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ *
+ * Based on the code from "Fast Bitmap Stretching" by Tomas Möller
+ *
+ * Scaling code from Graphics Gems book III
+ *  http://www.acm.org/pubs/tog/GraphicsGems/gemsiii/fastBitmap.c
+ *
+ *  License states - "All code here can be used without restrictions."
+ *  http://www.acm.org/pubs/tog/GraphicsGems/
+ *
+ * Also inspired by Mozilla's gfx/src/imgScaler.c by
+ * Tim Rowley <tor at cs.brown.edu>
+ */
+
+/* The approach is inspired by Bresenham's line drawing algorithm
+ * A similar approach is used by SDL in its blitting code
+ * although they do have some infrastructure for dynamically
+ * generating code for a particular scaling.
+ *
+ * A more clever approach is used in Evas. It creates table of row indices
+ * and a table of column indices that map the destination coordinates
+ * to source pixels. This approach avoids having an additional conditional
+ * branch inside of the inner loop at the cost of having to precompute and
+ * access these tables.
+ *
+ *   --  Jeff Muizelaar (July 2008) */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+#define abs(a) ((a) > 0 ? (a) : -(a)) /* whole expansion parenthesized so it is safe inside larger expressions; evaluates a twice */
+#define sign(x) ((x)>0 ? 1:-1) /* note: yields -1 for zero as well as for negatives */
+
+void fetch_scanline(void *closure, int y, uint32_t *scanline)
+{   /* closure is the pixman_image_t to read; fills scanline[0 .. bits.width-1] with row y */
+    pixman_image_t *pict = closure;
+    bits_image_t *bits = &pict->bits; /* both fetchers are handed the bits image, as in pixman-rescale-lanczos.c */
+    if (!pixman_image_has_source_clipping(pict)) {
+        fetchProc32 fetch = ACCESS(pixman_fetchProcForPicture32)(bits);
+        fetch(bits, 0, y, bits->width, scanline); /* no source clip: fetch the whole row in one call */
+    } else {
+        int x;
+        fetchPixelProc32 fetch = ACCESS(pixman_fetchPixelProcForPicture32)(bits);
+        for (x=0; x<bits->width; x++) {
+            if (pixman_region32_contains_point (bits->common.src_clip, x, y,NULL))
+                scanline[x] = fetch (bits, x, y);
+            else
+                scanline[x] = 0; /* pixels outside the source clip are written as 0 */
+        }
+    }
+}
+
+
+/*
+When box filtering with an integer sized box we only ever have two box sizes.
+Proof:
+    Assume w1 > w2.
+    r = w1/w2
+
+    The size of the two boxes is
+    r1 = ceil(r), r2 = floor(r)
+
+    we want to find non-negative integers p and q such that:
+    w1 = p*r1 + q*r2
+
+    if r1 == r2 then
+        r is an integer and thus w2 evenly divides w1
+        therefore p = w2 and q = 0
+    otherwise r1 != r2 and
+        r1 = r2 + 1
+        w1 = p*(r2 + 1) + q * r2
+           = p*r2 + q*r2 + p
+           = (p + q)*r2 + p
+
+        we then choose a value of p such that:
+            w1 = r2*w2 + p
+            thus p = w1 - r2*w2
+                   = w1 mod w2    (since r2 = floor(w1/w2))
+            and  q = w2 - p which is > 0 because
+                            p = w1 mod w2 < w2
+
+            substituting into:
+            (p + q)*r2 + p
+            gives:
+            w1 = (p + w2 - p)*r2 + p
+            w1 = w2*r2 + w1 mod w2
+
+*/
+
+
+/* we can index on the bottom bit of the divisor.
+ * 
+ * we could also do a multiply per pixel and accumulate precision
+*/
+/**********************************************************
+ Stretches a horizontal source line onto a horizontal
+ destination line. Used by RectStretch.
+**********************************************************/
+void downsample_row_box_filter(
+		int n, /* number of destination pixels to write */
+		uint32_t *src, uint32_t *dest,
+                long dx2, long e, long src_dx) /* dx2: error removed per source pixel consumed; e: starting error; src_dx: error added per destination pixel */
+{
+    int di = 0; /* not used below */
+    int divs[2]; /* (1<<24)/size for the two box sizes, indexed by the low bit of the size */
+    int div = src_dx/dx2; /* smaller box size; NOTE(review): a zero quotient would divide by zero on the next line */
+    divs[div & 1] = (1<<24)/div;
+    div += 1;
+    divs[div & 1] = (1<<24)/div;
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int count = 1; /* source pixels summed into this destination pixel */
+        a = (*src >> 24) & 0xff; /* each 8-bit channel is accumulated separately */
+        r = (*src >> 16) & 0xff;
+        g = (*src >>  8) & 0xff;
+        b = (*src >>  0) & 0xff;
+        e -= dx2;
+        src++;
+        while (e > 0) { /* keep consuming source pixels while the error term stays positive */
+            e -= dx2;
+            a += (*src >> 24) & 0xff;
+            r += (*src >> 16) & 0xff;
+            g += (*src >>  8) & 0xff;
+            b += (*src >>  0) & 0xff;
+            count++;
+            src++;
+        }
+#if 1
+        int div = divs[count & 1]; /* shadows the outer div; assumes count is one of the two sizes set up above */
+
+        a = (a * div + 0x10000) >> 24; /* multiply by the precomputed reciprocal instead of dividing by count */
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+
+        //XXX counts seem high...
+#else
+        a /= count;
+        r /= count;
+        g /= count;
+        b /= count;
+#endif
+        *dest = (a << 24) | (r << 16) | (g << 8) | b; /* repack the averaged channels into one pixel */
+        dest++;
+
+        e += src_dx;
+    }
+}
+
+void downsample_row_box_filter_sse2(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                long dx2, long e, long src_dx)
+{
+#if 0
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int count = 1;
+        a = (*src >> 24) & 0xff;
+        r = (*src >> 16) & 0xff;
+        g = (*src >>  8) & 0xff;
+        b = (*src >>  0) & 0xff;
+        while (e > 0) {
+            e -= dx2;
+            a += (*src >> 24) & 0xff;
+            r += (*src >> 16) & 0xff;
+            g += (*src >>  8) & 0xff;
+            b += (*src >>  0) & 0xff;
+            count++;
+            src++;
+        }
+        //XXX counts seem high...
+        a /= count;
+        r /= count;
+        g /= count;
+        b /= count;
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+
+        e += src_dx;
+    }
+#endif
+    /* Nothing is compiled in: the loop above is disabled with #if 0, so this function currently does nothing. */
+}
+
+/* Average `columns` rows of n pixels each (row k starts at src + k*n) into n pixels at dest. */
+void downsample_columns_box_filter(
+        int n,
+        int columns,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n; /* rows are packed back to back, n pixels apart */
+    int div = (1<<24)/columns; /* reciprocal of the row count; columns must be non-zero */
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int h;
+        uint32_t *column_src = src;
+        a = (*column_src >> 24) & 0xff;
+        r = (*column_src >> 16) & 0xff;
+        g = (*column_src >>  8) & 0xff;
+        b = (*column_src >>  0) & 0xff;
+        for (h=1; h<columns; h++) {
+            column_src += stride; /* step to the next row before reading, so every row is summed exactly once */
+            a += (*column_src >> 24) & 0xff;
+            r += (*column_src >> 16) & 0xff;
+            g += (*column_src >>  8) & 0xff;
+            b += (*column_src >>  0) & 0xff;
+        }
+#if 1
+        a = (a * div + 0x10000) >> 24; /* multiply by the reciprocal instead of dividing by columns */
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+#else
+        a /= columns;
+        r /= columns;
+        g /= columns ;
+        b /= columns;
+#endif
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src++;
+    }
+}
+
+/**********************************************************
+ RectStretch enlarges or diminishes a source rectangle of
+ a bitmap to a destination rectangle. The source
+ rectangle is selected by the two points (xs1,ys1) and
+ (xs2,ys2), and the destination rectangle by (xd1,yd1) and
+ (xd2,yd2). Since readability of source-code is wanted,
+ some optimizations have been left out for the reader:
+ It's possible to read one line at a time, by first
+ stretching in x-direction and then stretching that bitmap
+ in y-direction.
+ Entry:
+	xs1,ys1 - first point of source rectangle
+	xs2,ys2 - second point of source rectangle
+	xd1,yd1 - first point of destination rectangle
+	xd2,yd2 - second point of destination rectangle
+**********************************************************/
+/* startColumn, startRow specify the index of the first source pixel from the source image.
+ * width and height specify the number of source pixels to be drawn
+ *
+ * origWidth, origHeight, scaledWidth and scaledHeight specify the scaling exactly
+ *
+ * x_center and y_center specify the offset within a pixel from which to start drawing
+ * they are, of course, specified in destination space
+ */
+#define ROUND
+//XXX:
+/* Box-filter downscale of pict from origWidth x origHeight to scaledWidth x scaledHeight.
+ * Writes `height` rows of `width` pixels, starting at (startColumn, startRow) of the scaled
+ * image, to dest (dstStride is in bytes).  Source rows are read with fetch_scanline().
+ * Returns 0 on success, -1 if a temporary buffer cannot be allocated. */
+PIXMAN_EXPORT
+int downscale_box_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+{
+        long xs2, ys2, xd2, yd2;
+        long yd1 = 0, ys1 = 0;
+        long xd1 = 0, xs1 = 0;
+	xs2 = origWidth ;
+	ys2 = origHeight ;
+	xd2 = abs(scaledWidth) ;
+	yd2 = abs(scaledHeight) ;
+	long e,d,dx2;
+	short sy;
+	long dest_dy = abs(yd2);
+	long src_dy = abs(ys2);
+        if (scaledHeight < 0) {
+            ys1 = origHeight - ((src_dy<<1)+dest_dy)/(dest_dy << 1); // e = dy*2 - dx
+            sy = -1;
+        }
+	e = (src_dy<<1)-dest_dy; // e = dy*2 - dx
+	dx2 = dest_dy<<1; // dx2 = dx *2
+	src_dy <<= 1; // dy *= 2
+
+        long dest_dx,src_dx,ex,dx2x;
+	short src_sx = 1; /* step through the source row; must be initialized because it is read below */
+	dest_dx = abs(xd2);
+	src_dx = abs(xs2);
+        int src_direction = 1;
+#ifdef ROUND
+	ex = (src_dx<<1)-dest_dx;
+	dx2x = dest_dx<<1; // multiplying by 2 is for rounding purposes
+	src_dx <<= 1;
+#else
+        dx2x = dest_dx;
+        ex = src_dx - dest_dx;
+#endif
+
+        /* Compute the src_offset and associated error value:
+         * There are two ways to do this: */
+        /* 1: Directly */
+        int nsrc_offset = (src_dx*startColumn + dest_dx)*src_sx/(dest_dx*2);
+        long nex = (src_dx*startColumn + dest_dx)%(dest_dx*2) - dest_dx*2 + src_dx;
+	/* 2. Iteratively */
+        int src_offset = 0;
+	for (d = 0; d < startColumn; d++) {
+            //XXX: should be 'ex > 0'
+		while (ex > 0) {
+			src_offset += src_sx;
+			ex -= dx2x;
+		}
+		ex += src_dx;
+	}
+#if 0
+        if (nex != ex) {
+            printf("e: %d %d: %d %d %d\n", ex, nex, src_dx, startColumn, dest_dx);
+            assert(nex == ex);
+        }
+        if (nsrc_offset != src_offset) {
+            printf("off: %d %d: %d %d %d\n", src_offset, nsrc_offset, src_dx, startColumn, dest_dx);
+            assert(nsrc_offset == src_offset);
+        }
+#endif
+        src += src_offset; /* NOTE(review): src is never read after this; rows come from fetch_scanline() and are filtered from column 0 */
+
+        /* we need to allocate enough room for ceil(src_height/dest_height) scanlines */
+        uint32_t *temp_buf = pixman_malloc_abc ((origHeight + scaledHeight-1)/scaledHeight, scaledWidth, sizeof(uint32_t));
+	if (!temp_buf)
+            return -1;
+        uint32_t *scanline = pixman_malloc_abc (origWidth, 1, sizeof(uint32_t));
+        if (!scanline) {
+            free(temp_buf); /* do not leak the first buffer when the second allocation fails */
+            return -1;
+        }
+
+        int x = 0;
+        int y = 0;
+        /* seek to the beginning: advance the vertical error term and source row past the skipped rows */
+        for (d = 0; d < startRow; d++)
+	{
+            while (e > 0)
+            {
+                e -= dx2;
+                y++;
+            }
+            e += src_dy;
+        }
+
+        for (d = startRow; d < startRow + height; d++)
+	{
+            int columns = 0; /* number of source rows gathered for this destination row */
+            while (e > 0)
+            {
+                fetch_scanline(pict, y, scanline);
+                //XXX:  we could turn these multiplications into additions
+                downsample_row_box_filter(width, scanline, temp_buf + width * columns,
+                        dx2x, ex, src_dx);
+                ys1 ++;
+                e -= dx2;
+                columns++;
+                y++;
+            }
+            downsample_columns_box_filter(width, columns, temp_buf, dest + (yd1 - startRow)*dstStride/4); /* NOTE(review): yd1 starts at 0, so this offset is negative whenever startRow > 0 — confirm intent */
+            yd1 += 1;
+            e += src_dy;
+        }
+        free(scanline);
+        free(temp_buf);
+        return 0;
+}
+
diff --git a/pixman/pixman-rescale-lanczos.c b/pixman/pixman-rescale-lanczos.c
new file mode 100644
index 0000000..596b192
--- /dev/null
+++ b/pixman/pixman-rescale-lanczos.c
@@ -0,0 +1,480 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+#define abs(a) (a) > 0 ? (a) : -(a)
+#define sign(x) ((x)>0 ? 1:-1)
+
+/* Fetch row y of the image as 32-bit pixels. y is clamped to the last row; pixels outside the source clip read as 0. */
+static void fetch_scanline(void *closure, int y, uint32_t *scanline)
+{
+    pixman_image_t *image = closure;
+    bits_image_t *bits = &image->bits;
+    int last_row = bits->height - 1;
+    int col;
+
+    if (y > last_row)
+        y = last_row;
+    if (!pixman_image_has_source_clipping(image)) {
+        /* no source clip: the whole row can be fetched in one call */
+        fetchProc32 fetch_row = ACCESS(pixman_fetchProcForPicture32)(bits);
+        fetch_row(bits, 0, y, bits->width, scanline);
+        return;
+    }
+    fetchPixelProc32 fetch_pixel = ACCESS(pixman_fetchPixelProcForPicture32)(bits);
+    for (col = 0; col < bits->width; col++)
+        scanline[col] = pixman_region32_contains_point (bits->common.src_clip, col, y, NULL) ? fetch_pixel (bits, col, y) : 0;
+}
+
+#define FILTER_SHIFT 14
+#define LANCZOS_LOBES 2
+
+/* One 1-D convolution filter per destination pixel (see compute_lanczos_filter) */
+struct filter {
+    int count; // number of coefficients in values[], i.e. how many source pixels the filter spans
+    int16_t *values; // filter coefficients, fixed point with FILTER_SHIFT fractional bits
+    int offset; // index of the first source pixel the filter applies to
+};
+
+#include <pmmintrin.h>
+#include <emmintrin.h>
+/* Saturate a filtered channel value to the 8-bit range [0, 255]. */
+static int clamp(int a)
+{
+    if (a < 0)
+        return 0;
+    /* 255 is the largest value an 8-bit channel can hold */
+    return a > 255 ? 255 : a;
+}
+
+/* SSE2 horizontal pass: writes n pixels to dest, each the filter[]-weighted sum of a run of src pixels. */
+static void downsample_row_convolve_sse2(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                struct filter *filter, int src_size) /* src_size is not read */
+{
+    int pixel = 0; /* index of the destination pixel, and of its filter */
+    while (n--) {
+        int i;
+        __m128i accum = _mm_setzero_si128();
+        __m128i zero = _mm_setzero_si128();
+        for (i=0; i<filter[pixel].count; i++) {
+            __m128i v_src = _mm_cvtsi32_si128(src[filter[pixel].offset + i]);
+            v_src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(v_src, zero), zero);
+            /* v_src: the four channel bytes, each zero-extended into its own 32-bit lane */
+            __m128i coeff = _mm_cvtsi32_si128(filter[pixel].values[i]);
+            /* duplicate the filter coefficient into all four 32-bit lanes */
+            coeff = _mm_shuffle_epi32(coeff, _MM_SHUFFLE(0, 0, 0, 0));
+
+            /* multiply and accumulate the result (only the low 16 bits of each coefficient lane matter):
+             * 0000vvvv_0000vvvv_0000vvvv_0000vvvv * 000000aa_000000rr_000000gg_000000bb */
+            __m128i result = _mm_madd_epi16(v_src, coeff);
+            accum = _mm_add_epi32(accum, result);
+        }
+
+        /* scale the accumulator down: drop the FILTER_SHIFT fractional bits of the fixed point sums */
+        accum = _mm_srai_epi32(accum, FILTER_SHIFT);
+
+        /* pack 000000aa_000000rr_000000gg_000000bb -> 00000000_00000000_00000000_aarrggbb in two steps; first 32 -> 16 bits */
+        accum = _mm_packs_epi32(accum, accum);
+
+        //XXX: this should not be needed to saturate properly (the unsigned pack below also clamps negatives) and doesn't seem to make a difference
+        accum = _mm_max_epi16(accum, zero);
+        /* second step: 16 -> 8 bits with unsigned saturation */
+        accum = _mm_packus_epi16(accum, accum);
+
+        *dest = _mm_cvtsi128_si32(accum);
+        dest++;
+        pixel++;
+    }
+}
+
+/* Scalar horizontal pass: produce n destination pixels from one source row.
+ * Destination pixel k is the weighted sum of filter[k].count source pixels
+ * starting at src[filter[k].offset]; weights have FILTER_SHIFT fractional bits.
+ * src_size is not used. */
+static void downsample_row_convolve(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                struct filter *filter, int src_size)
+{
+    struct filter *taps = filter;
+
+    for (; n-- != 0; taps++, dest++) {
+        int32_t sum_a = 0, sum_r = 0, sum_g = 0, sum_b = 0;
+        uint32_t *window = src + taps->offset;
+        int k;
+
+        for (k = 0; k < taps->count; k++) {
+            uint32_t argb = window[k];
+            int16_t weight = taps->values[k];
+
+            sum_a += ((argb >> 24) & 0xff) * weight;
+            sum_r += ((argb >> 16) & 0xff) * weight;
+            sum_g += ((argb >>  8) & 0xff) * weight;
+            sum_b += ((argb >>  0) & 0xff) * weight;
+        }
+
+        /* drop the fractional bits, then saturate each channel to 8 bits */
+        sum_a = clamp(sum_a >> FILTER_SHIFT);
+        sum_r = clamp(sum_r >> FILTER_SHIFT);
+        sum_g = clamp(sum_g >> FILTER_SHIFT);
+        sum_b = clamp(sum_b >> FILTER_SHIFT);
+        *dest = (sum_a << 24) | (sum_r << 16) | (sum_g << 8) | sum_b;
+    }
+}
+
+/* Vertical pass: combine filter_length rows of horizontally filtered pixels
+ * into one destination row of n pixels.
+ *
+ * The rows come from a ring buffer whose rows are n pixels apart, so they may
+ * not be contiguous: the first src1_length rows start at src1 and the
+ * remaining src2_length rows start at src2. filter[] holds filter_length
+ * coefficients with FILTER_SHIFT fractional bits, in row order.
+ *
+ * instead of src1, src2 we could pass:
+ * ring_buf, start_index, ring_buf_length, filter_length */
+static void downsample_columns_convolve(
+        int n,
+        uint32_t *src1, int src1_length,
+        uint32_t *src2, int src2_length,
+        uint32_t *dest,
+        int16_t *filter, int filter_length)
+{
+    int stride = n; /* pixels between vertically adjacent samples */
+
+    assert(src1_length+src2_length == filter_length);
+    assert(src1_length >= 0);
+    assert(src2_length >= 0);
+    while (n--) {
+        int32_t a = 0;
+        int32_t r = 0;
+        int32_t g = 0;
+        int32_t b = 0;
+        int i;
+        uint32_t *column_src;
+
+        /* we do the accumulation in two steps because the source lines are in a ring buffer
+         * so won't be contiguous */
+        column_src = src1;
+        for (i=0; i<src1_length; i++) { /* loop till the end of the ring buffer */
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        /* accumulate the remaining samples starting at the beginning of the ring buffer */
+        column_src = src2;
+        for (; i<filter_length; i++) {
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        /* drop the fractional bits and saturate. Alpha is stored as computed
+         * (it used to be forced to 0xff, which threw the source alpha away) */
+        a = clamp(a >> FILTER_SHIFT);
+        r = clamp(r >> FILTER_SHIFT);
+        g = clamp(g >> FILTER_SHIFT);
+        b = clamp(b >> FILTER_SHIFT);
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src1++;
+        src2++;
+    }
+}
+
+#include <float.h>
+
+// Evaluates the Lanczos filter of the given filter size window for the given
+// position.
+//
+// |filter_size| is the width of the filter (the "window"), outside of which
+// the value of the function is 0. Inside of the window, the value is the
+// normalized sinc function:
+//   lanczos(x) = sinc(x) * sinc(x / filter_size);
+// where
+//   sinc(x) = sin(pi*x) / (pi*x);
+#include <math.h>
+float eval_lanczos(int filter_size, float x) {
+  int outside_window = (x <= -filter_size) || (x >= filter_size);  /* the window is the open interval (-filter_size, filter_size) */
+  int at_origin = (x > -FLT_EPSILON) && (x < FLT_EPSILON);
+  if (outside_window)
+    return 0.0f;
+  if (at_origin)
+    return 1.0f;  /* sinc(0) is 1; the formula below would divide by zero here */
+  float xpi = x * (M_PI);
+  return (sin(xpi) / xpi) * sin(xpi / filter_size) / (xpi / filter_size);  /* sinc(x) * sinc(x/filter_size) */
+}
+
+/* dealing with the edges:
+   some options:
+   we could always have approximately the same number of samples in the filter and just pad the image out
+   chromium seems to truncate the filter though...
+   I don't really have a good reason to choose either approach...
+   one way to get an idea is to see what other implementation do.
+   - it looks like quartz pads
+   - chromium truncates the filter
+   - opera pads
+*/
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+/* Largest integer that is not greater than a. */
+int floor_int(float a) {
+    return (int) floor(a);
+}
+
+/* Smallest integer that is not less than a. */
+int ceil_int(float a) {
+    return (int) ceil(a);
+}
+/* Convert a to fixed point with FILTER_SHIFT fractional bits; the fraction beyond that is truncated. */
+int float_to_fixed(float a) {
+    return (int) (a * (1 << FILTER_SHIFT));
+}
+
+/* Build the table of 1-D Lanczos filters for one axis of the image.
+ *
+ * dest_subset_lo/dest_subset_size select the destination pixels [lo, lo + size),
+ * src_size is the source length along the axis and scale is dest size / src size.
+ *
+ * Returns dest_subset_size filters whose FILTER_SHIFT-bit fixed point coefficients
+ * add up to 1.0, or NULL if an allocation fails or a filter window is invalid.
+ * The caller frees every filter's values and then the array itself.
+ * Filters are truncated at the source edges; this method does not do source clipping */
+struct filter *compute_lanczos_filter(int dest_subset_lo, int dest_subset_size, int src_size, float scale)
+{
+    // this is half the number of pixels that the filter uses in filter space
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / scale;
+
+    /* look at ResizeFilter::ResizeFilter() and ::ComputeFilters(); we need to compute a set of filters for each pixel */
+    int i;
+    int pixel = 0; /* number of filters completed so far; also the slot being filled */
+    int dest_subset_i;
+    int dest_subset_hi = dest_subset_lo + dest_subset_size; // [lo, hi)
+    int max_filter_size = ceil_int(src_support * 2) - floor_int(src_support * -2) + 1;
+
+    // When we're doing a magnification, the scale will be larger than one. This
+    // means the destination pixels are much smaller than the source pixels, and
+    // that the range covered by the filter won't necessarily cover any source
+    // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+    // some computations.
+    float clamped_scale = min(1., scale);
+
+    // Speed up the divisions below by turning them into multiplies.
+    float inv_scale = 1. / scale;
+
+    struct filter *filter = malloc(dest_subset_size * sizeof(struct filter));
+    float *filter_values = malloc(max_filter_size * sizeof(float));
+    if (!filter || !filter_values)
+        goto fail;
+
+    // Loop over all pixels in the output range. We will generate one set of
+    // filter values for each one. Those values will tell us how to blend the
+    // source pixels to compute the destination pixel.
+    for (dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi; dest_subset_i++, pixel++) {
+        float src_pixel = dest_subset_i * inv_scale;
+
+        int src_begin = max(0, floor_int(src_pixel - src_support));
+        int src_end = min(src_size - 1, ceil_int(src_pixel + src_support));
+        int filter_size = src_end - src_begin + 1;
+
+        // Validate the window before anything is written through it, even when asserts are compiled out.
+        assert(src_end >= src_begin);
+        assert(filter_size <= max_filter_size);
+        if (filter_size < 1 || filter_size > max_filter_size)
+            goto fail;
+
+        // Compute the unnormalized filter value at each location of the source
+        // it covers.
+        float filter_sum = 0.; // sum of the filter values for normalizing
+        for (i = 0; i < filter_size; i++) {
+            // Distance from the center of the filter, this is the filter coordinate
+            // in source space.
+            float src_filter_pos = (src_begin + i) - src_pixel;
+
+            // Since the filter really exists in dest space, map it there.
+            float dest_filter_pos = src_filter_pos * clamped_scale;
+
+            // Compute the filter value at that location.
+            float filter_value = eval_lanczos(LANCZOS_LOBES, dest_filter_pos);
+            filter_sum += filter_value;
+            filter_values[i] = filter_value;
+        }
+        //XXX: we should avoid doing malloc here
+        int16_t *fixed_filter_values = malloc(filter_size * sizeof(int16_t));
+        if (!fixed_filter_values)
+            goto fail;
+
+        // the filter must be normalized so that we don't affect the brightness of
+        // the image. Convert to normalized fixed point
+        // XXX: It might be better if we didn't have to do this in a separate pass
+        int fixed_sum = 0; // a plain int, so that the running sum cannot wrap
+        for (i=0; i<filter_size; i++) {
+            int16_t cur_fixed = float_to_fixed(filter_values[i] / filter_sum);
+            fixed_sum += cur_fixed;
+            fixed_filter_values[i] = cur_fixed;
+        }
+
+        // The conversion to fixed point will leave some rounding errors, which
+        // we add back in to avoid affecting the brightness of the image. We
+        // arbitrarily add this to the center of the filter array (this won't always
+        // be the center of the filter function since it could get clipped on the
+        // edges, but it doesn't matter enough to worry about that case).
+        fixed_filter_values[filter_size / 2] += float_to_fixed(1.0f) - fixed_sum;
+
+        filter[pixel].values = fixed_filter_values;
+        filter[pixel].count = filter_size;
+        filter[pixel].offset = src_begin;
+        assert(filter[pixel].offset + filter[pixel].count - 1 < src_size);
+    }
+    free(filter_values);
+    return filter;
+fail:
+    while (pixel-- > 0)
+        free(filter[pixel].values);
+    free(filter);
+    free(filter_values);
+    return NULL;
+}
+
+// startColumn and startRow are in destination space
+/* Lanczos downscale: writes rows [startRow, startRow + height) and columns
+ * [startColumn, startColumn + width) of the scaledWidth x scaledHeight image
+ * to dest, whose rows are dstStride bytes apart. Returns 0 on success and -1
+ * if an allocation fails. src and srcStride are not used; pixels are read
+ * through pict. */
+PIXMAN_EXPORT
+int downscale_lanczos_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+{
+    int ret = -1;
+    int d, i;
+    int next_row = 0; /* next source row to fetch and filter horizontally */
+    int filter_index = 0;
+
+    assert(startColumn + width <= scaledWidth);
+    assert(startRow + height <= scaledHeight);
+
+    //XXX: this duplicates code in compute_lanczos_filter
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / ((float)scaledHeight/origHeight);
+    /* the ring buffer holds as many filtered rows as the largest vertical filter spans */
+    int ring_buf_size = ceil_int(src_support * 2) - floor_int(src_support * -2) + 1;
+
+    uint32_t *ring_buf = pixman_malloc_abc (ring_buf_size, scaledWidth, sizeof(uint32_t));
+    uint32_t *scanline = pixman_malloc_abc (origWidth, 1, sizeof(uint32_t));
+    struct filter *x_filter = compute_lanczos_filter(startColumn, width, origWidth, (float)scaledWidth/origWidth);
+    struct filter *y_filter = compute_lanczos_filter(startRow, height, origHeight, (float)scaledHeight/origHeight);
+    if (!ring_buf || !scanline || !x_filter || !y_filter)
+        goto out;
+
+    for (d = startRow; d < startRow + height; d++, filter_index++)
+    {
+        int filter_size = y_filter[filter_index].count;
+        int filter_offset = y_filter[filter_index].offset;
+        assert(filter_offset >= 0);
+        assert(filter_size <= ring_buf_size);
+
+        /* source rows above the current filter window are not needed by this
+         * or any later destination row, so don't fetch them */
+        if (next_row < filter_offset)
+            next_row = filter_offset;
+
+        /* read and downsample the rows needed to downsample the next column */
+        for (; next_row < filter_offset + filter_size; next_row++) {
+            fetch_scanline(pict, next_row, scanline);
+            downsample_row_convolve_sse2(width, scanline, ring_buf + width * (next_row % ring_buf_size), x_filter, origWidth);
+        }
+
+        /* the window may wrap around the end of the ring buffer: src1 is the
+         * part up to the end, src2 the part that wrapped to the start */
+        int src1_index = filter_offset % ring_buf_size;
+        int src1_size = min((ring_buf_size - src1_index), filter_size);
+        int src2_size = filter_size - src1_size;
+        uint32_t *src1 = ring_buf + width * src1_index;
+        uint32_t *src2 = ring_buf; // src2 is always at the same location
+
+        /* destination row d is row (d - startRow) of dest */
+        downsample_columns_convolve(width, src1, src1_size, src2, src2_size, dest + (d - startRow)*dstStride/4, y_filter[filter_index].values, filter_size);
+    }
+    ret = 0;
+
+out:
+    for (i=0; x_filter && i<width; i++)
+        free(x_filter[i].values);
+    for (i=0; y_filter && i<height; i++)
+        free(y_filter[i].values);
+    free(x_filter);
+    free(y_filter);
+    free(scanline);
+    free(ring_buf);
+    return ret;
+}
+
+#if 0
+            // rearrange filter coefficients in the order of the ring buffer?
+            // or we could just filter in the order of the ring buffer
+            i = 0;
+            j = start;
+            while (n) {
+                result += sample[j] * coeff[i];
+                if (j > size)
+                    j = 0;
+                i++;
+                j++;
+            }
+
+            // or rearranging the coeffs gives us
+            while (n) {
+                result += sample[i + sample*width] * coeff[i];
+            }
+            /* it also lets us move in a more orderly fashion through memory */
+            /* however I can't think of any way to get the samples to always be contiguous */
+
+            // it would be interesting to see how doing the multiplication by row and then summing all the rows compared to doing
+            // everything column wise. I expect the memory bandwidth of doing so could make things slower
+#endif
+
diff --git a/pixman/pixman-rescale-mult-old.c b/pixman/pixman-rescale-mult-old.c
new file mode 100644
index 0000000..bc27821
--- /dev/null
+++ b/pixman/pixman-rescale-mult-old.c
@@ -0,0 +1,286 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+/* This implements a box filter that supports non-integer box sizes */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+#define abs(a) (a) > 0 ? (a) : -(a)
+#define sign(x) ((x)>0 ? 1:-1)
+
+/* Fetch row y of the image as 32-bit pixels. y is clamped to the last row; pixels outside the source clip read as 0. */
+static void fetch_scanline(void *closure, int y, uint32_t *scanline)
+{
+    pixman_image_t *image = closure;
+    bits_image_t *bits = &image->bits;
+    int last_row = bits->height - 1;
+    int col;
+
+    if (y > last_row)
+        y = last_row;
+    if (!pixman_image_has_source_clipping(image)) {
+        /* no source clip: the whole row can be fetched in one call */
+        fetchProc32 fetch_row = ACCESS(pixman_fetchProcForPicture32)(bits);
+        fetch_row(bits, 0, y, bits->width, scanline);
+        return;
+    }
+    fetchPixelProc32 fetch_pixel = ACCESS(pixman_fetchPixelProcForPicture32)(bits);
+    for (col = 0; col < bits->width; col++)
+        scanline[col] = pixman_region32_contains_point (bits->common.src_clip, col, y, NULL) ? fetch_pixel (bits, col, y) : 0;
+}
+
+/* we work in fixed point where 1. == 1 << 24 */
+#define FIXED_SHIFT 24
+/* Horizontal box filter pass: reduce one source scanline (src) to n destination pixels (dest). */
+static void downsample_row_box_filter(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                int coverage[], int pixel_coverage)
+{
+    /* coverage[i] is the weight source pixel i gets when it is the first pixel
+     * of a destination pixel's box; pixel_coverage is the weight of a source
+     * pixel that lies completely inside a box. Weights are fixed point, with
+     * 1 << FIXED_SHIFT meaning 1.0, and each box adds up to exactly 1.0.
+     *
+     * A source pixel on the boundary between two boxes is split between them;
+     * we invert the value to get the value on the other side of the box:
+     *
+     *  value  = a * contribution * 1/box_size
+     *  value += a * 1/box_size
+     *  value += a * 1/box_size
+     *  value += a * 1/box_size
+     *  value += a * (1 - contribution) * 1/box_size
+     *           a * (1/box_size - contribution * 1/box_size)
+     *
+     * box size is constant: box size = ceil(src_width/dest_width)
+     *
+     *  value = a * contribution_a * 1/box_size + b * contribution_b * 1/box_size
+     *          contribution_b = (1 - contribution_a)
+     *                         = (1 - contribution_a_next) */
+
+    int x = 0; /* index of the current source pixel, used to look up coverage[] */
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int box = 1 << FIXED_SHIFT; /* weight still to be handed out for this destination pixel, starts at 1.0 */
+        int start_coverage = coverage[x];
+        a = ((*src >> 24) & 0xff) * start_coverage;
+        r = ((*src >> 16) & 0xff) * start_coverage;
+        g = ((*src >>  8) & 0xff) * start_coverage;
+        b = ((*src >>  0) & 0xff) * start_coverage;
+        src++;
+        x++;
+        box -= start_coverage;
+        while (box >= pixel_coverage) { /* source pixels that fit entirely into what is left of the box */
+            a += ((*src >> 24) & 0xff) * pixel_coverage;
+            r += ((*src >> 16) & 0xff) * pixel_coverage;
+            g += ((*src >>  8) & 0xff) * pixel_coverage;
+            b += ((*src >>  0) & 0xff) * pixel_coverage;
+            src++;
+            x++;
+
+            box -= pixel_coverage;
+        }
+        /* multiply by whatever is leftover; this ensures that we don't bias down,
+         * i.e. start_coverage + n*pixel_coverage + box == 1 << 24. src and x are not
+         * advanced here: the same pixel is also the first one of the next destination pixel */
+        if (box > 0) {
+            a += ((*src >> 24) & 0xff) * box;
+            r += ((*src >> 16) & 0xff) * box;
+            g += ((*src >>  8) & 0xff) * box;
+            b += ((*src >>  0) & 0xff) * box;
+        }
+
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+    }
+}
+
+/* Vertical box filter pass: collapse a stack of horizontally filtered
+ * scanlines, stored n pixels apart in src, into n destination pixels. The
+ * first scanline is weighted by start_coverage, the following ones by
+ * pixel_coverage, and a last partial one by whatever weight is still missing
+ * from 1 << FIXED_SHIFT. */
+static void downsample_columns_box_filter(
+        int n,
+        int start_coverage,
+        int pixel_coverage,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n;
+
+    for (; n-- != 0; src++, dest++) {
+        uint32_t *sample = src;
+        uint32_t argb = *sample;
+        int remaining = (1 << FIXED_SHIFT) - start_coverage;
+        uint32_t a = ((argb >> 24) & 0xff) * start_coverage;
+        uint32_t r = ((argb >> 16) & 0xff) * start_coverage;
+        uint32_t g = ((argb >>  8) & 0xff) * start_coverage;
+        uint32_t b = ((argb >>  0) & 0xff) * start_coverage;
+
+        /* scanlines that fit entirely into what is left of the box */
+        for (sample += stride; remaining >= pixel_coverage; sample += stride) {
+            argb = *sample;
+            a += ((argb >> 24) & 0xff) * pixel_coverage;
+            r += ((argb >> 16) & 0xff) * pixel_coverage;
+            g += ((argb >>  8) & 0xff) * pixel_coverage;
+            b += ((argb >>  0) & 0xff) * pixel_coverage;
+            remaining -= pixel_coverage;
+        }
+
+        /* trailing partial scanline takes the leftover weight */
+        if (remaining > 0) {
+            argb = *sample;
+            a += ((argb >> 24) & 0xff) * remaining;
+            r += ((argb >> 16) & 0xff) * remaining;
+            g += ((argb >>  8) & 0xff) * remaining;
+            b += ((argb >>  0) & 0xff) * remaining;
+        }
+
+        *dest = ((a >> FIXED_SHIFT) << 24) | ((r >> FIXED_SHIFT) << 16) | ((g >> FIXED_SHIFT) << 8) | (b >> FIXED_SHIFT);
+    }
+}
+
+#if 1
+/* Compute the fixed point box filter weights (1.0 == 1 << FIXED_SHIFT) for
+ * scaling src_length pixels down to dest_length pixels.
+ *
+ * Every source pixel contributes ratio = dest_length/src_length of a
+ * destination pixel:
+ *   num = src_length/dest_length
+ *   total = sum(pixel) / num
+ *   pixel * 1/num == pixel * dest_length / src_length
+ *
+ * A destination box is full once it has collected 1.0. When source pixel i
+ * straddles two boxes, coverage[i] is the part of its ratio that falls into
+ * the box it starts (the rest completed the previous box); for every other
+ * pixel coverage[i] == ratio.
+ *
+ * (1 << FIXED_SHIFT) * dest_length won't always be divisible by src_length,
+ * so ratio is rounded down.
+ *
+ * Returns ratio, the weight of a whole source pixel, or 0 when src_length is
+ * not positive (coverage[] is then left untouched). */
+int compute_coverage(int coverage[], int src_length, int dest_length) {
+    int i;
+    int full = 1 << FIXED_SHIFT;
+    int ratio;
+    int remainder = full; /* weight still missing from the box being filled */
+
+    if (src_length <= 0)
+        return 0; /* nothing to cover; also avoids dividing by zero below */
+
+    /* the average contribution of each source pixel */
+    ratio = ((1 << FIXED_SHIFT)*(long long int)dest_length)/src_length;
+
+    for (i=0; i<src_length; i++) {
+        if (remainder < ratio) {
+            /* pixel i completes the current box and starts the next one */
+            coverage[i] = ratio - remainder;
+            remainder = full - coverage[i];
+        } else {
+            coverage[i] = ratio;
+            remainder -= ratio;
+        }
+    }
+
+    return ratio;
+}
+#endif
+
+
+/* Box filter downscale (older variant): writes rows [startRow, startRow + height)
+ * of the scaled image to dest, dstStride bytes apart. Returns 0, or -1 if an
+ * allocation fails. NOTE(review): startColumn, src and srcStride are not used. */
+PIXMAN_EXPORT
+int downscale_box_mult_old_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+{
+    int ret = -1;
+    int d, i;
+    int y = 0; /* source row on which the next box starts */
+    /* a box weighs ceil(src_height/dest_height) rows but can straddle one more
+     * source row when it starts part-way through a row, hence the + 1 */
+    int max_rows = (origHeight + scaledHeight-1)/scaledHeight + 1;
+    uint32_t *temp_buf = pixman_malloc_abc (max_rows, scaledWidth, sizeof(uint32_t));
+    uint32_t *scanline = pixman_malloc_abc (origWidth, 1, sizeof(uint32_t));
+    int *x_coverage = pixman_malloc_abc (origWidth, 1, sizeof(int));
+    int *y_coverage = pixman_malloc_abc (origHeight, 1, sizeof(int));
+    if (!temp_buf || !scanline || !x_coverage || !y_coverage)
+        goto out;
+
+    int pixel_coverage_x = compute_coverage(x_coverage, origWidth, scaledWidth);
+    int pixel_coverage_y = compute_coverage(y_coverage, origHeight, scaledHeight);
+
+    for (d = 0; d < startRow + height; d++)
+    {
+        int first_row = y;
+        int rows = 1; /* the row the box starts on */
+        int start_coverage_y = y_coverage[y];
+        int box = (1 << FIXED_SHIFT) - start_coverage_y;
+        for (y++; box >= pixel_coverage_y; y++, rows++)
+            box -= pixel_coverage_y;
+        if (box > 0)
+            rows++; /* trailing partial row: not consumed, it also starts the next box */
+
+        /* rows above startRow are only walked to find the source row where
+         * startRow begins; their output would land before the start of dest */
+        if (d < startRow)
+            continue;
+
+        assert(rows <= max_rows);
+        for (i = 0; i < rows; i++) {
+            fetch_scanline(pict, first_row + i, scanline);
+            downsample_row_box_filter(width, scanline, temp_buf + width * i, x_coverage, pixel_coverage_x);
+        }
+        downsample_columns_box_filter(width, start_coverage_y, pixel_coverage_y, temp_buf, dest + (d - startRow)*dstStride/4);
+    }
+    ret = 0;
+out:
+    free(y_coverage);
+    free(x_coverage);
+    free(scanline);
+    free(temp_buf);
+    return ret;
+}
diff --git a/pixman/pixman-rescale-mult.c b/pixman/pixman-rescale-mult.c
new file mode 100644
index 0000000..b023afd
--- /dev/null
+++ b/pixman/pixman-rescale-mult.c
@@ -0,0 +1,335 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+/* This implements a box filter that supports non-integer box sizes */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+/* Absolute value of a; the whole expansion is parenthesized so the macro
+ * composes correctly inside larger expressions (e.g. 2 * abs(x)). */
+#define abs(a) ((a) > 0 ? (a) : -(a))
+/* 1 for strictly positive x, -1 otherwise (including zero). */
+#define sign(x) ((x)>0 ? 1:-1)
+
+/*
+ * Read one source row of the image passed as `closure` into `scanline`,
+ * one 32-bit pixel per source column.
+ *
+ * Rows past the bottom of the image are clamped to the last row.  When the
+ * image has source clipping, pixels outside the clip region are written as 0.
+ */
+static void fetch_scanline(void *closure, int y, uint32_t *scanline)
+{
+    pixman_image_t *image = closure;
+    bits_image_t *bits = &image->bits;
+    fetchPixelProc32 fetch_pixel;
+    int col;
+
+    /* clamp reads below the image to its final row */
+    if (y > image->bits.height-1)
+        y = image->bits.height-1;
+
+    /* unclipped source: fetch the whole row in one call */
+    if (!pixman_image_has_source_clipping(image)) {
+        fetchProc32 fetch_row = ACCESS(pixman_fetchProcForPicture32)(bits);
+        fetch_row(bits, 0, y, image->bits.width, scanline);
+        return;
+    }
+
+    /* clipped source: test every pixel against the clip region */
+    fetch_pixel = ACCESS(pixman_fetchPixelProcForPicture32)(bits);
+    for (col = 0; col < bits->width; col++) {
+        if (!pixman_region32_contains_point (bits->common.src_clip, col, y,NULL)) {
+            scanline[col] = 0;
+            continue;
+        }
+        scanline[col] = fetch_pixel (bits, col, y);
+    }
+}
+
+/* we work in fixed point where 1. == 1 << 24 */
+#define FIXED_SHIFT 24
+
+/*
+ * Horizontally box-filter part of one scanline.
+ *
+ * Produces `width` destination pixels, beginning at destination column
+ * `start`, from the source pixels in `src`.  Every destination pixel is a
+ * weighted sum of consecutive source pixels whose weights add up to
+ * 1 << FIXED_SHIFT:
+ *
+ *   coverage[i]     weight of the first source pixel of destination pixel i
+ *   pixel_coverage  weight of each following, fully covered source pixel
+ *   (remainder)     whatever is left of the box goes to one more source
+ *                   pixel, so the weights always sum to exactly 1.0 and the
+ *                   result is not biased down
+ *
+ * A source pixel that receives the remainder is not consumed; it is also the
+ * first source pixel of the next destination pixel.
+ */
+static void downsample_row_box_filter(
+        int start,
+        int width,
+        uint32_t *src, uint32_t *dest,
+        int coverage[], int pixel_coverage)
+{
+    int out;
+
+    /* walk past the source pixels that belong to the skipped destination columns */
+    for (out = 0; out < start; out++) {
+        int remaining = (1 << FIXED_SHIFT) - coverage[out];
+        src++;
+        for (; remaining >= pixel_coverage; remaining -= pixel_coverage)
+            src++;
+    }
+
+    for (; out < start + width; out++) {
+        const int first_weight = coverage[out];
+        int remaining = (1 << FIXED_SHIFT) - first_weight;
+        uint32_t sum[4]; /* b, g, r, a accumulators in fixed point */
+        int ch;
+
+        /* leading, partially covered source pixel */
+        for (ch = 0; ch < 4; ch++)
+            sum[ch] = ((*src >> (8 * ch)) & 0xff) * first_weight;
+        src++;
+
+        /* fully covered source pixels */
+        for (; remaining >= pixel_coverage; remaining -= pixel_coverage) {
+            for (ch = 0; ch < 4; ch++)
+                sum[ch] += ((*src >> (8 * ch)) & 0xff) * pixel_coverage;
+            src++;
+        }
+
+        /* trailing source pixel takes the leftover weight; src stays put */
+        if (remaining > 0) {
+            for (ch = 0; ch < 4; ch++)
+                sum[ch] += ((*src >> (8 * ch)) & 0xff) * remaining;
+        }
+
+        *dest++ = ((sum[3] >> FIXED_SHIFT) << 24)
+                | ((sum[2] >> FIXED_SHIFT) << 16)
+                | ((sum[1] >> FIXED_SHIFT) << 8)
+                |  (sum[0] >> FIXED_SHIFT);
+    }
+}
+
+/*
+ * Vertically box-filter `n` columns into one destination row.
+ *
+ * `src` holds consecutive rows of `n` pixels each.  For every column the rows
+ * are combined top to bottom with weights start_coverage, then pixel_coverage
+ * for each fully covered row, then whatever remains of 1 << FIXED_SHIFT for
+ * one final row.  One pixel per column is written to `dest`.
+ */
+static void downsample_columns_box_filter(
+        int n,
+        int start_coverage,
+        int pixel_coverage,
+        uint32_t *src, uint32_t *dest)
+{
+    const int stride = n; /* rows in src are n pixels apart */
+
+    for (; n != 0; n--, src++, dest++) {
+        uint32_t *row = src;
+        int remaining = (1 << FIXED_SHIFT) - start_coverage;
+        uint32_t sum[4]; /* b, g, r, a accumulators in fixed point */
+        int ch;
+
+        /* first, partially covered row */
+        for (ch = 0; ch < 4; ch++)
+            sum[ch] = ((*row >> (8 * ch)) & 0xff) * start_coverage;
+        row += stride;
+
+        /* fully covered rows */
+        for (; remaining >= pixel_coverage; remaining -= pixel_coverage) {
+            for (ch = 0; ch < 4; ch++)
+                sum[ch] += ((*row >> (8 * ch)) & 0xff) * pixel_coverage;
+            row += stride;
+        }
+
+        /* last row takes the leftover weight */
+        if (remaining > 0) {
+            for (ch = 0; ch < 4; ch++)
+                sum[ch] += ((*row >> (8 * ch)) & 0xff) * remaining;
+        }
+
+        *dest = ((sum[3] >> FIXED_SHIFT) << 24)
+              | ((sum[2] >> FIXED_SHIFT) << 16)
+              | ((sum[1] >> FIXED_SHIFT) << 8)
+              |  (sum[0] >> FIXED_SHIFT);
+    }
+}
+
+#include <math.h>
+/*
+ * Compute the box-filter weights for scaling src_length source pixels down
+ * to dest_length destination pixels, in fixed point where 1.0 == 1 << 24.
+ *
+ * coverage[i] receives the weight of the first source pixel contributing to
+ * destination pixel i.  The return value ("ratio") is the weight of every
+ * fully covered source pixel, i.e. (1 << 24) * dest_length / src_length
+ * rounded down.
+ *
+ * While computing the weights the function replays the same walk the filters
+ * perform and asserts that it consumes exactly src_length source pixels.
+ * Diagnostics are written to stderr only when one of those checks is about
+ * to fail.
+ */
+static int compute_coverage(int coverage[], int src_length, int dest_length) {
+    int i;
+    int ratio;
+    double scale;
+    int y = 0; /* number of source pixels consumed so far */
+
+    assert(src_length > 0); /* src_length is a divisor below */
+
+    /* num = src_length/dest_length
+       total = sum(pixel) / num
+
+       pixel * 1/num == pixel * dest_length / src_length
+    */
+    /* the average contribution of each source pixel */
+    ratio = ((1 << 24)*(long long int)dest_length)/src_length;
+    /* because ((1 << 24)*(long long int)dest_length) won't always be divisible
+     * by src_length, the leftover bits end up in the per-pixel coverage
+     * values so that each box still sums to 1 << 24 */
+    scale = (double)src_length/dest_length;
+    for (i=0; i<dest_length; i++) {
+        /* edges of destination pixel i expressed in source coordinates */
+        float left_side = i*scale;
+        float right_side = (i+1)*scale;
+        float right_fract = right_side - floor(right_side);
+        float left_fract = ceil(left_side) - left_side;
+        /* whole source pixels strictly inside the box */
+        int count = floor(right_side) - ceil(left_side);
+        /* weight of the partially covered pixel on the right edge */
+        int overage = ratio*(right_fract);
+        int coeff;
+        int box;
+
+        assert(right_fract <= 1);
+        /* when the box starts on a pixel boundary the first whole pixel is
+         * the one weighted by coverage[i], so it is not counted again */
+        if (left_fract == 0.)
+            count--;
+        coeff = (1<<24) - (count * ratio + overage);
+        assert(coeff >= 0);
+
+        /* replay the filter loop to verify how many source pixels it eats */
+        box = (1<<24) - coeff;
+        y++;
+        while (box >= ratio) {
+            box -= ratio;
+            y++;
+        }
+        if (y != floor(right_side)) {
+            fprintf(stderr, "y: %d, right_side: %f\n",y, right_side);
+            assert(y == floor(right_side));
+        }
+        /* at the end of the source at most one unit of rounding may remain */
+        if (y == src_length && box > 1) {
+            fprintf(stderr, "box: %d\n", box);
+            assert(box <= 1);
+        }
+        coverage[i] = coeff;
+    }
+    if (y != src_length) {
+        fprintf(stderr, "y: %d src_length: %d\n", y, src_length);
+        assert(y == src_length);
+    }
+    return ratio;
+}
+
+/*
+ * Box-filter downscale of `pict` from origWidth x origHeight to
+ * scaledWidth x scaledHeight, producing only the width x height rectangle
+ * of the scaled image whose top-left corner is (startColumn, startRow).
+ *
+ * Source rows are read through fetch_scanline(); each is filtered
+ * horizontally into temp_buf, and the collected rows are then filtered
+ * vertically into one row of `dest`.  dstStride is divided by 4 before it is
+ * applied to the uint32_t destination pointer.  `src` and `srcStride` are
+ * not used by this implementation.
+ *
+ * Returns 0 on success and -1 if any working buffer cannot be allocated.
+ */
+PIXMAN_EXPORT
+int downscale_box_mult_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+{
+        int yd1 = 0;
+        int d;
+        int y = 0;
+        int result = -1;
+        int pixel_coverage_x;
+        int pixel_coverage_y;
+        uint32_t *temp_buf;
+        uint32_t *scanline;
+        int *x_coverage;
+        int *y_coverage;
+
+        assert(width + startColumn <= scaledWidth);
+        /* we need to allocate enough room for ceil(src_height/dest_height)+1 scanlines */
+        temp_buf = pixman_malloc_abc ((origHeight + scaledHeight-1)/scaledHeight+1, width, sizeof(uint32_t));
+        scanline = pixman_malloc_abc (origWidth, 1, sizeof(uint32_t));
+        x_coverage = pixman_malloc_abc (origWidth, 1, sizeof(int));
+        y_coverage = pixman_malloc_abc (origHeight, 1, sizeof(int));
+        /* every buffer is checked before first use; free(NULL) is a no-op,
+         * so a single cleanup path covers any combination of failures */
+        if (!temp_buf || !scanline || !x_coverage || !y_coverage)
+            goto cleanup;
+
+        pixel_coverage_x = compute_coverage(x_coverage, origWidth, scaledWidth);
+        pixel_coverage_y = compute_coverage(y_coverage, origHeight, scaledHeight);
+
+        /* advance y past the source rows consumed by output rows above startRow */
+        for (d = 0; d<startRow; d++) {
+            int box = 1 << FIXED_SHIFT;
+            int start_coverage_y = y_coverage[d];
+            box -= start_coverage_y;
+            y++;
+            while (box >= pixel_coverage_y) {
+                box -= pixel_coverage_y;
+                y++;
+            }
+        }
+        for (d = startRow; d < startRow + height; d++)
+	{
+            int columns = 0; /* number of filtered rows currently held in temp_buf */
+            int box = 1 << FIXED_SHIFT;
+            int start_coverage_y = y_coverage[d];
+            fetch_scanline(pict, y, scanline);
+            downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+            columns++;
+            y++;
+            box -= start_coverage_y;
+
+            while (box >= pixel_coverage_y) {
+                fetch_scanline(pict, y, scanline);
+                downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+                y++;
+                box -= pixel_coverage_y;
+            }
+            /* the row receiving the leftover weight is fetched but not
+             * consumed: y is not advanced, so the next output row starts
+             * on this same source row */
+            if (box > 0) {
+                fetch_scanline(pict, y, scanline);
+                downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+            }
+            /* NOTE(review): yd1 starts at 0 while d starts at startRow, so
+             * (yd1 - startRow) is negative whenever startRow > 0 -- confirm
+             * what `dest` is expected to point at in that case. */
+            downsample_columns_box_filter(width, start_coverage_y, pixel_coverage_y, temp_buf, dest + (yd1 - startRow)*dstStride/4);
+            yd1++;
+            if (width*columns > ((origHeight + scaledHeight-1)/scaledHeight+1) * width) {
+                printf("%d %d\n", origHeight, scaledHeight);
+                printf("%d %d\n", columns, (origHeight + scaledHeight-1)/scaledHeight);
+                /* same bound as the allocation of temp_buf and the test above */
+                assert(width*columns <= ((origHeight + scaledHeight-1)/scaledHeight+1) * width);
+            }
+        }
+        if (y > origHeight) {
+            printf("%d %d\n", y, origHeight);
+            assert(y<=origHeight);
+        }
+        result = 0;
+
+cleanup:
+        free(scanline);
+        free(x_coverage);
+        free(y_coverage);
+        free(temp_buf);
+        return result;
+}
diff --git a/pixman/pixman-rescale.h b/pixman/pixman-rescale.h
new file mode 100644
index 0000000..e2f5685
--- /dev/null
+++ b/pixman/pixman-rescale.h
@@ -0,0 +1,21 @@
+#ifndef PIXMAN_RESCALE_H
+#define PIXMAN_RESCALE_H
+
+/*
+ * Downscaling filters.  All entry points share one signature.
+ *
+ * As used by the box filter implementation in pixman-rescale-mult.c
+ * (presumably the same for the others -- confirm against each
+ * implementation):
+ *   pict                     source image
+ *   origWidth, origHeight    size of the source image
+ *   scaledWidth/Height       size of the complete scaled image
+ *   startColumn, startRow    top-left corner, in scaled coordinates, of the
+ *                            rectangle to produce
+ *   width, height            size of that rectangle
+ *   dest, dstStride          output pixels and output stride
+ *
+ * The including file must already have declared pixman_image_t and the
+ * <stdint.h> types.
+ */
+int downscale_box_filter(
+        pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride);
+
+int downscale_box_mult2_filter( pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride);
+
+int downscale_lanczos_filter( pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride);
+
+#endif /* PIXMAN_RESCALE_H */
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index cdf0220..9266655 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -26,11 +26,278 @@
 #endif
 
 #include <stdlib.h>
+#include <math.h>
 
 #include "pixman-private.h"
 #include "pixman-mmx.h"
 #include "pixman-sse2.h"
 
+#if 0
+pixman_fixed_inverse(pixman_fixed_t a)
+{
+    /* are there any value that don't have an inverse?
+     * yes. lots */
+}
+#endif
+
+/**
+ * sqrt_fixed:
+ * @x: number to sqrt in fixed-point 16.16 format
+ *
+ * Returns an approximation of the square root of @x, or 0 when @x is zero
+ * or negative.
+ *
+ * The value is first normalized by shifting while the shift count is kept
+ * in a, a first guess is formed from the normalized value, the guess is
+ * scaled back by 2^(a/2) (times sqrt(2) when a is odd), and finally two
+ * Newton-Raphson steps y = (y + x/y) / 2 refine it.
+ *
+ * mul_fixed() and div_fixed() are defined elsewhere; presumably they are
+ * 16.16 multiply and divide -- confirm.
+ *
+ * This code is inspired by Ken Turkowski's fixed-point sqrt.
+ * http://www.worldserver.com/turk/computergraphics/FixedSqrt.pdf
+ *
+ * Author : Frederic Plourde
+ **/
+pixman_fixed_t
+sqrt_fixed (pixman_fixed_t x)
+{
+    /* 0xF333 / 65536 is about 0.95; 0x16A0A / 65536 is about 1.41421 */
+    const pixman_fixed_t pixman_fixed_095    = 0x0000F333;
+    const pixman_fixed_t pixman_fixed_sqrt_2 = 0x00016A0A;
+    int32_t y = x;
+    int32_t a = 0;	/* net number of right shifts applied to y */
+
+    if (y <= 0)
+    {
+	return 0;
+    }
+    else if (y > pixman_fixed_1)
+    {
+	/* shift right until nothing is left in the upper 16 bits */
+	while (y & 0xFFFF0000)
+	{
+	    y >>= 1;
+	    a++;
+	}
+    }
+    else
+    {
+	/* shift left until a bit reaches the upper 16 bits, then step back
+	 * one position so y again has only lower-16 bits set */
+	while (!(y & 0xFFFF0000))
+	{
+	    y <<= 1;
+	    a--;
+	}
+	y >>= 1;
+	a++;
+    }
+
+    /* first guess for the normalized value: (0.95 + y) / 2 */
+    y = (pixman_fixed_095 + y) >> 1;
+
+    if (a & 0x00000001) // a is odd
+    {
+	a = a >> 1;
+
+	/* undo half of the normalization shift; the odd leftover power of
+	 * two contributes a factor of sqrt(2) */
+	if (a > 0)
+	{
+	    y = mul_fixed(mul_fixed(y, pixman_fixed_1 << a), pixman_fixed_sqrt_2);
+	}
+        else
+	{
+	    y = mul_fixed(mul_fixed(y, pixman_fixed_1 >> -a), pixman_fixed_sqrt_2);
+	}
+    }
+    else // a is even
+    {
+	a = a >> 1;
+
+	if (a > 0)
+	{
+	    y = mul_fixed(y, pixman_fixed_1 << a);
+	}
+	else
+	{
+	    y = mul_fixed(y, pixman_fixed_1 >> -a);
+	}
+    }
+
+    // Iterate two times to improve accuracy.
+    // One of the below two lines may be removed to
+    // increase speed but at the cost of reduced accuracy.
+    y = (y + div_fixed(x, y)) >> 1;
+    y = (y + div_fixed(x, y)) >> 1;
+
+    return y;
+}
+
+/*
+ * Remove a scale from a transform in place.
+ *
+ * Conceptually this left-multiplies the transform by the inverse of
+ * diag(xscale, yscale, 1):
+ *
+ *   | 1/xscale    0      0 |   | a b c |   | a/xscale b/xscale c/xscale |
+ *   |    0    1/yscale   0 | * | d e f | = | d/yscale e/yscale f/yscale |
+ *   |    0        0      1 |   | g h i |   |     g        h        i    |
+ *
+ * The divisions are applied directly to the first two rows instead of
+ * building the inverse matrix, to take advantage of the increased accuracy.
+ * The third row is left untouched.
+ */
+void
+pixman_transform_unscale (pixman_transform_t *transform,
+			 pixman_fixed_t xscale, pixman_fixed_t yscale)
+{
+    int col;
+
+    /* first row is divided by xscale */
+    for (col = 0; col < 3; col++)
+    {
+	pixman_fixed_t *entry = &transform->matrix[0][col];
+
+	*entry = div_fixed(*entry, xscale);
+    }
+
+    /* second row is divided by yscale */
+    for (col = 0; col < 3; col++)
+    {
+	pixman_fixed_t *entry = &transform->matrix[1][col];
+
+	*entry = div_fixed(*entry, yscale);
+    }
+}
+
+/*
+ * Extract the horizontal and vertical scale factors of a transform.
+ *
+ * For an affine transform (third row equal to 0, 0, 1) the active branch
+ * stores
+ *     *scale_x = sqrt(a*a + c*c)
+ *     *scale_y = sqrt(b*b + d*d)
+ * converted with pixman_double_to_fixed(), where a, b are the first two
+ * entries of row 0 and c, d the first two entries of row 1.
+ *
+ * NOTE(review): for a non-affine transform the only code is under #if 0, so
+ * *scale_x and *scale_y are left unmodified.  The FIXED branch refers to
+ * width and height, which are not parameters of this function.
+ */
+void
+pixman_extract_scale (pixman_transform_t *transform,
+        int *scale_x, int *scale_y)
+{
+    pixman_fixed_t (*matrix)[3] = transform->matrix;
+    /* affine means the projective row is exactly (0, 0, 1) */
+    pixman_bool_t affine = matrix[2][0] == 0 && matrix[2][1] == 0 && matrix[2][2] == pixman_fixed_1;
+    if (affine) {
+#ifdef FIXED
+        *scale_x  = sqrt_fixed(square_fixed(width) *(square_fixed(matrix[0][0]) + square_fixed(matrix[1][0])));
+        *scale_y = sqrt_fixed(square_fixed(height)*(square_fixed(matrix[0][1]) + square_fixed(matrix[1][1])));
+#else
+        double a  = pixman_fixed_to_double(matrix[0][0]);
+        double b  = pixman_fixed_to_double(matrix[0][1]);
+        double c  = pixman_fixed_to_double(matrix[1][0]);
+        double d  = pixman_fixed_to_double(matrix[1][1]);
+
+        /* length of each column of the 2x2 linear part */
+        *scale_x = pixman_double_to_fixed(sqrt(a*a + c*c));
+        *scale_y = pixman_double_to_fixed(sqrt(b*b + d*d));
+
+#endif
+    } else {
+#if 0
+        /*
+                               /              2               2              2        \1/2
+                               |         width  ((d w  - y0 f)  (c w - y0 e))          |
+                               |-------------------------------------------------------|
+                               |                                                    2  |
+                               \ (a d w - a y0 f - c b w + c x0 f + e b y0 - e x0 d)   /
+
+                               /              2               2              2        \1/2
+                               |        height  ((b w  - x0 f)  (a w - x0 e))          |
+                               |-------------------------------------------------------|
+                               |                                                    2  |
+                               \ (a d w - a y0 f - c b w + c x0 f + e b y0 - e x0 d)   /
+
+
+        */
+        /* this isn't too nice and hasn't been tested at all. floating point is used to help readability */
+        double a  = matrix[0][0];
+        double b  = matrix[0][1];
+        double x0 = matrix[0][2];
+        double c  = matrix[1][0];
+        double d  = matrix[1][1];
+        double y0 = matrix[1][2];
+        double e  = matrix[2][0];
+        double f  = matrix[2][1];
+        double w  = matrix[2][2];
+
+        double det = (a*d*w - a*y0*f - c*b*w + c*x0*f + e*b*y0 - e*x0*d);
+        double topx = (pow(d*w - y0*f, 2.)+pow(c*w - y0*e, 2.))*width*width;
+        double topy = (pow(b*w - x0*f, 2.)+pow(a*w - x0*e, 2.))*height*height;
+        double distx = sqrt(topx/(det*det));
+        double disty = sqrt(topy/(det*det));
+        *scaled_width  = pixman_double_to_fixed(distx);
+        *scaled_height = pixman_double_to_fixed(disty);
+#endif
+    }
+}
+
+
+//XXX: could be called pixman_get_scaled_size()...
+/*
+ * Compute the size a width x height image has after `transform` is applied,
+ * writing the result to *scaled_width and *scaled_height.
+ *
+ * Affine transforms (third row equal to 0, 0, 1): in the active branch the
+ * results are round(distx) and round(disty) using the formulas below, with
+ * a, b the first two entries of row 0 and c, d the first two entries of
+ * row 1.
+ *
+ * NOTE(review): the non-affine branch is marked untested by its author.  It
+ * reads the raw matrix entries without pixman_fixed_to_double() and stores
+ * its results through pixman_double_to_fixed(), whereas the affine branch
+ * stores round() values -- confirm the two branches are meant to agree.
+ * Nothing guards against det == 0 in either branch.
+ */
+void
+PIXMAN_EXPORT pixman_get_scaled_size (pixman_transform_t *transform,
+        int width, int height,
+        int *scaled_width, int *scaled_height)
+{
+    pixman_fixed_t (*matrix)[3] = transform->matrix;
+/*
+                            /     2   2    2 \1/2
+                            |width  (d  + c )|
+                distx :=    |----------------|
+                            |             2  |
+                            \  (a d - c b)   /
+
+                            /      2   2    2 \1/2
+                            |height  (b  + a )|
+                disty :=    |-----------------|
+                            |             2   |
+                            \  (a d - c b)    /
+
+   The same formulas on one line each:
+
+                distx := sqrt(width^2  * (d^2 + c^2) / (a d - c b)^2)
+                disty := sqrt(height^2 * (b^2 + a^2) / (a d - c b)^2)
+*/
+    /* affine means the projective row is exactly (0, 0, 1) */
+    pixman_bool_t affine = matrix[2][0] == 0 && matrix[2][1] == 0 && matrix[2][2] == pixman_fixed_1;
+    if (affine) {
+#if fixed
+        pixman_fixed_t det = mul_fixed(matrix[0][0],matrix[1][1]) - mul_fixed(matrix[0][1],matrix[1][0]);
+
+        pixman_fixed_t topx = square_fixed(width) *(square_fixed(matrix[1][1]) + square_fixed(matrix[1][0]));
+        pixman_fixed_t topy = square_fixed(height)*(square_fixed(matrix[0][0]) + square_fixed(matrix[0][1]));
+
+        *scaled_width  = sqrt_fixed(div_fixed(topx, square_fixed(det)));
+        *scaled_height = sqrt_fixed(div_fixed(topy, square_fixed(det)));
+#else
+        double a  = pixman_fixed_to_double(matrix[0][0]);
+        double b  = pixman_fixed_to_double(matrix[0][1]);
+        double c  = pixman_fixed_to_double(matrix[1][0]);
+        double d  = pixman_fixed_to_double(matrix[1][1]);
+
+        /* determinant of the 2x2 linear part */
+        double det = (a*d - c*b);
+        double topx = (d*d + c*c)*width*width;
+        double topy = (b*b + a*a)*height*height;
+        double distx = sqrt(topx/(det*det));
+        double disty = sqrt(topy/(det*det));
+        //printf("%f %f\n", distx, disty);
+        /* results are rounded to whole numbers */
+        *scaled_width  = round(distx);
+        *scaled_height = round(disty);
+#endif
+
+    } else {
+        /*
+                               /              2               2              2        \1/2
+                               |         width  ((d w  - y0 f)  (c w - y0 e))          |
+                               |-------------------------------------------------------|
+                               |                                                    2  |
+                               \ (a d w - a y0 f - c b w + c x0 f + e b y0 - e x0 d)   /
+
+                               /              2               2              2        \1/2
+                               |        height  ((b w  - x0 f)  (a w - x0 e))          |
+                               |-------------------------------------------------------|
+                               |                                                    2  |
+                               \ (a d w - a y0 f - c b w + c x0 f + e b y0 - e x0 d)   /
+
+
+        */
+        /* this isn't too nice and hasn't been tested at all. floating point is used to help readability */
+        double a  = matrix[0][0];
+        double b  = matrix[0][1];
+        double x0 = matrix[0][2];
+        double c  = matrix[1][0];
+        double d  = matrix[1][1];
+        double y0 = matrix[1][2];
+        double e  = matrix[2][0];
+        double f  = matrix[2][1];
+        double w  = matrix[2][2];
+
+        /* determinant of the full 3x3 matrix */
+        double det = (a*d*w - a*y0*f - c*b*w + c*x0*f + e*b*y0 - e*x0*d);
+        double topx = (pow(d*w - y0*f, 2.)+pow(c*w - y0*e, 2.))*width*width;
+        double topy = (pow(b*w - x0*f, 2.)+pow(a*w - x0*e, 2.))*height*height;
+        double distx = sqrt(topx/(det*det));
+        double disty = sqrt(topy/(det*det));
+        *scaled_width  = pixman_double_to_fixed(distx);
+        *scaled_height = pixman_double_to_fixed(disty);
+    }
+}
+
 #if defined(USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 __attribute__((__force_align_arg_pointer__))
 #endif
diff --git a/pixman/pixman.h b/pixman/pixman.h
index 7ae425c..ccc2abb 100644
--- a/pixman/pixman.h
+++ b/pixman/pixman.h
@@ -765,6 +765,7 @@ pixman_bool_t   pixman_image_set_filter              (pixman_image_t
 						      int                           n_filter_params);
 void		pixman_image_set_source_clipping     (pixman_image_t		   *image,
 						      pixman_bool_t                 source_clipping);
+pixman_bool_t	pixman_image_has_source_clipping     (pixman_image_t		   *image);
 void            pixman_image_set_alpha_map           (pixman_image_t               *image,
 						      pixman_image_t               *alpha_map,
 						      int16_t                       x,
diff --git a/test/Makefile.am b/test/Makefile.am
index cb8a5ef..a284922 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -4,6 +4,9 @@ INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
 TESTPROGRAMS =			\
 	region-test		\
 	scaling-test		\
+	extract-scale		\
+	rescale-test		\
+	scale-out-test		\
 	fetch-test		\
 	trap-crasher
 
@@ -11,6 +14,9 @@ fetch_test_LDADD = $(TEST_LDADD)
 region_test_LDADD = $(TEST_LDADD)
 scaling_test_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
+extract_scale_LDADD = $(TEST_LDADD)
+rescale_test_LDADD = $(TEST_LDADD)
+scale_out_test_LDADD = $(TEST_LDADD)
 
 # GTK using test programs
 
diff --git a/test/minpng.h b/test/minpng.h
new file mode 100644
index 0000000..c6b9cfb
--- /dev/null
+++ b/test/minpng.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright © 2009 Jeff Muizelaar
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Jeff Muizelaar not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Jeff Muizelaar makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * JEFF MUIZELAAR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL JEFF MUIZELAAR
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+/* PNG specification Annex D */
+
+/* Lookup table: CRC remainder for every possible 8-bit message. */
+unsigned long crc_table[256];
+
+/* Nonzero once crc_table has been filled in by make_crc_table(). */
+int crc_table_computed = 0;
+
+/* Fill crc_table with the remainder of each byte value, processed bit by
+ * bit with the reflected polynomial 0xedb88320. */
+void make_crc_table(void)
+{
+	int byte;
+
+	for (byte = 0; byte < 256; byte++) {
+		unsigned long rem = (unsigned long) byte;
+		int bits_left = 8;
+
+		while (bits_left-- > 0)
+			rem = (rem & 1) ? (0xedb88320L ^ (rem >> 1)) : (rem >> 1);
+		crc_table[byte] = rem;
+	}
+	crc_table_computed = 1;
+}
+
+
+/* Feed buf[0..len-1] into the running CRC `crc` and return the new running
+   value.  The running CRC starts as all 1's and the value that is
+   transmitted is the 1's complement of the final running CRC (this is
+   what crc() below does).  The table is built on first use. */
+
+unsigned long update_crc(unsigned long crc, unsigned char *buf,
+		int len)
+{
+	int i = 0;
+
+	if (!crc_table_computed)
+		make_crc_table();
+	while (i < len) {
+		crc = crc_table[(crc ^ buf[i]) & 0xff] ^ (crc >> 8);
+		i++;
+	}
+	return crc;
+}
+
+/* One-shot CRC of the bytes buf[0..len-1]. */
+unsigned long crc(unsigned char *buf, int len)
+{
+	unsigned long running = update_crc(0xffffffffL, buf, len);
+
+	return running ^ 0xffffffffL;
+}
+
+/* Store the low 32 bits of `a` at buf[0..3], most significant byte first,
+ * and return a pointer just past the four bytes written. */
+char *write_be32(char *buf, int a)
+{
+	int shift;
+
+	for (shift = 24; shift >= 0; shift -= 8)
+		*buf++ = (a >> shift) & 0xff;
+	return buf;
+}
+
+/* A byte buffer passed and returned by value by the helpers below. */
+struct buf
+{
+	char *data;	/* storage obtained from malloc/realloc by the helpers, or NULL */
+	int len;	/* number of bytes in data */
+};
+
+/*
+ * Wrap payload `q` into a chunk laid out as:
+ *     4-byte big-endian payload length, the 4 bytes of `type`, the payload,
+ *     4-byte big-endian CRC computed over type + payload.
+ *
+ * Takes ownership of q.data and frees it.  The returned buffer is newly
+ * allocated and owned by the caller.  Aborts if the allocation fails.
+ */
+struct buf chunk(char *type, struct buf q)
+{
+	struct buf b;
+	/* the length has to be known before the allocation is sized */
+	int len = q.len;
+	char *t = malloc(4 + 4 + len + 4);
+	char *crc_start;
+
+	if (!t)
+		abort();
+	b.data = t;
+	t = write_be32(t, len);
+	crc_start = t;	/* the CRC covers everything after the length field */
+
+	t[0] = type[0]; t[1] = type[1]; t[2] = type[2]; t[3] = type[3];
+	t = &t[4];
+
+	/* an empty payload may carry a NULL data pointer; do not hand it to memcpy */
+	if (len > 0)
+		memcpy(t, q.data, len);
+	t += len;
+	free(q.data);
+
+	t = write_be32(t, crc((unsigned char *)crc_start, len + 4));
+
+	b.len = 4 + 4 + len + 4;
+	return b;
+}
+
+/* Append the `len` bytes at `d` to `b` and return the grown buffer.
+ * b.data is passed to realloc, so `b` must not be used after the call. */
+struct buf buf_cat_str(struct buf b, char *d, int len)
+{
+	char *grown = realloc(b.data, b.len + len);
+	struct buf out;
+
+	memcpy(grown + b.len, d, len);
+	out.data = grown;
+	out.len = b.len + len;
+	return out;
+}
+
+/*
+ * Append `len` bytes of pixel data from `d` to `b`, reordering each 32-bit
+ * pixel from the packed value 0xAARRGGBB into the byte sequence R, G, B, A.
+ *
+ * `d` is read as an array of unsigned int, four bytes being consumed per
+ * pixel.  b.data is passed to realloc, so `b` must not be used after the
+ * call.
+ */
+struct buf buf_cat_str_argb(struct buf b, char *d, int len)
+{
+	struct buf r;
+	r.data = realloc(b.data, b.len + len);
+	char *dest = r.data + b.len;
+	unsigned int *src = (unsigned int *)d;
+	r.len = b.len + len;
+	/* only whole pixels are converted; testing for >= 4 instead of != 0
+	 * keeps the loop from stepping past zero (and never terminating) when
+	 * len is not a multiple of 4 */
+	while (len >= 4) {
+		char alpha = (*src >> 24) & 0xff;
+		char red   = (*src >> 16) & 0xff;
+		char green = (*src >> 8) & 0xff;
+		char blue  = (*src >> 0) & 0xff;
+		*dest++ = red;
+		*dest++ = green;
+		*dest++ = blue;
+		*dest++ = alpha;
+		len-=4;
+		src++;
+	}
+	return r;
+}
+
+/*
+ * Append the contents of `c` to `b` and return the grown buffer.
+ *
+ * Both arguments are consumed: b.data is reallocated and c.data is freed
+ * once it has been copied.  Every caller in this file passes a freshly built
+ * temporary as `c`, which used to be leaked.
+ */
+struct buf buf_cat(struct buf b, struct buf c)
+{
+	struct buf r = buf_cat_str(b, c.data, c.len);
+
+	free(c.data);
+	return r;
+}
+
+/* Build a new 4-byte buffer holding `a` in big-endian byte order. */
+struct buf be32(int a)
+{
+	struct buf word;
+
+	word.len = 4;
+	word.data = malloc(word.len);
+	write_be32(word.data, a);
+	return word;
+}
+
+#define MOD_ADLER 65521
+/* From Wikipedia */
+/* Adler-32 checksum of the `len` bytes at `data`: two running sums, both
+ * kept modulo MOD_ADLER, packed as (second sum << 16) | first sum. */
+uint32_t adler32(uint8_t *data, size_t len)
+{
+	uint32_t low = 1;	/* 1 + sum of all bytes */
+	uint32_t high = 0;	/* sum of the successive values of low */
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		low = (low + data[i]) % MOD_ADLER;
+		high = (high + low) % MOD_ADLER;
+	}
+
+	return (high << 16) | low;
+}
+
+/* Build a new 4-byte buffer: the low 16 bits of `length` in little-endian
+ * order, followed by the low 16 bits of its one's complement, also in
+ * little-endian order. */
+struct buf zlib_block_length(int length)
+{
+	const int inverted = ~length;
+	struct buf hdr;
+
+	hdr.len = 4;
+	hdr.data = malloc(4);
+	hdr.data[0] = length & 0xff;
+	hdr.data[1] = (length >> 8) & 0xff;
+	hdr.data[2] = inverted & 0xff;
+	hdr.data[3] = (inverted >> 8) & 0xff;
+	return hdr;
+}
+
+/* inspired by "A use for uncompressed PNGs"
+ * http://drj11.wordpress.com/2007/11/20/a-use-for-uncompressed-pngs/
+ * by David Jones */
+/*
+ * Build an uncompressed PNG file in memory from width x height 32-bit
+ * pixels at `d` (rows packed back to back, width*4 bytes each).
+ *
+ * Output layout: the PNG signature, an IHDR chunk (width, height, then the
+ * bytes 8, 6, 0, 0, 0), one IDAT chunk and an empty IEND chunk.  The IDAT
+ * payload is the two-byte zlib prefix followed by one block per image row
+ * (prefix byte 0x01 for the last row, 0x00 otherwise; a length/~length
+ * header; the predictor byte 0; the row converted by buf_cat_str_argb) and
+ * finally the Adler-32 of all predictor and row bytes.
+ *
+ * A row, including its predictor byte, has to fit the 16-bit block length,
+ * which is asserted.  The caller owns the returned buffer.
+ */
+struct buf make_png(char *d, int width, int height)
+{
+	struct buf r = {};
+	char predictor[] = {0x0};
+	char zlib_prefix[] = {0x78,0x9c};
+	char zlib_final_block_prefix[] = {0x01};
+	char zlib_block_prefix[] = {0x00};
+	char hdr_tail[] = {0x08,0x06,0x00,0x00,0x00};
+	char png_start[] = {0x89,'P','N','G','\r','\n',0x1A,'\n'};
+	r = buf_cat_str(r, png_start, sizeof(png_start));
+	struct buf ihdr = {};
+	int block_length = (width*4 + 1);
+	assert(block_length <= 65535);
+	ihdr = buf_cat(ihdr, be32(width));
+	ihdr = buf_cat(ihdr, be32(height));
+	ihdr = buf_cat_str(ihdr, hdr_tail, sizeof(hdr_tail));
+
+	r = buf_cat(r, chunk("IHDR", ihdr));
+
+	/* idat is the stream that is written out; data mirrors only the
+	 * predictor and pixel bytes so they can be checksummed afterwards */
+	struct buf idat = {};
+	struct buf data = {};
+	idat = buf_cat_str(idat, zlib_prefix, sizeof(zlib_prefix));
+
+	int i;
+	for (i=0; i<height; i++) {
+		if (i == height - 1)
+			idat = buf_cat_str(idat, zlib_final_block_prefix, sizeof(zlib_final_block_prefix));
+		else
+			idat = buf_cat_str(idat, zlib_block_prefix, sizeof(zlib_block_prefix));
+
+		idat = buf_cat(idat, zlib_block_length(block_length));
+		data = buf_cat_str(data, predictor, sizeof(predictor));
+		idat = buf_cat_str(idat, predictor, sizeof(predictor));
+		data = buf_cat_str_argb(data, d, width * 4);
+		idat = buf_cat_str_argb(idat, d, width * 4);
+		d += width*4;
+	}
+	idat = buf_cat(idat, be32(adler32((unsigned char *)data.data, data.len)));
+	/* the mirror copy was needed for the checksum only */
+	free(data.data);
+
+	r = buf_cat(r, chunk("IDAT", idat));
+
+	struct buf iend = {};
+	r = buf_cat(r, chunk("IEND", iend));
+	return r;
+}
+
+
+/*
+ * Encode width x height 32-bit pixels at `d` with make_png() and write the
+ * result to the file `name`, replacing any existing content.
+ *
+ * The file is opened in binary mode so the bytes are written unmodified on
+ * every platform.  Failures to open, write or close the file are reported
+ * with perror(); nothing is encoded if the file cannot be opened.
+ */
+void write_png(const char *name, char *d, int width, int height) {
+	struct buf png;
+	FILE *f = fopen(name, "wb");
+
+	if (!f) {
+		perror(name);
+		return;
+	}
+	png = make_png(d, width, height);
+	if (fwrite(png.data, png.len, 1, f) != 1)
+		perror(name);
+	/* the encoded image is not needed once it is on disk */
+	free(png.data);
+	if (fclose(f) != 0)
+		perror(name);
+}
diff --git a/test/rescale-test.c b/test/rescale-test.c
new file mode 100644
index 0000000..8fddb64
--- /dev/null
+++ b/test/rescale-test.c
@@ -0,0 +1,123 @@
+#include <stdlib.h>
+#include <time.h>
+#include <stdio.h>
+#include <sched.h>
+#include <sys/time.h>
+#include <math.h>
+
+#include "pixman.h"
+#include "eng.h"
+#include "minpng.h"
+
+int downscale_lanczos_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride);
+
+#define WIDTH 400
+#define HEIGHT 400
+
+int randint(int min, int max) /* pseudo-random integer in [min, max] (both inclusive) drawn from random(); callers must pass min <= max */
+{
+    int q = min + random() % (max-min+1);
+    assert(q >= min && q <= max);
+    return q; /* the value was computed but never returned, so callers read an indeterminate result */
+}
+
+int main() /* randomized cross-check: runs three downscalers on identical random parameters and asserts their output checksums agree */
+{
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img; /* created below but not passed to any filter in this function */
+    int i, j; /* NOTE(review): j is never used */
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        src[i] = 0xff7f0000; /* every source pixel gets the same value (red) */
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue background, restored before each filter run */
+
+    int orig_width = WIDTH;
+    int orig_height = HEIGHT;
+    src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+            // dimensions of the source image (equal to WIDTH x HEIGHT in this test)
+            orig_width, orig_height,
+            src,
+            WIDTH * 4);
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+            WIDTH, HEIGHT,
+            dest,
+            WIDTH * 4);
+
+    int k;
+    for (k=0; k<900; k++) { /* 900 random trials */
+        int scaled_width = randint(1, WIDTH); /* passed as scaledWidth */
+        int scaled_height = randint(1, HEIGHT); /* passed as scaledHeight */
+        int x = randint(0, scaled_width-1); /* passed as startColumn */
+        int y = randint(0, scaled_height-1); /* passed as startRow */
+        // keep these above 0 for now
+        int width = randint(1, scaled_width - x);
+        int height = randint(1, scaled_height - y);
+        // Assuming randint yields a value inside its [min, max] range, the
+        // bounds above guarantee x + width <= scaled_width and
+        // y + height <= scaled_height for every trial.
+    downscale_lanczos_filter(src_img,
+            orig_width, orig_height,
+		scaled_width, scaled_height,
+		x, y,
+		width, height,
+		src, orig_width * 4,
+                dest, WIDTH * 4);
+
+    //write_png("1.png", dest, WIDTH, HEIGHT);
+    int lan_sum = sum(dest, WIDTH * HEIGHT * 4); /* sum() is not declared in this file; it is given dest and its size in bytes */
+    //printf("%x\n", lan_sum);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue */
+#if 0 /* reference only: the prototype the call below is expected to match */
+int downscale_box_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+#endif
+    downscale_box_filter(src_img,
+            orig_width, orig_height,
+		scaled_width, scaled_height,
+		x, y,
+		width, height,
+		src, orig_width * 4,
+                dest, WIDTH * 4);
+
+    int box_sum = sum(dest, WIDTH * HEIGHT * 4);
+    //printf("%x\n", box_sum);
+    //write_png("2.png", dest, WIDTH, HEIGHT);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue */
+
+    downscale_box_mult_filter(src_img,
+            orig_width, orig_height,
+		scaled_width, scaled_height,
+		x, y,
+		width, height,
+		src, orig_width * 4,
+                dest, WIDTH * 4);
+
+    int mult_sum = sum(dest, WIDTH * HEIGHT * 4);
+    //printf("%x\n", mult_sum);
+    //write_png("3.png", dest, WIDTH, HEIGHT);
+        for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue again, so the next trial's first filter starts from the same background */
+
+        assert(mult_sum == lan_sum && box_sum == lan_sum); /* all three filters must yield the same checksum */
+
+    }
+
+}
diff --git a/test/scale-out-test.c b/test/scale-out-test.c
new file mode 100644
index 0000000..881d27e
--- /dev/null
+++ b/test/scale-out-test.c
@@ -0,0 +1,139 @@
+#include <stdlib.h>
+#include <time.h>
+#include <stdio.h>
+#include <sched.h>
+#include <sys/time.h>
+#include <math.h>
+
+#include "pixman.h"
+#include "eng.h"
+#include "minpng.h"
+
+int downscale_lanczos_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride);
+
+#define WIDTH 400
+#define HEIGHT 400
+
+int randint(int min, int max) /* pseudo-random integer in [min, max] (both inclusive) drawn from random(); callers must pass min <= max */
+{
+    int q = min + random() % (max-min+1);
+    assert(q >= min && q <= max);
+    return q; /* the value was computed but never returned, so callers read an indeterminate result */
+}
+
+//#include "in.h"
+extern char input[];
+/* Timing harness: runs each downscaler once for a 1024x768 -> 200x200 request, printing the clock() ticks
+ * taken and, for the first three, a checksum of dest plus a PNG dump (1.png, 2.png, 3.png). */
+int main()
+{
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i;
+
+    if (src == NULL || dest == NULL) return 1; /* the fill loops below would otherwise write through NULL */
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        src[i] = 0xff7f0000; /* red */
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue */
+
+    int orig_width = 1024;
+    int orig_height = 768;
+    src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+            orig_width, orig_height,
+            input,
+            orig_width * 4);
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+            WIDTH, HEIGHT,
+            dest,
+            WIDTH * 4);
+
+    int scaled_width = 200;
+    int scaled_height = 200;
+    int x = 0;
+    int y = 0;
+    int width = scaled_width; /* ask for the whole scaled image: start at (0, 0), full scaled size */
+    int height = scaled_height;
+    clock_t start, end;
+
+    /* NOTE(review): src_img wraps `input` (1024x768), yet every filter call below is handed `src`,
+     * a WIDTH x HEIGHT buffer, together with the 1024x768 size and stride. If the filters read
+     * through that pointer they run past the allocation -- confirm which source they use. */
+    start = clock();
+    downscale_lanczos_filter(src_img,
+            orig_width, orig_height,
+            scaled_width, scaled_height,
+            x, y,
+            width, height,
+            src, orig_width * 4,
+            dest, WIDTH * 4);
+    end = clock();
+    printf("t: %ld\n", (long)(end - start)); /* clock_t is not int, so "%d" was a format mismatch */
+
+    write_png("1.png", (char *)dest, WIDTH, HEIGHT); /* write_png takes char *, dest is uint32_t * */
+    int lan_sum = sum(dest, WIDTH * HEIGHT * 4);
+    printf("%x\n", lan_sum);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue */
+#if 0
+int downscale_box_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+#endif
+    start = clock();
+    downscale_box_filter(src_img,
+            orig_width, orig_height,
+            scaled_width, scaled_height,
+            x, y,
+            width, height,
+            src, orig_width * 4,
+            dest, WIDTH * 4);
+    end = clock();
+    printf("t: %ld\n", (long)(end - start));
+
+    int box_sum = sum(dest, WIDTH * HEIGHT * 4);
+    printf("%x\n", box_sum);
+    write_png("2.png", (char *)dest, WIDTH, HEIGHT);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+        dest[i] = 0xff0000ff; /* blue */
+
+    start = clock();
+    downscale_box_mult_filter(src_img,
+            orig_width, orig_height,
+            scaled_width, scaled_height,
+            x, y,
+            width, height,
+            src, orig_width * 4,
+            dest, WIDTH * 4);
+    end = clock();
+    printf("t: %ld\n", (long)(end - start));
+    int mult_sum = sum(dest, WIDTH * HEIGHT * 4);
+    printf("%x\n", mult_sum);
+    write_png("3.png", (char *)dest, WIDTH, HEIGHT);
+
+    start = clock(); /* the old multiply variant is only timed: dest is not reset, summed or dumped for it */
+    downscale_box_mult_old_filter(src_img,
+            orig_width, orig_height,
+            scaled_width, scaled_height,
+            x, y,
+            width, height,
+            src, orig_width * 4,
+            dest, WIDTH * 4);
+    end = clock();
+    printf("t: %ld\n", (long)(end - start));
+}