[cairo] [RFC] high-quality pixman image downscaling

Jeff Muizelaar jeff at infidigm.net
Fri Jun 26 10:06:30 PDT 2009


The attached patch is my work in progress for adding higher quality image
downscaling to pixman. The basic idea is to create a downscaled copy of a
pixman_image_t. This downscaled copy can then be substituted for the original
and passed through the pixman_image_composite path to get the full range of affine
transformations and composition modes. It also allows a scaled copy of the
image to be cached to avoid the costly downscaling operation when the image is
drawn again at the same scale. As a reference point, this is the approach
quartz takes to high quality downscaling.

It includes three different downscalers of increasing quality and
decreasing speed.

The first is a simple integer box filter. The inner loop of this
downscaler has one add per component per source pixel and one
multiplication per component per destination pixel. The number of pixels
summed alternates between two different values.

The second (pixman-rescale-mult) is a constant non-integer sized box filter. The inner loop
of this multiplies each source pixel by a coefficient, sums all the
results and then shifts to get the resulting destination pixel.

The final one is a lanczos2 filter. This one uses twice as many source
pixels per destination pixel as rescale-mult. Like rescale-mult it
multiplies, sums and shifts the source pixels for each destination
pixel. This one also has an SSE2 version of the inner loop.

I'm most interested in comments on the interface between pixman and the
scalers.

The entry point to the scalers is:

downscale_image(pixman_image_t *pict, enum downscale_type type,
                int scaled_width, int scaled_height,
                int start_column, int start_row,
                int width, int height,
                uint32_t *dest, int dst_stride)

This function will downscale a pixman_image_t into an ARGB32 buffer (dest).

 type is one of (BOX, MULT, LANCZOS)

 scaled_width / original_width is the scale ratio. Using scaled_width instead of
 just passing in the ratio avoids any rounding errors because the ratio can be
 specified exactly
 
 start_column, start_row, width and height specify a rectangle in the scaled
 source space.
 
 dest, dst_stride, width, and height allow specifying a rectangle in the
 destination buffer.

The interface to each of the scalers looks something like:

downscale_*_filter(
        const struct rescale_fetcher *fetcher, void *closure,
        int orig_width, int orig_height,
	int scaled_width, int scaled_height,
	int start_column, int start_row,
	int width, int height,
	uint32_t *dest, int dstStride);

This interface shares many of the same parameters as above. The key addition is
the fetcher parameter. This is meant to be a generic interface for the scalers
to get scanlines from the source. It allows the fetcher to convert and copy a
scanline to a temporary buffer or return a pointer to the source pixels
directly without making a copy.

Thoughts, comments and questions all welcome,

-Jeff

-------------- next part --------------
diff --git a/pixman/pixman-rescale.c b/pixman/pixman-rescale.c
new file mode 100644
index 0000000..06d5197
--- /dev/null
+++ b/pixman/pixman-rescale.c
@@ -0,0 +1,121 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+/* Fetcher documentation:
+ * Goals: we need a generic interface to fetch lines of ARGB32
+ * - we want to support converting to argb32
+ * - we also want to support having the rescaler read directly from the buffer
+ *   if the buffer is of a format compatible with the rescaler */
+
+static void fetch_convert(void *closure, int y, uint32_t **scanline_ptr)
+{
+    pixman_image_t *pict = closure;
+    uint32_t *scanline = *scanline_ptr;
+    pict->bits.fetch_scanline_raw_32(pict, 0, y, pict->bits.width, scanline, NULL, 0);
+
+}
+
+static uint32_t *fetch_convert_init(void *closure)
+{
+    pixman_image_t *pict = closure;
+    return pixman_malloc_abc (pict->bits.width, 1, sizeof(uint32_t));
+}
+
+static void fetch_convert_fini(void *closure, uint32_t *scanline)
+{
+    free(scanline);
+}
+
+
+static void fetch_raw(void *closure, int y, uint32_t **scanline)
+{
+    pixman_image_t *pict = closure;
+    bits_image_t *bits = &pict->bits;
+    /* set the scanline to the beginning of the yth row */
+    *scanline = bits->bits + y*bits->rowstride;
+}
+
+static uint32_t *fetch_raw_init(void *closure)
+{
+    return NULL;
+}
+
+static void fetch_raw_fini(void *closure, uint32_t *scanline)
+{
+}
+
+const struct rescale_fetcher fetcher_convert = {
+    fetch_convert,
+    fetch_convert_init,
+    fetch_convert_fini
+};
+
+const struct rescale_fetcher fetcher_raw = {
+    fetch_raw,
+    fetch_raw_init,
+    fetch_raw_fini
+};
+
+/* downscale a pixman_image_t into an ARGB32 buffer
+ *
+ * scaled_width / original_width is the scale ratio
+ * using scaled_width instead of just passing in the ratio avoids any rounding errors because
+ * the ratio can be specified exactly
+ *
+ * start_column, start_row, width, height
+ * specify a rectangle in the scaled source space
+ *
+ * dest, dst_stride, width, and height allow specifying
+ * a rectangle in the destination buffer
+ */
+PIXMAN_EXPORT
+int downscale_image(pixman_image_t *pict, enum downscale_type type,
+		int scaled_width, int scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride)
+{
+    const struct rescale_fetcher *fetcher;
+    int orig_height = pict->bits.height;
+    int orig_width  = pict->bits.width;
+
+    /* choose a fetcher */
+    if (PIXMAN_FORMAT_BPP(pict->bits.format) == 32 &&
+            PIXMAN_FORMAT_R(pict->bits.format) == 8 &&
+            PIXMAN_FORMAT_G(pict->bits.format) == 8 &&
+            PIXMAN_FORMAT_B(pict->bits.format) == 8) {
+        fetcher = &fetcher_raw;
+    } else {
+        fetcher = &fetcher_convert;
+    }
+
+    /* choose and run downscaler */
+    switch (type) {
+        case LANCZOS:
+            return downscale_lanczos_filter(fetcher, pict, orig_width, orig_height,
+            scaled_width, scaled_height,
+            start_column, start_row,
+            width, height,
+            dest, dst_stride);
+        case BOX:
+            return downscale_box_filter(fetcher, pict, orig_width, orig_height,
+            scaled_width, scaled_height,
+            start_column, start_row,
+            width, height,
+            dest, dst_stride);
+        case MULT:
+            return downscale_box_mult_filter(fetcher, pict, orig_width, orig_height,
+            scaled_width, scaled_height,
+            start_column, start_row,
+            width, height,
+            dest, dst_stride);
+        default:
+            return -1;
+    }
+}
diff --git a/pixman/pixman-rescale.h b/pixman/pixman-rescale.h
new file mode 100644
index 0000000..8584ff7
--- /dev/null
+++ b/pixman/pixman-rescale.h
@@ -0,0 +1,44 @@
+#include <stdint.h>
+struct rescale_fetcher
+{
+    void (*fetch_scanline)(void *closure, int y, uint32_t **scanline_ptr);
+    uint32_t *(*init)(void *closure);
+    void (*fini)(void *closure, uint32_t *scanline);
+};
+
+enum downscale_type
+{
+    BOX,
+    MULT,
+    LANCZOS
+};
+
+int downscale_image(pixman_image_t *image, enum downscale_type type,
+		int scaled_width, int scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride);
+
+int downscale_box_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride);
+
+int downscale_box_mult_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride);
+
+int downscale_lanczos_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride);

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 04e96cb..76ecfbe 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -33,6 +33,11 @@ libpixman_1_la_SOURCES =			\
 	pixman-edge-imp.h			\
 	pixman-trap.c				\
 	pixman-timer.c				\
+	pixman-rescale.c			\
+	pixman-rescale-box.c			\
+	pixman-rescale-mult.c			\
+	pixman-rescale-mult-old.c		\
+	pixman-rescale-lanczos.c		\
 	pixman-matrix.c
 
 libpixmanincludedir = $(includedir)/pixman-1/
diff --git a/pixman/pixman-rescale-box.c b/pixman/pixman-rescale-box.c
new file mode 100644
index 0000000..1959c88
--- /dev/null
+++ b/pixman/pixman-rescale-box.c
@@ -0,0 +1,339 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+#define abs(a) (a) > 0 ? (a) : -(a)
+#define sign(x) ((x)>0 ? 1:-1)
+
+/*
+When box filtering with an integer sized box we only ever have two box sizes.
+Proof:
+    Assume w1 > w2.
+    r = w1/w2
+
+    The size of the two boxes is
+    r1 = ceil(r), r2 = floor(r)
+
+    we want to find non-negative integers p and q such that:
+    w1 = p*r1 + q*r2
+
+    if r1 == r2 then
+        r is an integer and thus w2 evenly divides w1
+        therefore p = w2 and q = 0
+    otherwise r1 != r2 and
+        r1 = r2 + 1
+        w1 = p*(r2 + 1) + q * r2
+           = p*r2 + q*r2 + p
+           = (p + q)*r2 + p
+
+        we then choose a value of p such that:
+            w1 = r2*w2 + p
+            thus p = w1 mod r2 XXX: this is wrong...
+                   = w1 mod w2
+            and  q = w2 - p which is > 0 because
+                            p = w1 mod w2
+
+            substituting into:
+            (p + q)*r2 + p
+            gives:
+            w1 = (p + w2 - p)*r2 + p
+            w1 = w2*r2 + w1 mod r2
+
+*/
+
+
+/* we can index on the bottom bit of the divisor.
+ * 
+ * we could also do a multiply per pixel and accumulate precision
+*/
+void downsample_row_box_filter(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                long dx2, long e, long src_dx)
+{
+    int di = 0;
+
+    /* the filter sums only two different counts of samples.
+     * we compute the inverses for these two different counts
+     * so that we can do multiplications instead of divisions */
+    int div_inv[2];
+    int div = src_dx/dx2;
+    div_inv[div & 1] = (1<<24)/div;
+    div += 1;
+    div_inv[div & 1] = (1<<24)/div;
+
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int count = 1;
+        a = (*src >> 24) & 0xff;
+        r = (*src >> 16) & 0xff;
+        g = (*src >>  8) & 0xff;
+        b = (*src >>  0) & 0xff;
+        e -= dx2;
+        src++;
+        while (e > 0) {
+            e -= dx2;
+            a += (*src >> 24) & 0xff;
+            r += (*src >> 16) & 0xff;
+            g += (*src >>  8) & 0xff;
+            b += (*src >>  0) & 0xff;
+            count++;
+            src++;
+        }
+#if 1
+        int div = div_inv[count & 1];
+
+        a = (a * div + 0x10000) >> 24;
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+
+        //XXX counts seem high...
+#else
+        a /= count;
+        r /= count;
+        g /= count;
+        b /= count;
+#endif
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+
+        e += src_dx;
+    }
+}
+
+void downsample_row_box_filter_sse2(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                long dx2, long e, long src_dx)
+{
+#if 0
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int count = 1;
+        a = (*src >> 24) & 0xff;
+        r = (*src >> 16) & 0xff;
+        g = (*src >>  8) & 0xff;
+        b = (*src >>  0) & 0xff;
+        while (e > 0) {
+            e -= dx2;
+            a += (*src >> 24) & 0xff;
+            r += (*src >> 16) & 0xff;
+            g += (*src >>  8) & 0xff;
+            b += (*src >>  0) & 0xff;
+            count++;
+            src++;
+        }
+        //XXX counts seem high...
+        a /= count;
+        r /= count;
+        g /= count;
+        b /= count;
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+
+        e += src_dx;
+    }
+#endif
+
+}
+
+
+void downsample_columns_box_filter(
+        int n,
+        int columns,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n;
+    int div = (1<<24)/columns;
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int h;
+        uint32_t *column_src = src;
+        a = (*column_src >> 24) & 0xff;
+        r = (*column_src >> 16) & 0xff;
+        g = (*column_src >>  8) & 0xff;
+        b = (*column_src >>  0) & 0xff;
+        for (h=1; h<columns; h++) {
+            a += (*column_src >> 24) & 0xff;
+            r += (*column_src >> 16) & 0xff;
+            g += (*column_src >>  8) & 0xff;
+            b += (*column_src >>  0) & 0xff;
+            column_src += stride;
+        }
+#if 1
+        a = (a * div + 0x10000) >> 24;
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+#else
+        a /= columns;
+        r /= columns;
+        g /= columns ;
+        b /= columns;
+#endif
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src++;
+    }
+}
+
+#define ROUND
+//XXX:
+PIXMAN_EXPORT
+int downscale_box_filter(const struct rescale_fetcher *fetcher, void *closure, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride)
+{
+   // printf("%d %d %d %d\n", scaledWidth, scaledHeight, origWidth, origHeight);
+        long xs2, ys2, xd2, yd2;
+        long yd1 = 0, ys1 = 0;
+        long xd1 = 0, xs1 = 0;
+	xs2 = origWidth ;
+	ys2 = origHeight ;
+	//xd2 = abs(scaledWidth) - 1;
+	xd2 = abs(scaledWidth) ;
+	yd2 = abs(scaledHeight) ;
+	long e,d,dx2;
+	short sy;
+	long dest_dy = abs(yd2);
+	long src_dy = abs(ys2);
+        if (scaledHeight < 0) {
+            ys1 = origHeight - ((src_dy<<1)+dest_dy)/(dest_dy << 1); // e = dy*2 - dx
+            sy = -1;
+        }
+	e = (src_dy<<1)-dest_dy; // e = dy*2 - dx
+	dx2 = dest_dy<<1; // dx2 = dx *2
+	src_dy <<= 1; // dy *= 2
+
+        long dest_dx,src_dx,ex,dx2x;
+	short src_sx;
+	dest_dx = abs(xd2);
+	src_dx = abs(xs2);
+        int src_direction = 1;
+#ifdef ROUND
+	ex = (src_dx<<1)-dest_dx;
+	dx2x = dest_dx<<1; // multiplying by 2 is for rounding purposes
+	src_dx <<= 1;
+#else
+        dx2x = dest_dx;
+        ex = src_dx - dest_dx;
+#endif
+
+        /* Compute the src_offset and associated error value:
+         * There are two ways to do this: */
+        /* 1: Directly */
+        int nsrc_offset = (src_dx*startColumn + dest_dx)*src_sx/(dest_dx*2);
+        long nex = (src_dx*startColumn + dest_dx)%(dest_dx*2) - dest_dx*2 + src_dx;
+	/* 2. Iteratively */
+        int src_offset = 0;
+	for (d = 0; d < startColumn; d++) {
+            //XXX: should be 'ex > 0'
+		while (ex > 0) {
+			src_offset += src_sx;
+			ex -= dx2x;
+		}
+		ex += src_dx;
+	}
+#if 0
+        if (nex != ex) {
+            printf("e: %d %d: %d %d %d\n", ex, nex, src_dx, startColumn, dest_dx);
+            assert(nex == ex);
+        }
+        if (nsrc_offset != src_offset) {
+            printf("off: %d %d: %d %d %d\n", src_offset, nsrc_offset, src_dx, startColumn, dest_dx);
+            assert(nsrc_offset == src_offset);
+        }
+
+#endif
+        //printf("src_offset %d %d %d\n", src_offset, nsrc_offset, ex);
+
+        /* we need to allocate enough room for ceil(src_height/dest_height) scanlines */
+	//XXX: I suppose we should check whether this will succeed
+        uint32_t *temp_buf = pixman_malloc_abc ((origHeight + scaledHeight-1)/scaledHeight, scaledWidth, sizeof(uint32_t));
+	if (!temp_buf)
+            return -1;
+
+        uint32_t *scanline = fetcher->init(closure);
+
+        int x = 0;
+        int y = 0;
+        /* seek to the beginning */
+        for (d = 0; d < startRow; d++)
+	{
+            while (e > 0)
+            {
+                e -= dx2;
+                y++;
+            }
+            e += src_dy;
+        }
+
+        for (d = startRow; d < startRow + height; d++)
+	{
+            int columns = 0;
+            while (e > 0)
+            {
+                fetcher->fetch_scanline(closure, y, &scanline);
+                //XXX:  we could turn these multiplications into additions
+                //XXX: add src_offset to scanline?
+                downsample_row_box_filter(width, scanline, temp_buf + width * columns,
+                        dx2x, ex, src_dx);
+
+                ys1 ++;
+                e -= dx2;
+                columns++;
+                y++;
+            }
+            downsample_columns_box_filter(width, columns, temp_buf, dest + (yd1 - startRow)*dstStride/4);
+            yd1 += 1;
+            e += src_dy;
+        }
+        fetcher->fini(closure, scanline);
+        free(temp_buf);
+        return 0;
+}
+
diff --git a/pixman/pixman-rescale-lanczos.c b/pixman/pixman-rescale-lanczos.c
new file mode 100644
index 0000000..77a6e4b
--- /dev/null
+++ b/pixman/pixman-rescale-lanczos.c
@@ -0,0 +1,500 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ *
+ * Portions derived from Chromium:
+ *
+ * Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *    * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <float.h>
+#include <math.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+
+#define FILTER_SHIFT 14
+#define LANCZOS_LOBES 2
+
+/* contains the filter for each destination pixel */
+struct filter {
+    int count; // filter size
+    int16_t *values; // filter coefficients
+    int offset; // offset
+};
+
+static int clamp(int a)
+{
+    if (a > 255)
+        return 255;
+    if (a < 0)
+        return 0;
+    return a;
+}
+
+/* from ffmpeg */
+static inline uint8_t clip_uint8(int a)
+{
+    if (a&(~255)) // check if any of the high bits are set
+        return (-a)>>31; // clip to either 0 or 255
+    else
+        return a;
+}
+
+#include <pmmintrin.h>
+#include <emmintrin.h>
+
+static void
+downsample_row_convolve_sse2(int n,
+		uint32_t *src, uint32_t *dest,
+                const struct filter *filter, int src_size)
+{
+    int pixel = 0;
+    while (n--) {
+        int i;
+        __m128i accum = _mm_setzero_si128();
+        __m128i zero = _mm_setzero_si128();
+        for (i=0; i<filter[pixel].count; i++) {
+            __m128i v_src = _mm_cvtsi32_si128(src[filter[pixel].offset + i]);
+            v_src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(v_src, zero), zero);
+
+            __m128i coeff = _mm_cvtsi32_si128(filter[pixel].values[i]);
+            /* duplicate the filter coefficient */
+            coeff = _mm_shuffle_epi32(coeff, _MM_SHUFFLE(0, 0, 0, 0));
+
+            /* multiply and accumulate the result:
+             * 0000vvvv_0000vvvv_0000vvvv_0000vvvv * 000000aa_000000rr_000000gg_000000bb */
+            __m128i result = _mm_madd_epi16(v_src, coeff);
+            accum = _mm_add_epi32(accum, result);
+        }
+
+        /* scale the accumulator down */
+        accum = _mm_srai_epi32(accum, FILTER_SHIFT);
+
+        /* pack 000000aa_000000rr_000000gg_000000bb -> 00000000_00000000_00000000_aarrggbb */
+        accum = _mm_packs_epi32(accum, accum);
+
+        //XXX: this should be needed to saturate properly, but doesn't seem to make a difference
+        accum = _mm_max_epi16(accum, zero);
+
+        accum = _mm_packus_epi16(accum, accum);
+
+        *dest = _mm_cvtsi128_si32(accum);
+        dest++;
+        pixel++;
+    }
+}
+
+static void
+downsample_row_convolve(int n,
+		uint32_t *src, uint32_t *dest,
+                struct filter *filter, int src_size)
+{
+    int pixel = 0;
+    while (n--) {
+        int32_t a = 0;
+        int32_t r = 0;
+        int32_t g = 0;
+        int32_t b = 0;
+        int i;
+
+        for (i=0; i<filter[pixel].count; i++) {
+            a += ((src[filter[pixel].offset + i] >> 24) & 0xff) * filter[pixel].values[i];
+            r += ((src[filter[pixel].offset + i] >> 16) & 0xff) * filter[pixel].values[i];
+            g += ((src[filter[pixel].offset + i] >>  8) & 0xff) * filter[pixel].values[i];
+            b += ((src[filter[pixel].offset + i] >>  0) & 0xff) * filter[pixel].values[i];
+        }
+        a >>= FILTER_SHIFT;
+        r >>= FILTER_SHIFT;
+        g >>= FILTER_SHIFT;
+        b >>= FILTER_SHIFT;
+
+        a = clamp(a);
+        r = clamp(r);
+        g = clamp(g);
+        b = clamp(b);
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        pixel++;
+    }
+}
+
+/* instead of src1, src2 we could pass:
+ * ring_buf,
+ * start_index,
+ * ring_buf_length,
+ * filter_length */
+static void
+downsample_columns_convolve(int n,
+        uint32_t *src1, int src1_length,
+        uint32_t *src2, int src2_length,
+        uint32_t *dest,
+        int16_t *filter, int filter_length)
+{
+    assert(src1_length+src2_length == filter_length);
+    assert(src1_length >= 0);
+    assert(src2_length >= 0);
+    int stride = n;
+    while (n--) {
+        int32_t a = 0;
+        int32_t r = 0;
+        int32_t g = 0;
+        int32_t b = 0;
+        int i;
+        uint32_t *column_src;
+
+        /* we do the accumulation in two steps because the source lines are in a ring buffer
+         * so won't be contiguous */
+        column_src = src1;
+        for (i=0; i<src1_length; i++) { /* loop till the end of the ring buffer */
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        /* accumulate the remaining samples starting at the beginning of the ring buffer */
+        column_src = src2;
+        for (; i<filter_length; i++) {
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        a >>= FILTER_SHIFT;
+        r >>= FILTER_SHIFT;
+        g >>= FILTER_SHIFT;
+        b >>= FILTER_SHIFT;
+
+        a = clamp(a);
+        r = clamp(r);
+        g = clamp(g);
+        b = clamp(b);
+        a = 0xff;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src1++;
+        src2++;
+    }
+}
+
+
+// Evaluates the Lanczos filter of the given filter size window for the given
+// position.
+//
+// |filter_size| is the width of the filter (the "window"), outside of which
+// the value of the function is 0. Inside of the window, the value is the
+// normalized sinc function:
+//   lanczos(x) = sinc(x) * sinc(x / filter_size);
+// where
+//   sinc(x) = sin(pi*x) / (pi*x);
+float eval_lanczos(int filter_size, float x) {
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  // Outside of the window.
+  if (x > -FLT_EPSILON &&
+      x < FLT_EPSILON)
+    return 1.0f;  // Special case the discontinuity at the origin.
+  float xpi = x * (M_PI);
+  return (sin(xpi) / xpi) *  // sinc(x)
+          sin(xpi / filter_size) / (xpi / filter_size);  // sinc(x/filter_size)
+}
+
+/* dealing with the edges:
+   some options:
+   we could always have approximately the same number of samples in the filter and just pad the image out
+   chromium seems to truncate the filter though...
+   I don't really have a good reason to choose either approach...
+   one way to get an idea is to see what other implementation do.
+   - it looks like quartz pads
+   - chromium truncates the filter
+   - opera pads
+*/
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+int floor_int(float a)
+{
+    return floor(a);
+}
+
+int ceil_int(float a)
+{
+    return ceil(a);
+}
+
+int float_to_fixed(float a)
+{
+    return a * (1 << FILTER_SHIFT);
+}
+
+/* this method does not do source clipping
+ * but will do dest clipping */
+struct filter *compute_lanczos_filter(int dest_subset_lo, int dest_subset_size, int src_size, float scale)
+{
+    // this is half the number of pixels that the filter uses in filter space
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / scale;
+
+    /* look at ResizeFilter::ResizeFilter() and ::ComputerFilters() */
+    /* we need to compute a set of filters for each pixel */
+    /* filter width */
+    int i;
+    struct filter *filter = malloc(dest_subset_size * sizeof(struct filter));
+    int max_filter_size = ceil_int(src_support * 2) - floor_int(src_support * -2) + 1;
+    float *filter_values = malloc(max_filter_size * sizeof(float));
+    int dest_subset_hi = dest_subset_lo + dest_subset_size; // [lo, hi)
+
+
+    // When we're doing a magnification, the scale will be larger than one. This
+    // means the destination pixels are much smaller than the source pixels, and
+    // that the range covered by the filter won't necessarily cover any source
+    // pixel boundaries. Therefore, we use these clamped values (max of 1) for
+    // some computations.
+    float clamped_scale = min(1., scale);
+
+    // Speed up the divisions below by turning them into multiplies.
+    float inv_scale = 1. / scale;
+
+    // Loop over all pixels in the output range. We will generate one set of
+    // filter values for each one. Those values will tell us how to blend the
+    // source pixels to compute the destination pixel.
+    int dest_subset_i;
+    int pixel = 0;
+    for (dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi; dest_subset_i++, pixel++) {
+
+        float src_pixel = dest_subset_i * inv_scale;
+
+        int src_begin = max(0, floor_int(src_pixel - src_support));
+        assert(src_begin >= 0);
+        int src_end = min(src_size - 1, ceil_int(src_pixel + src_support));
+
+        // Compute the unnormalized filter value at each location of the source
+        // it covers.
+        float filter_sum = 0.; // sum of the filter values for normalizing
+        int filter_size = 0;
+        int cur_filter_pixel;
+        int j = 0;
+        for (cur_filter_pixel = src_begin; cur_filter_pixel <= src_end;
+                cur_filter_pixel++) {
+            // Distance from the center of the filter, this is the filter coordinate
+            // in source space.
+            float src_filter_pos = cur_filter_pixel - src_pixel;
+
+            // Since the filter really exists in dest space, map it there.
+            float dest_filter_pos = src_filter_pos * clamped_scale;
+
+            // Compute the filter value at that location.
+            float filter_value = eval_lanczos(LANCZOS_LOBES, dest_filter_pos);
+            filter_sum += filter_value;
+            filter_values[j] = filter_value;
+
+            filter_size++;
+            j++;
+        }
+        if (src_end < src_begin) {
+            printf("%d %d %d (%d %d - %d)\n", src_end, src_begin, dest_subset_i, dest_subset_lo, dest_subset_hi, dest_subset_size);
+            assert(src_end >= src_begin);
+        }
+        if (filter_size > max_filter_size) {
+            printf("%f %f\n", 1./scale, (1. / scale) * src_support * 2 + 1); //XXX is this correct?
+            printf("%d %d\n", filter_size, max_filter_size);
+            assert(filter_size <= max_filter_size);
+        }
+
+        //XXX: eventually we should avoid doing malloc here and just malloc a big
+        // chunk of max_filter_size * sizeof(int16_t)
+        int16_t *fixed_filter_values = malloc(filter_size * sizeof(int16_t));
+
+        // the filter must be normalized so that we don't affect the brightness of
+        // the image. Convert to normalized fixed point
+        // XXX: It might be better if we didn't have to do this in a separate pass
+        int16_t fixed_sum = 0; // XXX: should we use a regular int here?
+        for (i=0; i<filter_size; i++) {
+            int16_t cur_fixed = float_to_fixed(filter_values[i] / filter_sum);
+            fixed_sum += cur_fixed;
+            fixed_filter_values[i] = cur_fixed;
+        }
+
+        // The conversion to fixed point will leave some rounding errors, which
+        // we add back in to avoid affecting the brightness of the image. We
+        // arbitrarily add this to the center of the filter array (this won't always
+        // be the center of the filter function since it could get clipped on the
+        // edges, but it doesn't matter enough to worry about that case).
+        int16_t leftovers = float_to_fixed(1.0f) - fixed_sum;
+        fixed_filter_values[filter_size / 2] += leftovers;
+
+        filter[pixel].values = fixed_filter_values;
+        filter[pixel].count = filter_size;
+        filter[pixel].offset = src_begin;
+        assert(filter[pixel].offset >= 0);
+        assert(filter[pixel].offset + filter[pixel].count - 1 < src_size);
+    }
+    free(filter_values);
+    return filter;
+}
+
+// startColumn and startRow are in destination space
+PIXMAN_EXPORT
+int downscale_lanczos_filter(const struct rescale_fetcher *fetcher, void *closure, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride)
+{
+    assert(startColumn + width <= scaledWidth);
+    assert(startRow + height <= scaledHeight);
+    int yd1 = 0;
+    int d;
+
+    //XXX: check what this actually needs to be
+    int lanczos_src = LANCZOS_LOBES * 2;
+
+    //XXX: this duplicates code in compute_lanczos_filter
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / ((float)scaledHeight/origHeight);
+
+    int max_filter_size = ceil_int(src_support * 2) - floor_int(src_support * -2) + 1;
+
+    int ring_buf_size = max_filter_size;
+    uint32_t *ring_buf = pixman_malloc_abc (ring_buf_size, scaledWidth, sizeof(uint32_t));
+    if (!ring_buf)
+        return -1;
+
+    //XXX: I suppose we should check whether this will succeed
+    uint32_t *scanline = fetcher->init(closure);
+
+    void (*fetch_scanline)(void *closure, int y, uint32_t **scanline) = fetcher->fetch_scanline;
+
+    struct filter *x_filter = compute_lanczos_filter(startColumn, width, origWidth, (float)scaledWidth/origWidth);
+    struct filter *y_filter = compute_lanczos_filter(startRow, height, origHeight, (float)scaledHeight/origHeight);
+
+    int index = 0;
+    int next_x_row = 0;
+    int filter_index = 0;
+    for (d = startRow; d < startRow + height; d++, filter_index++)
+    {
+        int filter_length = y_filter[filter_index].count;
+        int filter_size = y_filter[filter_index].count;
+        int filter_offset = y_filter[filter_index].offset;
+        assert(filter_offset >= 0);
+        /* read and downsample the rows needed to downsample the next column */
+        while (next_x_row < filter_offset + filter_length) {
+            fetch_scanline(closure, index, &scanline);
+            downsample_row_convolve_sse2(width, scanline, ring_buf + width * (index % ring_buf_size), x_filter, origWidth);
+            //downsample_row_convolve(width, scanline, ring_buf + width * (index % ring_buf_size), x_filter, origWidth);
+            index++;
+            next_x_row++;
+            //XXX: when index overflows this becomes bad: i.e. 0 % 5 != 8 % 5
+        }
+
+        int src1_index = filter_offset % ring_buf_size;
+        assert(src1_index >= 0);
+        assert(filter_size <= ring_buf_size);
+        uint32_t *src1 = ring_buf + width * src1_index;
+        uint32_t *src2 = ring_buf; // src2 is always at the same location
+
+        int src1_size = min(ring_buf_size - src1_index, filter_size);
+        int src2_size = filter_size - src1_size;
+        assert(src1_size >= 0);
+        assert(filter_size >= src1_size);
+
+        downsample_columns_convolve(width, src1, src1_size, src2, src2_size, dest + (yd1 - startRow)*dstStride/4, y_filter[filter_index].values, y_filter[filter_index].count);
+        yd1++;
+    }
+
+    free(ring_buf);
+    fetcher->fini(closure, scanline);
+    int i;
+    for (i=0; i<width; i++)
+        free(x_filter[i].values);
+    for (i=0; i<height; i++)
+        free(y_filter[i].values);
+    free(x_filter);
+    free(y_filter);
+
+    return 0;
+}
+
+#if 0
+            // rearrange filter coefficients in the order of the ring buffer?
+            // or we could just filter in the order of the ring buffer
+            i = 0;
+            j = start;
+            while (n) {
+                result += sample[j] * coeff[i];
+                if (j > size)
+                    j = 0;
+                i++;
+                j++;
+            }
+
+            // or rearranging the coeffs gives us
+            while (n) {
+                result += sample[i + sample*width] * coeff[i];
+            }
+            /* it also lets us move in a more orderly fashion through memory */
+            /* however I can't think of any way to get the samples to always be contiguous */
+
+            // it would be interesting to see how doing the multiplication by row and then summing all the rows compared to doing
+            // everything column wise. I expect the memory bandwidth of doing so could make things slower
+#endif
+
diff --git a/pixman/pixman-rescale-mult.c b/pixman/pixman-rescale-mult.c
new file mode 100644
index 0000000..4342e79
--- /dev/null
+++ b/pixman/pixman-rescale-mult.c
@@ -0,0 +1,442 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+/* This implements a box filter that supports non-integer box sizes */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+/* we work in fixed point where 1. == 1 << 24 */
+#define FIXED_SHIFT 24
+
#if 0
/* Disabled work-in-progress SSE2 version of downsample_row_box_filter.
 *
 * NOTE(review): this is incomplete and must stay under #if 0 — only the
 * first (partially covered) source pixel of each destination pixel goes
 * through the SIMD path, the _mm_madd_epi16 result overwrites `accum`
 * instead of accumulating into it, and the result in `accum` is never
 * packed back into a/r/g/b.  Kept for reference only.
 */
static void downsample_row_box_filter_sse2(
        int start,
		int width,
		uint32_t *src, uint32_t *dest,
                int coverage[], int pixel_coverage)
{
    /* we need an array of the pixel contribution of each destination pixel on the boundaries.
     * we invert the value to get the value on the other side of the box */
    /*

       value  = a * contribution * 1/box_size
       value += a * 1/box_size
       value += a * 1/box_size
       value += a * 1/box_size
       value += a * (1 - contribution) * 1/box_size
                a * (1/box_size - contribution * 1/box_size)

        box size is constant


       value = a * contribution_a * 1/box_size + b * contribution_b * 1/box_size
               contribution_b = (1 - contribution_a)
                              = (1 - contribution_a_next)
    */

    /* box size = ceil(src_width/dest_width) */
    /* skip over the source pixels consumed by destination pixels 0..start-1 */
    int x = 0;
    while (x < start) {
        int box = 1 << FIXED_SHIFT;
        int start_coverage = coverage[x];
        box -= start_coverage;
        src++;
        while (box >= pixel_coverage) {
            src++;
            box -= pixel_coverage;
        }
        x++;
    }
    __m128i accum = _mm_setzero_si128();
    __m128i zero = _mm_setzero_si128();
    while (x < start + width) {
        uint32_t a = 0;
        uint32_t r = 0;
        uint32_t g = 0;
        uint32_t b = 0;
        int box = 1 << FIXED_SHIFT;
        int start_coverage = coverage[x];
        /* widen the ARGB pixel to one 16-bit lane per component */
        __m128i v_src = _mm_cvtsi32_si128(*src);
        v_src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(v_src, zero), zero);
        __m128i coeff = _mm_cvtsi32_si128(start_coverage);
        /* duplicate the filter coefficient */
        coeff = _mm_shuffle_epi32(coeff, _MM_SHUFFLE(0, 0, 0, 0));
        /* multiply and accumulate the result:
         * 0000vvvv_0000vvvv_0000vvvv_0000vvvv * 000000aa_000000rr_000000gg_000000bb */
        /* NOTE(review): this assigns rather than accumulates */
        accum = _mm_madd_epi16(v_src, coeff);
        // we can't use a 32 bit multiply in sse2
        // sse4.1 gives us pmulld but it is apparently slow (6 clocks on Nehalem)
        src++;
        x++;
        box -= start_coverage;
        /* interior pixels still use the scalar path */
        while (box >= pixel_coverage) {
            a += ((*src >> 24) & 0xff) * pixel_coverage;
            r += ((*src >> 16) & 0xff) * pixel_coverage;
            g += ((*src >>  8) & 0xff) * pixel_coverage;
            b += ((*src >>  0) & 0xff) * pixel_coverage;
            src++;

            box -= pixel_coverage;
        }
        /* multiply by whatever is leftover
         * this ensures that we don't bias down.
         * i.e. start_coverage + n*pixel_coverage + box == 1 << 24 */
        if (box > 0) {
            a += ((*src >> 24) & 0xff) * box;
            r += ((*src >> 16) & 0xff) * box;
            g += ((*src >>  8) & 0xff) * box;
            b += ((*src >>  0) & 0xff) * box;
        }

        a >>= FIXED_SHIFT;
        r >>= FIXED_SHIFT;
        g >>= FIXED_SHIFT;
        b >>= FIXED_SHIFT;

        *dest = (a << 24) | (r << 16) | (g << 8) | b;
        dest++;
    }
}
#endif
+
/* Downsample one source scanline horizontally with a non-integer box filter.
 *
 * coverage[x] holds the fixed-point (1 << FIXED_SHIFT == 1.0) weight of the
 * left-most source pixel covered by destination pixel x.  Every fully
 * covered interior pixel contributes pixel_coverage, and whatever weight
 * remains in the box is taken from the right-most pixel, so the weights for
 * each destination pixel always sum to exactly 1 << FIXED_SHIFT.
 *
 * Destination pixels 0..start-1 are skipped; width results are written.
 */
static void downsample_row_box_filter(
        int start, int width,
        uint32_t *src, uint32_t *dest,
        int coverage[], int pixel_coverage)
{
    const uint32_t *in = src;
    uint32_t *out = dest;
    int dx;

    /* advance `in` past the source pixels consumed by the skipped
     * destination pixels without producing any output */
    for (dx = 0; dx < start; dx++) {
        int remaining = (1 << FIXED_SHIFT) - coverage[dx];
        in++;
        for (; remaining >= pixel_coverage; remaining -= pixel_coverage)
            in++;
    }

    for (dx = start; dx < start + width; dx++) {
        int left_weight = coverage[dx];
        int remaining = (1 << FIXED_SHIFT) - left_weight;
        uint32_t px = *in++;
        uint32_t a = ((px >> 24) & 0xff) * left_weight;
        uint32_t r = ((px >> 16) & 0xff) * left_weight;
        uint32_t g = ((px >>  8) & 0xff) * left_weight;
        uint32_t b = ((px >>  0) & 0xff) * left_weight;

        /* interior pixels are fully covered by the box */
        while (remaining >= pixel_coverage) {
            px = *in++;
            a += ((px >> 24) & 0xff) * pixel_coverage;
            r += ((px >> 16) & 0xff) * pixel_coverage;
            g += ((px >>  8) & 0xff) * pixel_coverage;
            b += ((px >>  0) & 0xff) * pixel_coverage;
            remaining -= pixel_coverage;
        }

        /* partially covered right-most pixel: weight it by exactly the
         * leftover amount so the total stays 1 << FIXED_SHIFT and we do
         * not bias the result down.  Note `in` is NOT advanced — the next
         * destination pixel shares this source pixel. */
        if (remaining > 0) {
            px = *in;
            a += ((px >> 24) & 0xff) * remaining;
            r += ((px >> 16) & 0xff) * remaining;
            g += ((px >>  8) & 0xff) * remaining;
            b += ((px >>  0) & 0xff) * remaining;
        }

        *out++ = ((a >> FIXED_SHIFT) << 24) |
                 ((r >> FIXED_SHIFT) << 16) |
                 ((g >> FIXED_SHIFT) <<  8) |
                  (b >> FIXED_SHIFT);
    }
}
+
/* Downsample n columns vertically with a box filter.
 *
 * src points at a block of horizontally downsampled rows stored
 * contiguously with a row pitch of n pixels.  For each column, the first
 * row is weighted by start_coverage, each fully covered interior row by
 * pixel_coverage, and the leftover weight goes to the final row, so the
 * fixed-point weights (1 << FIXED_SHIFT == 1.0) sum to exactly one.
 * One output pixel per column is written to dest.
 */
static void downsample_columns_box_filter(
        int n,
        int start_coverage,
        int pixel_coverage,
        uint32_t *src, uint32_t *dest)
{
    int stride = n;
    int col;

    for (col = 0; col < n; col++) {
        uint32_t *p = src + col;
        int remaining = (1 << FIXED_SHIFT) - start_coverage;
        uint32_t a = ((*p >> 24) & 0xff) * start_coverage;
        uint32_t r = ((*p >> 16) & 0xff) * start_coverage;
        uint32_t g = ((*p >>  8) & 0xff) * start_coverage;
        uint32_t b = ((*p >>  0) & 0xff) * start_coverage;
        p += stride;

        /* fully covered interior rows */
        while (remaining >= pixel_coverage) {
            a += ((*p >> 24) & 0xff) * pixel_coverage;
            r += ((*p >> 16) & 0xff) * pixel_coverage;
            g += ((*p >>  8) & 0xff) * pixel_coverage;
            b += ((*p >>  0) & 0xff) * pixel_coverage;
            p += stride;
            remaining -= pixel_coverage;
        }

        /* partially covered final row takes whatever weight is left so
         * the total is exactly 1 << FIXED_SHIFT (no downward bias) */
        if (remaining > 0) {
            a += ((*p >> 24) & 0xff) * remaining;
            r += ((*p >> 16) & 0xff) * remaining;
            g += ((*p >>  8) & 0xff) * remaining;
            b += ((*p >>  0) & 0xff) * remaining;
        }

        dest[col] = ((a >> FIXED_SHIFT) << 24) |
                    ((r >> FIXED_SHIFT) << 16) |
                    ((g >> FIXED_SHIFT) <<  8) |
                     (b >> FIXED_SHIFT);
    }
}
+
+#include <math.h>
+static int compute_coverage(int coverage[], int src_length, int dest_length) {
+    int i;
+    /* num = src_length/dest_length
+       total = sum(pixel) / num
+
+       pixel * 1/num == pixel * dest_length / src_length
+    */
+    /* the average contribution of each source pixel */
+    int ratio = ((1 << 24)*(long long int)dest_length)/src_length;
+    /* because ((1 << 24)*(long long int)dest_length) won't always be divisible by src_length
+     * we'll need someplace to put the other bits.
+     *
+     * We want to ensure a + n*ratio < 1<<24
+     *
+     * 1<<24
+     * */
+    printf("%d %d\n", src_length, dest_length);
+    double scale=(double)src_length/dest_length;
+    int y = 0;
+    /* for each destination pixel compute the coverage of the left most pixel included in the box */
+    for (i=0; i<dest_length; i++) {
+        float left_side = i*scale;
+        float right_side = (i+1)*scale;
+        float right_fract = right_side - floor(right_side);
+        float left_fract = ceil(left_side) - left_side;
+        assert(right_fract <= 1);
+
+        /* find out how many source pixels will be used to fill the box */
+        int count = floor(right_side) - ceil(left_side);
+        /* what's the maximum value this expression can become?
+           floor((i+1)*scale) - ceil(i*scale)
+
+           (i+1)*scale - i*scale == scale
+
+           since floor((i+1)*scale) <= (i+1)*scale
+           and   ceil(i*scale)      >= i*scale
+
+           floor((i+1)*scale) - ceil(i*scale) <= scale
+
+           further since: floor((i+1)*scale) - ceil(i*scale) is an integer
+
+           therefore:
+           floor((i+1)*scale) - ceil(i*scale) <= floor(scale)
+        */
+
+        if (left_fract == 0.)
+            count--;
+        int overage = ratio*(right_fract);
+        int coeff = (1<<24) - (count * ratio + overage);
+#define COV_DEBUG
+#ifdef COV_DEBUG
+        int coeff_est = (1<<24)*(left_fract)/src_length;
+        if (coeff_est == 0) {
+            coeff_est = ratio;
+        }
+
+        //printf("y: %d, %f %d %d %d-%d %f %f-%f %d\n", y, scale, ratio, i, coeff, coeff_est, right_fract, left_side, right_side, overage);
+        //if (coeff < coeff_est)
+        //    coeff = coeff_est;
+        assert(coeff >= 0);
+        int box = 1<<24;
+        box -= coeff;
+        y++;
+        while (box >= ratio) {
+            box -= ratio;
+            y++;
+        }
+        if (y != floor(right_side)) {
+            printf("y: %d, right_side: %f\n",y, right_side);
+            assert(y == floor(right_side));
+        }
+        if (y == src_length) {
+            printf("box: %d\n", box);
+            assert(box <= 1);
+            //coeff += box;
+        }
+#endif
+        coverage[i] = coeff;
+
+    }
+    printf("y: %d src_length: %d\n", y, src_length);
+    assert(y == src_length);
+    return ratio;
+}
+
+PIXMAN_EXPORT
+int downscale_box_mult_filter(const struct rescale_fetcher *fetcher, void *closure, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dstStride)
+{
+   // printf("%d %d %d %d\n", scaledWidth, scaledHeight, origWidth, origHeight);
+        int yd1 = 0;
+        int d;
+
+        assert(width + startColumn <= scaledWidth);
+        /* we need to allocate enough room for ceil(src_height/dest_height)+1
+          Example:
+          src_height = 140
+          dest_height = 50
+          src_height/dest_height = 2.8
+
+          |-------------|       2.8 pixels
+          |----|----|----|----| 4 pixels
+          need to sample 3 pixels
+
+             |-------------|       2.8 pixels
+          |----|----|----|----| 4 pixels
+          need to sample 4 pixels
+        */
+        void (*fetch_scanline)(void *closure, int y, uint32_t **scanline) = fetcher->fetch_scanline;
+        uint32_t *temp_buf = pixman_malloc_abc ((origHeight + scaledHeight-1)/scaledHeight+1, scaledWidth, sizeof(uint32_t));
+	if (!temp_buf)
+            return -1;
+
+	//XXX: I suppose we should check whether this will succeed
+        uint32_t *scanline = fetcher->init(closure);
+
+        int *x_coverage = pixman_malloc_abc (origWidth, 1, sizeof(int));
+        int *y_coverage = pixman_malloc_abc (origHeight, 1, sizeof(int));
+        int pixel_coverage_x = compute_coverage(x_coverage, origWidth, scaledWidth);
+        int pixel_coverage_y = compute_coverage(y_coverage, origHeight, scaledHeight);
+
+        int y = 0;
+        for (d = 0; d<startRow; d++) {
+            int box = 1 << FIXED_SHIFT;
+            int start_coverage_y = y_coverage[d];
+            box -= start_coverage_y;
+            y++;
+            while (box >= pixel_coverage_y) {
+                box -= pixel_coverage_y;
+                y++;
+            }
+        }
+        for (d = startRow; d < startRow + height; d++)
+	{
+            int columns = 0;
+            int box = 1 << FIXED_SHIFT;
+            int start_coverage_y = y_coverage[d];
+            fetch_scanline(closure, y, &scanline);
+            downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+            columns++;
+            y++;
+            box -= start_coverage_y;
+
+            while (box >= pixel_coverage_y) {
+                fetch_scanline(closure, y, &scanline);
+                downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+                y++;
+                box -= pixel_coverage_y;
+            }
+            if (box > 0) {
+                fetch_scanline(closure, y, &scanline);
+                downsample_row_box_filter(startColumn, width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+            }
+            downsample_columns_box_filter(width, start_coverage_y, pixel_coverage_y, temp_buf, dest + (yd1 - startRow)*dstStride/4);
+            yd1++;
+            if (width*columns > ((origHeight + scaledHeight-1)/scaledHeight+1) * width) {
+                printf("%d %d\n", origHeight, scaledHeight);
+                printf("%d %d\n", columns, (origHeight + scaledHeight-1)/scaledHeight);
+                assert(width*columns <= (origHeight + scaledHeight-1)/scaledHeight * width);
+            }
+        }
+        if (y > origHeight) {
+            printf("%d %d\n", y, origHeight);
+            assert(y<=origHeight);
+        }
+        fetcher->fini(closure, scanline);
+        free(x_coverage);
+        free(y_coverage);
+        free(temp_buf);
+        return 0;
+}


More information about the cairo mailing list