pixman: add 0.21.4 from OE

Signed-off-by: Koen Kooi <koen@dominion.thruhere.net>
This commit is contained in:
Koen Kooi 2011-01-25 20:29:36 +01:00
parent 9777a97fe9
commit 486918c6ef
25 changed files with 41 additions and 1580 deletions

View File

@ -1,35 +0,0 @@
From e7ee43c39d2370716a4d011afa8f5067eced9899 Mon Sep 17 00:00:00 2001
From: Cyril Brulebois <kibi@debian.org>
Date: Wed, 17 Nov 2010 16:16:56 +0100
Subject: [PATCH 02/24] Fix argument quoting for AC_INIT.
One gets rid of this accordingly:
| autoreconf -vfi
| autoreconf: Entering directory `.'
| autoreconf: configure.ac: not using Gettext
| autoreconf: running: aclocal --force
| configure.ac:61: warning: AC_INIT: not a literal: "pixman@lists.freedesktop.org"
| autoreconf: configure.ac: tracing
| configure.ac:61: warning: AC_INIT: not a literal: "pixman@lists.freedesktop.org"
Signed-off-by: Cyril Brulebois <kibi@debian.org>
---
configure.ac | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/configure.ac b/configure.ac
index db1da21..147e1bf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,7 +58,7 @@ m4_define([pixman_micro], 3)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
-AC_INIT(pixman, pixman_version, "pixman@lists.freedesktop.org", pixman)
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
AM_INIT_AUTOMAKE([foreign dist-bzip2])
# Suppress verbose compile lines
--
1.6.6.1

View File

@ -1,39 +0,0 @@
From 654961efe405ad1a7e54a77548ca8af322ecc1f8 Mon Sep 17 00:00:00 2001
From: Alan Coopersmith <alan.coopersmith@oracle.com>
Date: Sun, 21 Nov 2010 11:42:22 -0800
Subject: [PATCH 03/24] Sun's copyrights belong to Oracle now
Signed-off-by: Alan Coopersmith <alan.coopersmith@oracle.com>
---
COPYING | 2 +-
pixman/solaris-hwcap.mapfile | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/COPYING b/COPYING
index 3092a34..15f9517 100644
--- a/COPYING
+++ b/COPYING
@@ -18,7 +18,7 @@ possible. They may also add themselves to the list below.
* Copyright 2008 André Tupinambá
* Copyright 2008 Mozilla Corporation
* Copyright 2008 Frederic Plourde
- * Copyright 2009 Sun Microsystems, Inc.
+ * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
diff --git a/pixman/solaris-hwcap.mapfile b/pixman/solaris-hwcap.mapfile
index 3605ca7..87efce1 100644
--- a/pixman/solaris-hwcap.mapfile
+++ b/pixman/solaris-hwcap.mapfile
@@ -1,6 +1,6 @@
###############################################################################
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009, Oracle and/or its affiliates. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
--
1.6.6.1

View File

@ -1,159 +0,0 @@
From 4b5b5a2a832cd67f2a0ec231f75a2825b45571fa Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 15 Nov 2010 18:26:43 +0200
Subject: [PATCH 04/24] C fast path for a1 fill operation
Can be used as one of the solutions to fix bug
https://bugs.freedesktop.org/show_bug.cgi?id=31604
---
pixman/pixman-fast-path.c | 87 ++++++++++++++++++++++++++++++++++++++++++++-
pixman/pixman.c | 7 +++-
2 files changed, 91 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 5d5fa95..37dfbae 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1334,7 +1334,11 @@ fast_composite_solid_fill (pixman_implementation_t *imp,
src = _pixman_image_get_solid (src_image, dst_image->bits.format);
- if (dst_image->bits.format == PIXMAN_a8)
+ if (dst_image->bits.format == PIXMAN_a1)
+ {
+ src = src >> 31;
+ }
+ else if (dst_image->bits.format == PIXMAN_a8)
{
src = src >> 24;
}
@@ -1655,6 +1659,7 @@ static const pixman_fast_path_t c_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+ PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
@@ -1733,6 +1738,82 @@ static const pixman_fast_path_t c_fast_paths[] =
{ PIXMAN_OP_NONE },
};
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
+#endif
+
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+ if (offs)
+ {
+ int leading_pixels = 32 - offs;
+ if (leading_pixels >= width)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, offs);
+ else
+ *dst &= ~A1_FILL_MASK (width, offs);
+ return;
+ }
+ else
+ {
+ if (v)
+ *dst++ |= A1_FILL_MASK (leading_pixels, offs);
+ else
+ *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+ width -= leading_pixels;
+ }
+ }
+ while (width >= 32)
+ {
+ if (v)
+ *dst++ = 0xFFFFFFFF;
+ else
+ *dst++ = 0;
+ width -= 32;
+ }
+ if (width > 0)
+ {
+ if (v)
+ *dst |= A1_FILL_MASK (width, 0);
+ else
+ *dst &= ~A1_FILL_MASK (width, 0);
+ }
+}
+
+static void
+pixman_fill1 (uint32_t *bits,
+ int stride,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t xor)
+{
+ uint32_t *dst = bits + y * stride + (x >> 5);
+ int offs = x & 31;
+
+ if (xor & 1)
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 1);
+ dst += stride;
+ }
+ }
+ else
+ {
+ while (height--)
+ {
+ pixman_fill1_line (dst, offs, width, 0);
+ dst += stride;
+ }
+ }
+}
+
static void
pixman_fill8 (uint32_t *bits,
int stride,
@@ -1819,6 +1900,10 @@ fast_path_fill (pixman_implementation_t *imp,
{
switch (bpp)
{
+ case 1:
+ pixman_fill1 (bits, stride, x, y, width, height, xor);
+ break;
+
case 8:
pixman_fill8 (bits, stride, x, y, width, height, xor);
break;
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 045c556..ec565f9 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -875,7 +875,8 @@ color_to_pixel (pixman_color_t * color,
format == PIXMAN_b8g8r8x8 ||
format == PIXMAN_r5g6b5 ||
format == PIXMAN_b5g6r5 ||
- format == PIXMAN_a8))
+ format == PIXMAN_a8 ||
+ format == PIXMAN_a1))
{
return FALSE;
}
@@ -895,7 +896,9 @@ color_to_pixel (pixman_color_t * color,
((c & 0x000000ff) << 24);
}
- if (format == PIXMAN_a8)
+ if (format == PIXMAN_a1)
+ c = c >> 31;
+ else if (format == PIXMAN_a8)
c = c >> 24;
else if (format == PIXMAN_r5g6b5 ||
format == PIXMAN_b5g6r5)
--
1.6.6.1

View File

@ -1,113 +0,0 @@
From 98d08b37f17a3379d0ceff8bb7de8f943873fbd8 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Fri, 26 Nov 2010 08:55:49 +0200
Subject: [PATCH 05/24] ARM: added 'neon_composite_over_n_8_8' fast path
---
pixman/pixman-arm-neon-asm.S | 68 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 3 ++
2 files changed, 71 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 91ec27d..a3875ee 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1203,6 +1203,74 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d8
+ vmull.u8 q6, d26, d8
+ vmull.u8 q7, d27, d8
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 q12, q0
+ vmvn.8 q13, q1
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ vld1.8 {d24, d25, d26, d27}, [MASK]!
+ cache_preload 32, 32
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d8[0]}, [DUMMY]
+ vdup.8 d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
/*
* 'combine_mask_ca' replacement
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 2f82069..72ef75e 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -76,6 +76,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
@@ -235,6 +237,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, x8r8g8b8, neon_composite_src_0888_8888_rev),
PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, r5g6b5, neon_composite_src_0888_0565_rev),
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8r8g8b8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, neon_composite_over_n_8_8888),
--
1.6.6.1

View File

@ -1,157 +0,0 @@
From 3be86a92ccab240859062a541cdb871d81c9501a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 28 Nov 2010 21:45:06 +0200
Subject: [PATCH 06/24] ARM: introduced 'fetch_mask_pixblock' macro to simplify code
This macro hides the implementation details of pixels fetching
for the mask image just like 'fetch_src_pixblock' does for the
source image. This provides more possibilities for reusing the
same code blocks in different compositing functions.
This patch does not introduce any functional changes and the
resulting code in the compiled object file is exactly the same.
---
pixman/pixman-arm-neon-asm.S | 26 +++++++++++++-------------
pixman/pixman-arm-neon-asm.h | 5 +++++
2 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index a3875ee..155a236 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -841,7 +841,7 @@ generate_composite_function \
pixman_composite_over_n_8_0565_process_pixblock_tail
vst1.16 {d28, d29}, [DST_W, :128]!
vld1.16 {d4, d5}, [DST_R, :128]!
- vld1.8 {d24}, [MASK]!
+ fetch_mask_pixblock
cache_preload 8, 8
pixman_composite_over_n_8_0565_process_pixblock_head
.endm
@@ -889,7 +889,7 @@ generate_composite_function \
pixman_composite_over_n_8_0565_process_pixblock_tail
fetch_src_pixblock
cache_preload 8, 8
- vld1.8 {d24}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_over_n_8_0565_process_pixblock_head
vst1.16 {d28, d29}, [DST_W, :128]!
.endm
@@ -1171,7 +1171,7 @@ generate_composite_function \
pixman_composite_over_n_8_8888_process_pixblock_tail
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vld1.8 {d24}, [MASK]!
+ fetch_mask_pixblock
cache_preload 8, 8
pixman_composite_over_n_8_8888_process_pixblock_head
.endm
@@ -1241,7 +1241,7 @@ generate_composite_function \
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
pixman_composite_over_n_8_8_process_pixblock_tail
- vld1.8 {d24, d25, d26, d27}, [MASK]!
+ fetch_mask_pixblock
cache_preload 32, 32
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
pixman_composite_over_n_8_8_process_pixblock_head
@@ -1341,7 +1341,7 @@ generate_composite_function \
vraddhn.u16 d29, q15, q9
vraddhn.u16 d30, q6, q10
vraddhn.u16 d31, q7, q11
- vld4.8 {d24, d25, d26, d27}, [MASK]!
+ fetch_mask_pixblock
vqadd.u8 q14, q0, q14
vqadd.u8 q15, q1, q15
cache_preload 8, 8
@@ -1405,7 +1405,7 @@ generate_composite_function \
pixman_composite_add_n_8_8_process_pixblock_tail
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vld1.8 {d24, d25, d26, d27}, [MASK]!
+ fetch_mask_pixblock
cache_preload 32, 32
pixman_composite_add_n_8_8_process_pixblock_head
.endm
@@ -1462,7 +1462,7 @@ generate_composite_function \
pixman_composite_add_8_8_8_process_pixblock_tail
vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vld1.8 {d24, d25, d26, d27}, [MASK]!
+ fetch_mask_pixblock
fetch_src_pixblock
cache_preload 32, 32
pixman_composite_add_8_8_8_process_pixblock_head
@@ -1515,7 +1515,7 @@ generate_composite_function \
pixman_composite_add_8888_8888_8888_process_pixblock_tail
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- vld4.8 {d24, d25, d26, d27}, [MASK]!
+ fetch_mask_pixblock
fetch_src_pixblock
cache_preload 8, 8
pixman_composite_add_8888_8888_8888_process_pixblock_head
@@ -1587,7 +1587,7 @@ generate_composite_function_single_scanline \
pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
fetch_src_pixblock
cache_preload 8, 8
- vld4.8 {d12, d13, d14, d15}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -1658,7 +1658,7 @@ generate_composite_function \
pixman_composite_over_8888_n_8888_process_pixblock_tail
fetch_src_pixblock
cache_preload 8, 8
- vld4.8 {d12, d13, d14, d15}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_over_8888_n_8888_process_pixblock_head
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -1700,7 +1700,7 @@ generate_composite_function_single_scanline \
pixman_composite_over_8888_n_8888_process_pixblock_tail
fetch_src_pixblock
cache_preload 8, 8
- vld1.8 {d15}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_over_8888_n_8888_process_pixblock_head
vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
.endm
@@ -1917,7 +1917,7 @@ generate_composite_function \
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
- vld1.8 {d15}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_over_0565_8_0565_process_pixblock_tail
fetch_src_pixblock
vld1.16 {d10, d11}, [DST_R, :128]!
@@ -1969,7 +1969,7 @@ generate_composite_function \
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
- vld1.8 {d15}, [MASK]!
+ fetch_mask_pixblock
pixman_composite_add_0565_8_0565_process_pixblock_tail
fetch_src_pixblock
vld1.16 {d10, d11}, [DST_R, :128]!
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index c75bdc3..24fa361 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -431,6 +431,11 @@
.endif
.endm
+.macro fetch_mask_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
/*
* Macro which is used to process leading pixels until destination
* pointer is properly aligned (at 16 bytes boundary). When destination
--
1.6.6.1

View File

@ -1,170 +0,0 @@
From e6814837a6ccd3e4db329e0131eaf2055d2c864b Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Fri, 26 Nov 2010 17:06:58 +0200
Subject: [PATCH 07/24] ARM: better NEON instructions scheduling for over_n_8_0565
Code rearranged to get better instructions scheduling for ARM Cortex-A8/A9.
Now it is ~30% faster for the pixel data in L1 cache and makes better use
of memory bandwidth when running at lower clock frequencies (ex. 500MHz).
Also register d24 (pixels from the mask image) is now not clobbered by
supplementary macros, which allows to reuse them for the other variants
of compositing operations later.
Benchmark from ARM Cortex-A8 @500MHz:
== before ==
over_n_8_0565 = L1: 63.90 L2: 63.15 M: 60.97 ( 73.53%)
HT: 28.89 VT: 24.14 R: 21.33 RT: 6.78 ( 67Kops/s)
== after ==
over_n_8_0565 = L1: 82.64 L2: 75.19 M: 71.52 ( 84.14%)
HT: 30.49 VT: 25.56 R: 22.36 RT: 6.89 ( 68Kops/s)
---
pixman/pixman-arm-neon-asm.S | 120 +++++++++++++++++++++++++++---------------
1 files changed, 77 insertions(+), 43 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 155a236..ffffc1c 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -792,58 +792,92 @@ generate_composite_function \
/******************************************************************************/
.macro pixman_composite_over_n_8_0565_process_pixblock_head
- /* in */
- vmull.u8 q0, d24, d8
- vmull.u8 q1, d24, d9
- vmull.u8 q6, d24, d10
- vmull.u8 q7, d24, d11
- vrshr.u16 q10, q0, #8
- vrshr.u16 q11, q1, #8
- vrshr.u16 q12, q6, #8
- vrshr.u16 q13, q7, #8
- vraddhn.u16 d0, q0, q10
- vraddhn.u16 d1, q1, q11
- vraddhn.u16 d2, q6, q12
- vraddhn.u16 d3, q7, q13
-
- vshrn.u16 d6, q2, #8
- vshrn.u16 d7, q2, #3
- vsli.u16 q2, q2, #5
- vsri.u8 d6, d6, #5
- vmvn.8 d3, d3
- vsri.u8 d7, d7, #6
- vshrn.u16 d30, q2, #2
- /* now do alpha blending */
- vmull.u8 q10, d3, d6
- vmull.u8 q11, d3, d7
- vmull.u8 q12, d3, d30
- vrshr.u16 q13, q10, #8
- vrshr.u16 q3, q11, #8
- vrshr.u16 q15, q12, #8
- vraddhn.u16 d20, q10, q13
- vraddhn.u16 d23, q11, q3
- vraddhn.u16 d22, q12, q15
+ vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q8, d3, d6 /* now do alpha blending */
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
.endm
.macro pixman_composite_over_n_8_0565_process_pixblock_tail
- vqadd.u8 d16, d2, d20
- vqadd.u8 q9, q0, q11
- /* convert to r5g6b5 */
- vshll.u8 q14, d16, #8
- vshll.u8 q8, d19, #8
- vshll.u8 q9, d18, #8
- vsri.u16 q14, q8, #5
- vsri.u16 q14, q9, #11
+ /* 3 cycle bubble (after vmull.u8) */
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ /* 1 cycle bubble */
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8 /* convert to 16bpp */
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ /* 1 cycle bubble */
+ vsri.u16 q14, q9, #11
.endm
-/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
- pixman_composite_over_n_8_0565_process_pixblock_tail
- vst1.16 {d28, d29}, [DST_W, :128]!
vld1.16 {d4, d5}, [DST_R, :128]!
+ vshrn.u16 d6, q2, #8
fetch_mask_pixblock
+ vshrn.u16 d7, q2, #3
+ fetch_src_pixblock
+ vmull.u8 q6, d24, d10
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ vmull.u8 q1, d24, d9
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8
+ vmull.u8 q0, d24, d8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vmull.u8 q7, d24, d11
+ vsri.u16 q14, q9, #11
+
cache_preload 8, 8
- pixman_composite_over_n_8_0565_process_pixblock_head
+
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vmull.u8 q8, d3, d6
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
.endm
/*
--
1.6.6.1

View File

@ -1,74 +0,0 @@
From a7c36681c0c1955ff9110b81f1789e56abb10a95 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sat, 27 Nov 2010 03:53:12 +0200
Subject: [PATCH 08/24] ARM: added 'neon_composite_over_8888_n_0565' fast path
---
pixman/pixman-arm-neon-asm.S | 28 ++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index ffffc1c..3e52a49 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -917,6 +917,34 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_over_8888_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d24[0]}, [DUMMY]
+ vdup.8 d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_n_8_0565_process_pixblock_head, \
+ pixman_composite_over_n_8_0565_process_pixblock_tail, \
+ pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
vld1.16 {d4, d5}, [DST_R, :128]!
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 72ef75e..8156bbb 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -83,6 +83,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_0565,
+ uint32_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
uint8_t, 1, uint8_t, 1, uint8_t, 1)
@@ -253,6 +255,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, neon_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, neon_composite_over_8888_n_0565),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, neon_composite_over_8888_8_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, neon_composite_over_8888_8_8888),
--
1.6.6.1

View File

@ -1,139 +0,0 @@
From 3990931bf6197eff1cec06cf24bce53ddf9a539a Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sat, 27 Nov 2010 04:47:39 +0200
Subject: [PATCH 09/24] ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565
Renamed suppementary macros from 'over_n_8_0565' to 'over_8888_8_0565',
because they can actually support all variants of this operation:
over_8888_8_0565/over_n_8_0565/over_8888_n_0565.
Also 'over_8888_8_0565' now uses more optimized common code instead of its
own variant, improving performance a bit. Even though this operation is
still memory bandwidth limited, scaled variants of these fast paths may
put more stress on CPU later.
Benchmarked on ARM Cortex-A8 @500MHz:
== before ==
over_8888_8_0565 = L1: 67.10 L2: 53.82 M: 44.70 (105.17%)
HT: 18.73 VT: 16.91 R: 14.25 RT: 4.80 (52Kops/s)
== after ==
over_8888_8_0565 = L1: 77.83 L2: 58.14 M: 44.82 (105.52%)
HT: 20.58 VT: 17.44 R: 15.05 RT: 4.88 (52Kops/s)
---
pixman/pixman-arm-neon-asm.S | 61 +++++++++++++++++------------------------
1 files changed, 25 insertions(+), 36 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3e52a49..4175144 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -791,7 +791,7 @@ generate_composite_function \
/******************************************************************************/
-.macro pixman_composite_over_n_8_0565_process_pixblock_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
vmull.u8 q1, d24, d9
vmull.u8 q6, d24, d10
@@ -816,7 +816,7 @@ generate_composite_function \
vmull.u8 q10, d3, d30
.endm
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
/* 3 cycle bubble (after vmull.u8) */
vrshr.u16 q13, q8, #8
vrshr.u16 q11, q9, #8
@@ -835,7 +835,7 @@ generate_composite_function \
vsri.u16 q14, q9, #11
.endm
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
vld1.16 {d4, d5}, [DST_R, :128]!
vshrn.u16 d6, q2, #8
fetch_mask_pixblock
@@ -880,6 +880,23 @@ generate_composite_function \
vmull.u8 q10, d3, d30
.endm
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
/*
* This function needs a special initialization of solid mask.
* Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
@@ -911,9 +928,9 @@ generate_composite_function \
5, /* prefetch distance */ \
pixman_composite_over_n_8_0565_init, \
pixman_composite_over_n_8_0565_cleanup, \
- pixman_composite_over_n_8_0565_process_pixblock_head, \
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
- pixman_composite_over_n_8_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head
/******************************************************************************/
@@ -935,36 +952,8 @@ generate_composite_function \
5, /* prefetch distance */ \
pixman_composite_over_8888_n_0565_init, \
pixman_composite_over_8888_n_0565_cleanup, \
- pixman_composite_over_n_8_0565_process_pixblock_head, \
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
- pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
- 28, /* dst_w_basereg */ \
- 4, /* dst_r_basereg */ \
- 8, /* src_basereg */ \
- 24 /* mask_basereg */
-
-/******************************************************************************/
-
-/* TODO: expand macros and do better instructions scheduling */
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
- vld1.16 {d4, d5}, [DST_R, :128]!
- pixman_composite_over_n_8_0565_process_pixblock_tail
- fetch_src_pixblock
- cache_preload 8, 8
- fetch_mask_pixblock
- pixman_composite_over_n_8_0565_process_pixblock_head
- vst1.16 {d28, d29}, [DST_W, :128]!
-.endm
-
-generate_composite_function \
- pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
- 8, /* number of pixels, processed in a single block */ \
- 5, /* prefetch distance */ \
- default_init_need_all_regs, \
- default_cleanup_need_all_regs, \
- pixman_composite_over_n_8_0565_process_pixblock_head, \
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
28, /* dst_w_basereg */ \
4, /* dst_r_basereg */ \
--
1.6.6.1

View File

@ -1,74 +0,0 @@
From 6d2f7f981b52b41f4321071c325babcf792bd666 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sat, 27 Nov 2010 15:53:54 +0200
Subject: [PATCH 10/24] ARM: added 'neon_composite_over_0565_n_0565' fast path
---
pixman/pixman-arm-neon-asm.S | 28 ++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 4175144..81c0a34 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1994,6 +1994,34 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_over_0565_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
.macro pixman_composite_add_0565_8_0565_process_pixblock_head
/* mask is in d15 */
convert_0565_to_x888 q4, d2, d1, d0
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 8156bbb..b01c3e0 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -85,6 +85,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_0565,
uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_0565_n_0565,
+ uint16_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
uint8_t, 1, uint8_t, 1, uint8_t, 1)
@@ -257,6 +259,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, neon_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, r5g6b5, neon_composite_over_8888_n_0565),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, b5g6r5, neon_composite_over_8888_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, r5g6b5, solid, r5g6b5, neon_composite_over_0565_n_0565),
+ PIXMAN_STD_FAST_PATH (OVER, b5g6r5, solid, b5g6r5, neon_composite_over_0565_n_0565),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, neon_composite_over_8888_8_8888),
PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, neon_composite_over_8888_8_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, neon_composite_over_8888_8_8888),
--
1.6.6.1

View File

@ -1,63 +0,0 @@
From c3f48b6aa2f9354af02ffc8c938ec6753fdcbde3 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 28 Nov 2010 22:05:53 +0200
Subject: [PATCH 11/24] ARM: added 'neon_composite_add_8888_8_8888' fast path
---
pixman/pixman-arm-neon-asm.S | 17 +++++++++++++++++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 21 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 81c0a34..11ef166 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1595,6 +1595,23 @@ generate_composite_function_single_scanline \
/******************************************************************************/
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
/* expecting source data in {d0, d1, d2, d3} */
/* destination data in {d4, d5, d6, d7} */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index b01c3e0..eaf9787 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
uint8_t, 1, uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+ uint32_t, 1, uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
uint32_t, 1, uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
@@ -282,6 +284,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565),
PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
--
1.6.6.1

View File

@ -1,105 +0,0 @@
From 1fba7790367d7b726d05a33bbbcebe10b9280a31 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 29 Nov 2010 02:10:22 +0200
Subject: [PATCH 12/24] ARM: better NEON instructions scheduling for add_8888_8888_8888
Provides a minor performance improvement by using pipelining and hiding
instructions latencies. Also do not clobber d0-d3 registers (source
image pixels) while doing calculations in order to allow the use of
the same macro for add_n_8_8888 fast path later.
Benchmark from ARM Cortex-A8 @500MHz:
== before ==
add_8888_8888_8888 = L1: 95.94 L2: 42.27 M: 25.60 (121.09%)
HT: 14.54 VT: 13.13 R: 12.77 RT: 4.49 (48Kops/s)
add_8888_8_8888 = L1: 104.51 L2: 57.81 M: 36.06 (106.62%)
HT: 19.24 VT: 16.45 R: 14.71 RT: 4.80 (51Kops/s)
== after ==
add_8888_8888_8888 = L1: 106.66 L2: 47.82 M: 27.32 (129.30%)
HT: 15.44 VT: 13.96 R: 12.86 RT: 4.48 (48Kops/s)
add_8888_8_8888 = L1: 107.72 L2: 61.02 M: 38.26 (113.16%)
HT: 19.48 VT: 16.72 R: 14.82 RT: 4.80 (51Kops/s)
---
pixman/pixman-arm-neon-asm.S | 52 +++++++++++++++++++++++++++--------------
1 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 11ef166..829ef84 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1542,34 +1542,50 @@ generate_composite_function \
/* expecting source data in {d0, d1, d2, d3} */
/* destination data in {d4, d5, d6, d7} */
/* mask in {d24, d25, d26, d27} */
- vmull.u8 q8, d27, d0
- vmull.u8 q9, d27, d1
+ vmull.u8 q8, d27, d0
+ vmull.u8 q9, d27, d1
vmull.u8 q10, d27, d2
vmull.u8 q11, d27, d3
- vrshr.u16 q0, q8, #8
- vrshr.u16 q1, q9, #8
- vrshr.u16 q12, q10, #8
- vrshr.u16 q13, q11, #8
- vraddhn.u16 d0, q0, q8
- vraddhn.u16 d1, q1, q9
- vraddhn.u16 d2, q12, q10
- vraddhn.u16 d3, q13, q11
- vqadd.u8 q14, q0, q2
- vqadd.u8 q15, q1, q3
+ /* 1 cycle bubble */
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
.endm
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+ vqadd.u8 q14, q2, q14
+ /* 1 cycle bubble */
+ vqadd.u8 q15, q3, q15
.endm
-/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
- pixman_composite_add_8888_8888_8888_process_pixblock_tail
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
- fetch_mask_pixblock
fetch_src_pixblock
+ vrshrn.u16 d28, q8, #8
+ fetch_mask_pixblock
+ vrshrn.u16 d29, q9, #8
+ vmull.u8 q8, d27, d0
+ vrshrn.u16 d30, q10, #8
+ vmull.u8 q9, d27, d1
+ vrshrn.u16 d31, q11, #8
+ vmull.u8 q10, d27, d2
+ vqadd.u8 q14, q2, q14
+ vmull.u8 q11, d27, d3
+ vqadd.u8 q15, q3, q15
+ vrsra.u16 q8, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrsra.u16 q9, q9, #8
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q10, q10, #8
+
cache_preload 8, 8
- pixman_composite_add_8888_8888_8888_process_pixblock_head
+
+ vrsra.u16 q11, q11, #8
.endm
generate_composite_function \
--
1.6.6.1

View File

@ -1,75 +0,0 @@
From b066b520dfaf0a9f4d1bc9a73c789091e9ce7cc8 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 29 Nov 2010 02:38:52 +0200
Subject: [PATCH 13/24] ARM: added 'neon_composite_add_n_8_8888' fast path
---
pixman/pixman-arm-neon-asm.S | 29 +++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 33 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 829ef84..dd6f2c5 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1628,6 +1628,35 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_add_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
/* expecting source data in {d0, d1, d2, d3} */
/* destination data in {d4, d5, d6, d7} */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index eaf9787..5ad58bd 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -80,6 +80,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -281,6 +283,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8),
PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565),
PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565),
--
1.6.6.1

View File

@ -1,72 +0,0 @@
From f6843e3797eea7e4aed7614b1086f5cefc06c0f9 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 29 Nov 2010 03:31:32 +0200
Subject: [PATCH 14/24] ARM: added 'neon_composite_add_8888_n_8888' fast path
---
pixman/pixman-arm-neon-asm.S | 26 ++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index dd6f2c5..2c0fd37 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1657,6 +1657,32 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_add_8888_n_8888_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vld1.32 {d27[0]}, [DUMMY]
+ vdup.8 d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
/* expecting source data in {d0, d1, d2, d3} */
/* destination data in {d4, d5, d6, d7} */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 5ad58bd..f0dc111 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -89,6 +89,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_0565,
uint32_t, 1, uint16_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_0565_n_0565,
uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, add_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
uint8_t, 1, uint8_t, 1, uint8_t, 1)
@@ -291,6 +293,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
--
1.6.6.1

View File

@ -1,153 +0,0 @@
From af7a69d90ea2b43a4e850870727723d719f09a1c Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 29 Nov 2010 09:00:46 +0200
Subject: [PATCH 15/24] ARM: added flags parameter to some asm fast path wrapper macros
Not all types of operations can be skipped when having transparent
solid source or transparent solid mask. Add an extra flags parameter
for providing this information to the wrappers.
---
pixman/pixman-arm-common.h | 15 +++++++++------
pixman/pixman-arm-neon.c | 26 +++++++++++++-------------
pixman/pixman-arm-simd.c | 4 ++--
3 files changed, 24 insertions(+), 21 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 2cff6c8..66f448d 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -47,6 +47,9 @@
* or mask), the corresponding stride argument is unused.
*/
+#define SKIP_ZERO_SRC 1
+#define SKIP_ZERO_MASK 2
+
#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name, \
src_type, src_cnt, \
dst_type, dst_cnt) \
@@ -87,7 +90,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
src_line, src_stride); \
}
-#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(cputype, name, \
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name, \
dst_type, dst_cnt) \
void \
pixman_composite_##name##_asm_##cputype (int32_t w, \
@@ -117,7 +120,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
\
src = _pixman_image_get_solid (src_image, dst_image->bits.format); \
\
- if (src == 0) \
+ if ((flags & SKIP_ZERO_SRC) && src == 0) \
return; \
\
PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
@@ -128,7 +131,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
src); \
}
-#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(cputype, name, \
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name, \
mask_type, mask_cnt, \
dst_type, dst_cnt) \
void \
@@ -163,7 +166,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
\
src = _pixman_image_get_solid (src_image, dst_image->bits.format); \
\
- if (src == 0) \
+ if ((flags & SKIP_ZERO_SRC) && src == 0) \
return; \
\
PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
@@ -177,7 +180,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
mask_line, mask_stride); \
}
-#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(cputype, name, \
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name, \
src_type, src_cnt, \
dst_type, dst_cnt) \
void \
@@ -211,7 +214,7 @@ cputype##_composite_##name (pixman_implementation_t *imp, \
\
mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);\
\
- if (mask == 0) \
+ if ((flags & SKIP_ZERO_MASK) && mask == 0) \
return; \
\
PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type, \
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index f0dc111..1a3741c 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -63,33 +63,33 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_0565,
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_8888,
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_reverse_n_8888,
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_0565,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
uint8_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8888,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_0565,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
uint32_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_0565_n_0565,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
uint16_t, 1, uint16_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, add_8888_n_8888,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 3b05007..dc2f471 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -381,10 +381,10 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
-PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
--
1.6.6.1

View File

@ -1,97 +0,0 @@
From 733f68912f4a44c24ad3973049a7e1d98f4c6ea8 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 29 Nov 2010 09:11:29 +0200
Subject: [PATCH 16/24] ARM: added 'neon_composite_in_n_8' fast path
---
pixman/pixman-arm-neon-asm.S | 52 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 3 ++
2 files changed, 55 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 2c0fd37..cf014fa 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1427,6 +1427,58 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* and destination data in {d4, d5, d6, d7} */
+ vmull.u8 q8, d4, d3
+ vmull.u8 q9, d5, d3
+ vmull.u8 q10, d6, d3
+ vmull.u8 q11, d7, d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q8, q14
+ vraddhn.u16 d29, q9, q15
+ vraddhn.u16 d30, q10, q12
+ vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
.macro pixman_composite_add_n_8_8_process_pixblock_head
/* expecting source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 1a3741c..e3eca2b 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -69,6 +69,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+ uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
uint8_t, 1, uint16_t, 1)
@@ -298,6 +300,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, neon_composite_out_reverse_8_0565),
--
1.6.6.1

View File

@ -55,21 +55,21 @@ index f1ce0ba..b33da29 100644
void
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 383748a..969dfab 100644
index 1662d2c..c0f9af4 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -197,6 +197,11 @@ void
_pixman_bits_image_setup_accessors (bits_image_t *image);
@@ -256,6 +256,11 @@ _pixman_conical_gradient_iter_init (pixman_image_t *image,
int x, int y, int width, int height,
uint8_t *buffer, iter_flags_t flags);
void
+_pixman_bits_override_accessors (pixman_format_code_t format,
+ fetch_scanline_t fetch_func,
+ store_scanline_t store_func);
+
+void
_pixman_image_get_scanline_generic_64 (pixman_image_t *image,
int x,
int y,
+_pixman_bits_override_accessors (pixman_format_code_t format,
+ fetch_scanline_t fetch_func,
+ store_scanline_t store_func);
+
pixman_image_t *
_pixman_image_allocate (void);
--
1.6.6.1
1.7.4.rc2

View File

@ -1,43 +0,0 @@
require pixman.inc
SRC_URI[archive.md5sum] = "9e09fd6e58cbf9717140891e0b7d4a7a"
SRC_URI[archive.sha256sum] = "295f51416caf307ff7caf1153ee9b1d86b9f7f02a7876d12db6538d80451c5de"
LICENSE = "MIT & MIT-style & Public Domain"
LIC_FILES_CHKSUM = "file://COPYING;md5=e9d61bf6d32b58021e0eb0d41f223b6f \
file://pixman/pixman-matrix.c;endline=25;md5=ba6e8769bfaaee2c41698755af04c4be \
file://pixman/pixman-arm-neon-asm.h;endline=24;md5=9a9cc1e51abbf1da58f4d9528ec9d49b \
file://pixman/pixman-x64-mmx-emulation.h;beginline=4;endline=9;md5=4e32716f2efaa6c4659222667c339bb8"
PR = "${INC_PR}.2"
SRC_URI += "\
file://0002-Fix-argument-quoting-for-AC_INIT.patch \
file://0003-Sun-s-copyrights-belong-to-Oracle-now.patch \
file://0004-C-fast-path-for-a1-fill-operation.patch \
file://0005-ARM-added-neon_composite_over_n_8_8-fast-path.patch \
file://0006-ARM-introduced-fetch_mask_pixblock-macro-to-simplify.patch \
file://0007-ARM-better-NEON-instructions-scheduling-for-over_n_8.patch \
file://0008-ARM-added-neon_composite_over_8888_n_0565-fast-path.patch \
file://0009-ARM-reuse-common-NEON-code-for-over_-n_8-8888_n-8888.patch \
file://0010-ARM-added-neon_composite_over_0565_n_0565-fast-path.patch \
file://0011-ARM-added-neon_composite_add_8888_8_8888-fast-path.patch \
file://0012-ARM-better-NEON-instructions-scheduling-for-add_8888.patch \
file://0013-ARM-added-neon_composite_add_n_8_8888-fast-path.patch \
file://0014-ARM-added-neon_composite_add_8888_n_8888-fast-path.patch \
file://0015-ARM-added-flags-parameter-to-some-asm-fast-path-wrap.patch \
file://0016-ARM-added-neon_composite_in_n_8-fast-path.patch \
file://0017-add-_pixman_bits_override_accessors.patch \
file://0018-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
file://0019-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch \
file://0020-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch \
file://0021-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch \
file://0022-ARM-added-NEON-optimizations-for-fetch-store-r5g6b5-.patch \
file://0023-ARM-added-NEON-optimizations-for-fetch-store-a8-scan.patch \
file://0024-ARM-added-NEON-optimizations-for-fetching-x8r8g8b8-s.patch \
"
NEON = " --disable-arm-neon "
NEON_armv7a = " "
EXTRA_OECONF = "${NEON} --disable-gtk"

View File

@ -0,0 +1,29 @@
require pixman.inc
SRC_URI[archive.md5sum] = "e50975ace979cd416a505827c15191b4"
SRC_URI[archive.sha256sum] = "57783330ee2f96121dc267b7f25b98356fd09fe9de185cd39e72e906b6444013"
LICENSE = "MIT & MIT-style & Public Domain"
LIC_FILES_CHKSUM = "file://COPYING;md5=e9d61bf6d32b58021e0eb0d41f223b6f \
file://pixman/pixman-matrix.c;endline=25;md5=ba6e8769bfaaee2c41698755af04c4be \
file://pixman/pixman-arm-neon-asm.h;endline=24;md5=9a9cc1e51abbf1da58f4d9528ec9d49b \
file://pixman/pixman-x64-mmx-emulation.h;beginline=4;endline=9;md5=4e32716f2efaa6c4659222667c339bb8"
PR = "${INC_PR}.0"
SRC_URI += "\
file://0017-add-_pixman_bits_override_accessors.patch \
file://0018-Generic-C-implementation-of-pixman_blt-with-overlapp.patch \
file://0019-Support-of-overlapping-src-dst-for-pixman_blt_mmx.patch \
file://0020-Support-of-overlapping-src-dst-for-pixman_blt_sse2.patch \
file://0021-Support-of-overlapping-src-dst-for-pixman_blt_neon.patch \
file://0022-ARM-added-NEON-optimizations-for-fetch-store-r5g6b5-.patch \
file://0023-ARM-added-NEON-optimizations-for-fetch-store-a8-scan.patch \
file://0024-ARM-added-NEON-optimizations-for-fetching-x8r8g8b8-s.patch \
"
NEON = " --disable-arm-neon "
NEON_armv7a = " "
EXTRA_OECONF = "${NEON} --disable-gtk"