diff --git a/Makefile.common b/Makefile.common
index 7267055445..dbe2281a44 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -512,6 +512,10 @@ ifeq ($(HAVE_EXYNOS), 1)
DEFINES += $(DRM_CFLAGS) $(EXYNOS_CFLAGS)
endif
+ifeq ($(HAVE_SUNXI), 1)
+ OBJ += gfx/drivers/sunxi_gfx.o gfx/pixman/pixman-arm-neon-asm.o
+endif
+
ifeq ($(HAVE_VG), 1)
OBJ += gfx/drivers/vg.o libretro-common/gfx/math/matrix_3x3.o
DEFINES += $(VG_CFLAGS)
diff --git a/config.def.h b/config.def.h
index 554af06680..274ed32739 100644
--- a/config.def.h
+++ b/config.def.h
@@ -43,6 +43,7 @@ enum
VIDEO_NULL,
VIDEO_OMAP,
VIDEO_EXYNOS,
+ VIDEO_SUNXI,
AUDIO_RSOUND,
AUDIO_OSS,
diff --git a/gfx/drivers/sunxi_gfx.c b/gfx/drivers/sunxi_gfx.c
new file mode 100644
index 0000000000..2ede566ba0
--- /dev/null
+++ b/gfx/drivers/sunxi_gfx.c
@@ -0,0 +1,1387 @@
+/* RetroArch - A frontend for libretro.
+ * Copyright (C) 2013-2014 - Tobias Jakobi
+ *
+ * RetroArch is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with RetroArch.
+ * If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "../../general.h"
+#include "../../retroarch.h"
+#include "../video_viewport.h"
+#include "../video_monitor.h"
+#include "../font_renderer_driver.h"
+
+//MAC Specific includes for the driver
+#include
+#include
+#include
+#include
+#include
+
+// Lowlevel SunxiG2D functions block
+
+/* for tracking the ioctls API/ABI */
+#define SUNXI_DISP_VERSION_MAJOR 1
+#define SUNXI_DISP_VERSION_MINOR 0
+
+#define SUNXI_DISP_VERSION ((SUNXI_DISP_VERSION_MAJOR << 16) | SUNXI_DISP_VERSION_MINOR)
+
+#define FBIOGET_LAYER_HDL_0 0x4700
+#define FBIOGET_LAYER_HDL_1 0x4701
+
+#define FALLBACK_BLT() sunxi_g2d_try_fallback_blt(self, src_bits, \
+ dst_bits, src_stride, \
+ dst_stride, src_bpp, \
+ dst_bpp, src_x, src_y, \
+ dst_x, dst_y, w, h);
+
+#define __bool signed char
+
+typedef struct {
+ __u8 alpha;
+ __u8 red;
+ __u8 green;
+ __u8 blue;
+} __disp_color_t;
+typedef struct {
+ __s32 x;
+ __s32 y;
+ __u32 width;
+ __u32 height;
+} __disp_rect_t;
+typedef struct {
+ __u32 width;
+ __u32 height;
+} __disp_rectsz_t;
+typedef struct {
+ __s32 x;
+ __s32 y;
+} __disp_pos_t;
+
+typedef enum tag_DISP_CMD {
+ /* ----disp global---- */
+ DISP_CMD_VERSION = 0x00,
+ DISP_CMD_RESERVE1 = 0x01,
+ /* fail when the value is 0x02 in linux,why??? */
+ DISP_CMD_SET_BKCOLOR = 0x3f,
+ DISP_CMD_GET_BKCOLOR = 0x03,
+ DISP_CMD_SET_COLORKEY = 0x04,
+ DISP_CMD_GET_COLORKEY = 0x05,
+ DISP_CMD_SET_PALETTE_TBL = 0x06,
+ DISP_CMD_GET_PALETTE_TBL = 0x07,
+ DISP_CMD_SCN_GET_WIDTH = 0x08,
+ DISP_CMD_SCN_GET_HEIGHT = 0x09,
+ DISP_CMD_GET_OUTPUT_TYPE = 0x0a,
+ DISP_CMD_SET_EXIT_MODE = 0x0c,
+ DISP_CMD_SET_GAMMA_TABLE = 0x0d,
+ DISP_CMD_GAMMA_CORRECTION_ON = 0x0e,
+ DISP_CMD_GAMMA_CORRECTION_OFF = 0x0f,
+ DISP_CMD_START_CMD_CACHE = 0x10,
+ DISP_CMD_EXECUTE_CMD_AND_STOP_CACHE = 0x11,
+ DISP_CMD_SET_BRIGHT = 0x12,
+ DISP_CMD_SET_CONTRAST = 0x13,
+ DISP_CMD_SET_SATURATION = 0x14,
+ DISP_CMD_GET_BRIGHT = 0x16,
+ DISP_CMD_GET_CONTRAST = 0x17,
+ DISP_CMD_GET_SATURATION = 0x18,
+ DISP_CMD_ENHANCE_ON = 0x1a,
+ DISP_CMD_ENHANCE_OFF = 0x1b,
+ DISP_CMD_GET_ENHANCE_EN = 0x1c,
+ DISP_CMD_CLK_ON = 0x1d,
+ DISP_CMD_CLK_OFF = 0x1e,
+ /*
+ * when the screen is not used to display(lcd/tv/vga/hdmi) directly,
+ * maybe capture the screen and scaler to dram, or as a layer of
+ * another screen
+ */
+ DISP_CMD_SET_SCREEN_SIZE = 0x1f,
+ DISP_CMD_CAPTURE_SCREEN = 0x20, /* caputre screen and scaler to dram */
+ DISP_CMD_DE_FLICKER_ON = 0x21,
+ DISP_CMD_DE_FLICKER_OFF = 0x22,
+ DISP_CMD_SET_HUE = 0x23,
+ DISP_CMD_GET_HUE = 0x24,
+ DISP_CMD_DRC_OFF = 0x25,
+ DISP_CMD_GET_DRC_EN = 0x26,
+ DISP_CMD_DE_FLICKER_SET_WINDOW = 0x27,
+ DISP_CMD_DRC_SET_WINDOW = 0x28,
+ DISP_CMD_DRC_ON = 0x29,
+ DISP_CMD_GET_DE_FLICKER_EN = 0x2a,
+
+ /* ----layer---- */
+ DISP_CMD_LAYER_REQUEST = 0x40,
+ DISP_CMD_LAYER_RELEASE = 0x41,
+ DISP_CMD_LAYER_OPEN = 0x42,
+ DISP_CMD_LAYER_CLOSE = 0x43,
+ DISP_CMD_LAYER_SET_FB = 0x44,
+ DISP_CMD_LAYER_GET_FB = 0x45,
+ DISP_CMD_LAYER_SET_SRC_WINDOW = 0x46,
+ DISP_CMD_LAYER_GET_SRC_WINDOW = 0x47,
+ DISP_CMD_LAYER_SET_SCN_WINDOW = 0x48,
+ DISP_CMD_LAYER_GET_SCN_WINDOW = 0x49,
+ DISP_CMD_LAYER_SET_PARA = 0x4a,
+ DISP_CMD_LAYER_GET_PARA = 0x4b,
+ DISP_CMD_LAYER_ALPHA_ON = 0x4c,
+ DISP_CMD_LAYER_ALPHA_OFF = 0x4d,
+ DISP_CMD_LAYER_GET_ALPHA_EN = 0x4e,
+ DISP_CMD_LAYER_SET_ALPHA_VALUE = 0x4f,
+ DISP_CMD_LAYER_GET_ALPHA_VALUE = 0x50,
+ DISP_CMD_LAYER_CK_ON = 0x51,
+ DISP_CMD_LAYER_CK_OFF = 0x52,
+ DISP_CMD_LAYER_GET_CK_EN = 0x53,
+ DISP_CMD_LAYER_SET_PIPE = 0x54,
+ DISP_CMD_LAYER_GET_PIPE = 0x55,
+ DISP_CMD_LAYER_TOP = 0x56,
+ DISP_CMD_LAYER_BOTTOM = 0x57,
+ DISP_CMD_LAYER_GET_PRIO = 0x58,
+ DISP_CMD_LAYER_SET_SMOOTH = 0x59,
+ DISP_CMD_LAYER_GET_SMOOTH = 0x5a,
+ DISP_CMD_LAYER_SET_BRIGHT = 0x5b, /* brightness */
+ DISP_CMD_LAYER_SET_CONTRAST = 0x5c, /* contrast */
+ DISP_CMD_LAYER_SET_SATURATION = 0x5d, /* saturation */
+ DISP_CMD_LAYER_SET_HUE = 0x5e, /* hue, chroma */
+ DISP_CMD_LAYER_GET_BRIGHT = 0x5f,
+ DISP_CMD_LAYER_GET_CONTRAST = 0x60,
+ DISP_CMD_LAYER_GET_SATURATION = 0x61,
+ DISP_CMD_LAYER_GET_HUE = 0x62,
+ DISP_CMD_LAYER_ENHANCE_ON = 0x63,
+ DISP_CMD_LAYER_ENHANCE_OFF = 0x64,
+ DISP_CMD_LAYER_GET_ENHANCE_EN = 0x65,
+ DISP_CMD_LAYER_VPP_ON = 0x67,
+ DISP_CMD_LAYER_VPP_OFF = 0x68,
+ DISP_CMD_LAYER_GET_VPP_EN = 0x69,
+ DISP_CMD_LAYER_SET_LUMA_SHARP_LEVEL = 0x6a,
+ DISP_CMD_LAYER_GET_LUMA_SHARP_LEVEL = 0x6b,
+ DISP_CMD_LAYER_SET_CHROMA_SHARP_LEVEL = 0x6c,
+ DISP_CMD_LAYER_GET_CHROMA_SHARP_LEVEL = 0x6d,
+ DISP_CMD_LAYER_SET_WHITE_EXTEN_LEVEL = 0x6e,
+ DISP_CMD_LAYER_GET_WHITE_EXTEN_LEVEL = 0x6f,
+ DISP_CMD_LAYER_SET_BLACK_EXTEN_LEVEL = 0x70,
+ DISP_CMD_LAYER_GET_BLACK_EXTEN_LEVEL = 0x71,
+
+ /* ----scaler---- */
+ DISP_CMD_SCALER_REQUEST = 0x80,
+ DISP_CMD_SCALER_RELEASE = 0x81,
+ DISP_CMD_SCALER_EXECUTE = 0x82,
+
+ /* ----hwc---- */
+ DISP_CMD_HWC_OPEN = 0xc0,
+ DISP_CMD_HWC_CLOSE = 0xc1,
+ DISP_CMD_HWC_SET_POS = 0xc2,
+ DISP_CMD_HWC_GET_POS = 0xc3,
+ DISP_CMD_HWC_SET_FB = 0xc4,
+ DISP_CMD_HWC_SET_PALETTE_TABLE = 0xc5,
+
+ /* ----video---- */
+ DISP_CMD_VIDEO_START = 0x100,
+ DISP_CMD_VIDEO_STOP = 0x101,
+ DISP_CMD_VIDEO_SET_FB = 0x102,
+ DISP_CMD_VIDEO_GET_FRAME_ID = 0x103,
+ DISP_CMD_VIDEO_GET_DIT_INFO = 0x104,
+
+ /* ----lcd---- */
+ DISP_CMD_LCD_ON = 0x140,
+ DISP_CMD_LCD_OFF = 0x141,
+ DISP_CMD_LCD_SET_BRIGHTNESS = 0x142,
+ DISP_CMD_LCD_GET_BRIGHTNESS = 0x143,
+ DISP_CMD_LCD_CPUIF_XY_SWITCH = 0x146,
+ DISP_CMD_LCD_CHECK_OPEN_FINISH = 0x14a,
+ DISP_CMD_LCD_CHECK_CLOSE_FINISH = 0x14b,
+ DISP_CMD_LCD_SET_SRC = 0x14c,
+ DISP_CMD_LCD_USER_DEFINED_FUNC = 0x14d,
+
+ /* ----tv---- */
+ DISP_CMD_TV_ON = 0x180,
+ DISP_CMD_TV_OFF = 0x181,
+ DISP_CMD_TV_SET_MODE = 0x182,
+ DISP_CMD_TV_GET_MODE = 0x183,
+ DISP_CMD_TV_AUTOCHECK_ON = 0x184,
+ DISP_CMD_TV_AUTOCHECK_OFF = 0x185,
+ DISP_CMD_TV_GET_INTERFACE = 0x186,
+ DISP_CMD_TV_SET_SRC = 0x187,
+ DISP_CMD_TV_GET_DAC_STATUS = 0x188,
+ DISP_CMD_TV_SET_DAC_SOURCE = 0x189,
+ DISP_CMD_TV_GET_DAC_SOURCE = 0x18a,
+
+ /* ----hdmi---- */
+ DISP_CMD_HDMI_ON = 0x1c0,
+ DISP_CMD_HDMI_OFF = 0x1c1,
+ DISP_CMD_HDMI_SET_MODE = 0x1c2,
+ DISP_CMD_HDMI_GET_MODE = 0x1c3,
+ DISP_CMD_HDMI_SUPPORT_MODE = 0x1c4,
+ DISP_CMD_HDMI_GET_HPD_STATUS = 0x1c5,
+ DISP_CMD_HDMI_SET_SRC = 0x1c6,
+
+ /* ----vga---- */
+ DISP_CMD_VGA_ON = 0x200,
+ DISP_CMD_VGA_OFF = 0x201,
+ DISP_CMD_VGA_SET_MODE = 0x202,
+ DISP_CMD_VGA_GET_MODE = 0x203,
+ DISP_CMD_VGA_SET_SRC = 0x204,
+
+ /* ----sprite---- */
+ DISP_CMD_SPRITE_OPEN = 0x240,
+ DISP_CMD_SPRITE_CLOSE = 0x241,
+ DISP_CMD_SPRITE_SET_FORMAT = 0x242,
+ DISP_CMD_SPRITE_GLOBAL_ALPHA_ENABLE = 0x243,
+ DISP_CMD_SPRITE_GLOBAL_ALPHA_DISABLE = 0x244,
+ DISP_CMD_SPRITE_GET_GLOBAL_ALPHA_ENABLE = 0x252,
+ DISP_CMD_SPRITE_SET_GLOBAL_ALPHA_VALUE = 0x245,
+ DISP_CMD_SPRITE_GET_GLOBAL_ALPHA_VALUE = 0x253,
+ DISP_CMD_SPRITE_SET_ORDER = 0x246,
+ DISP_CMD_SPRITE_GET_TOP_BLOCK = 0x250,
+ DISP_CMD_SPRITE_GET_BOTTOM_BLOCK = 0x251,
+ DISP_CMD_SPRITE_SET_PALETTE_TBL = 0x247,
+ DISP_CMD_SPRITE_GET_BLOCK_NUM = 0x259,
+ DISP_CMD_SPRITE_BLOCK_REQUEST = 0x248,
+ DISP_CMD_SPRITE_BLOCK_RELEASE = 0x249,
+ DISP_CMD_SPRITE_BLOCK_OPEN = 0x257,
+ DISP_CMD_SPRITE_BLOCK_CLOSE = 0x258,
+ DISP_CMD_SPRITE_BLOCK_SET_SOURCE_WINDOW = 0x25a,
+ DISP_CMD_SPRITE_BLOCK_GET_SOURCE_WINDOW = 0x25b,
+ DISP_CMD_SPRITE_BLOCK_SET_SCREEN_WINDOW = 0x24a,
+ DISP_CMD_SPRITE_BLOCK_GET_SCREEN_WINDOW = 0x24c,
+ DISP_CMD_SPRITE_BLOCK_SET_FB = 0x24b,
+ DISP_CMD_SPRITE_BLOCK_GET_FB = 0x24d,
+ DISP_CMD_SPRITE_BLOCK_SET_PARA = 0x25c,
+ DISP_CMD_SPRITE_BLOCK_GET_PARA = 0x25d,
+ DISP_CMD_SPRITE_BLOCK_SET_TOP = 0x24e,
+ DISP_CMD_SPRITE_BLOCK_SET_BOTTOM = 0x24f,
+ DISP_CMD_SPRITE_BLOCK_GET_PREV_BLOCK = 0x254,
+ DISP_CMD_SPRITE_BLOCK_GET_NEXT_BLOCK = 0x255,
+ DISP_CMD_SPRITE_BLOCK_GET_PRIO = 0x256,
+
+ /* ----framebuffer---- */
+ DISP_CMD_FB_REQUEST = 0x280,
+ DISP_CMD_FB_RELEASE = 0x281,
+ DISP_CMD_FB_GET_PARA = 0x282,
+ DISP_CMD_GET_DISP_INIT_PARA = 0x283,
+
+ /* ---for Displayer Test -------- */
+ DISP_CMD_MEM_REQUEST = 0x2c0,
+ DISP_CMD_MEM_RELASE = 0x2c1,
+ DISP_CMD_MEM_GETADR = 0x2c2,
+ DISP_CMD_MEM_SELIDX = 0x2c3,
+
+ DISP_CMD_SUSPEND = 0x2d0,
+ DISP_CMD_RESUME = 0x2d1,
+
+ DISP_CMD_PRINT_REG = 0x2e0,
+
+ /* ---pwm -------- */
+ DISP_CMD_PWM_SET_PARA = 0x300,
+ DISP_CMD_PWM_GET_PARA = 0x301,
+} __disp_cmd_t;
+
+typedef enum {
+ DISP_FORMAT_1BPP = 0x0,
+ DISP_FORMAT_2BPP = 0x1,
+ DISP_FORMAT_4BPP = 0x2,
+ DISP_FORMAT_8BPP = 0x3,
+ DISP_FORMAT_RGB655 = 0x4,
+ DISP_FORMAT_RGB565 = 0x5,
+ DISP_FORMAT_RGB556 = 0x6,
+ DISP_FORMAT_ARGB1555 = 0x7,
+ DISP_FORMAT_RGBA5551 = 0x8,
+ DISP_FORMAT_ARGB888 = 0x9, /* alpha padding to 0xff */
+ DISP_FORMAT_ARGB8888 = 0xa,
+ DISP_FORMAT_RGB888 = 0xb,
+ DISP_FORMAT_ARGB4444 = 0xc,
+
+ DISP_FORMAT_YUV444 = 0x10,
+ DISP_FORMAT_YUV422 = 0x11,
+ DISP_FORMAT_YUV420 = 0x12,
+ DISP_FORMAT_YUV411 = 0x13,
+ DISP_FORMAT_CSIRGB = 0x14,
+} __disp_pixel_fmt_t;
+
+typedef enum {
+ /* interleaved,1 address */
+ DISP_MOD_INTERLEAVED = 0x1,
+ /*
+ * No macroblock plane mode, 3 address, RGB/YUV each channel were stored
+ */
+ DISP_MOD_NON_MB_PLANAR = 0x0,
+ /* No macroblock UV packaged mode, 2 address, Y and UV were stored */
+ DISP_MOD_NON_MB_UV_COMBINED = 0x2,
+ /* Macroblock plane mode, 3 address,RGB/YUV each channel were stored */
+ DISP_MOD_MB_PLANAR = 0x4,
+ /* Macroblock UV packaged mode, 2 address, Y and UV were stored */
+ DISP_MOD_MB_UV_COMBINED = 0x6,
+} __disp_pixel_mod_t;
+
+typedef enum {
+ /* for interleave argb8888 */
+ DISP_SEQ_ARGB = 0x0, /* A at a high level */
+ DISP_SEQ_BGRA = 0x2,
+
+ /* for interleaved yuv422 */
+ DISP_SEQ_UYVY = 0x3,
+ DISP_SEQ_YUYV = 0x4,
+ DISP_SEQ_VYUY = 0x5,
+ DISP_SEQ_YVYU = 0x6,
+
+ /* for interleaved yuv444 */
+ DISP_SEQ_AYUV = 0x7,
+ DISP_SEQ_VUYA = 0x8,
+
+ /* for uv_combined yuv420 */
+ DISP_SEQ_UVUV = 0x9,
+ DISP_SEQ_VUVU = 0xa,
+
+ /* for 16bpp rgb */
+ DISP_SEQ_P10 = 0xd, /* p1 high */
+ DISP_SEQ_P01 = 0xe, /* p0 high */
+
+ /* for planar format or 8bpp rgb */
+ DISP_SEQ_P3210 = 0xf, /* p3 high */
+ DISP_SEQ_P0123 = 0x10, /* p0 high */
+
+ /* for 4bpp rgb */
+ DISP_SEQ_P76543210 = 0x11,
+ DISP_SEQ_P67452301 = 0x12,
+ DISP_SEQ_P10325476 = 0x13,
+ DISP_SEQ_P01234567 = 0x14,
+
+ /* for 2bpp rgb */
+ /* 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 */
+ DISP_SEQ_2BPP_BIG_BIG = 0x15,
+ /* 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 */
+ DISP_SEQ_2BPP_BIG_LITTER = 0x16,
+ /* 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12 */
+ DISP_SEQ_2BPP_LITTER_BIG = 0x17,
+ /* 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 */
+ DISP_SEQ_2BPP_LITTER_LITTER = 0x18,
+
+ /* for 1bpp rgb */
+ /*
+ * 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+ * 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ */
+ DISP_SEQ_1BPP_BIG_BIG = 0x19,
+ /*
+ * 24,25,26,27,28,29,30,31,16,17,18,19,20,21,22,23,
+ * 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7
+ */
+ DISP_SEQ_1BPP_BIG_LITTER = 0x1a,
+ /*
+ * 7, 6, 5, 4, 3, 2, 1, 0,15,14,13,12,11,10, 9, 8,
+ * 23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24
+ */
+ DISP_SEQ_1BPP_LITTER_BIG = 0x1b,
+ /*
+ * 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+ * 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ */
+ DISP_SEQ_1BPP_LITTER_LITTER = 0x1c,
+} __disp_pixel_seq_t;
+
+typedef enum {
+ DISP_3D_SRC_MODE_TB = 0x0, /* top bottom */
+ DISP_3D_SRC_MODE_FP = 0x1, /* frame packing */
+ DISP_3D_SRC_MODE_SSF = 0x2, /* side by side full */
+ DISP_3D_SRC_MODE_SSH = 0x3, /* side by side half */
+ DISP_3D_SRC_MODE_LI = 0x4, /* line interleaved */
+} __disp_3d_src_mode_t;
+
+typedef enum {
+ DISP_3D_OUT_MODE_CI_1 = 0x5, /* column interlaved 1 */
+ DISP_3D_OUT_MODE_CI_2 = 0x6, /* column interlaved 2 */
+ DISP_3D_OUT_MODE_CI_3 = 0x7, /* column interlaved 3 */
+ DISP_3D_OUT_MODE_CI_4 = 0x8, /* column interlaved 4 */
+ DISP_3D_OUT_MODE_LIRGB = 0x9, /* line interleaved rgb */
+
+ DISP_3D_OUT_MODE_TB = 0x0, /* top bottom */
+ DISP_3D_OUT_MODE_FP = 0x1, /* frame packing */
+ DISP_3D_OUT_MODE_SSF = 0x2, /* side by side full */
+ DISP_3D_OUT_MODE_SSH = 0x3, /* side by side half */
+ DISP_3D_OUT_MODE_LI = 0x4, /* line interleaved */
+ DISP_3D_OUT_MODE_FA = 0xa, /* field alternative */
+} __disp_3d_out_mode_t;
+
+typedef enum {
+ DISP_BT601 = 0,
+ DISP_BT709 = 1,
+ DISP_YCC = 2,
+ DISP_VXYCC = 3,
+} __disp_cs_mode_t;
+
+typedef enum {
+ DISP_LAYER_WORK_MODE_NORMAL = 0, /* normal work mode */
+ DISP_LAYER_WORK_MODE_PALETTE = 1, /* palette work mode */
+ /* internal frame buffer work mode */
+ DISP_LAYER_WORK_MODE_INTER_BUF = 2,
+ DISP_LAYER_WORK_MODE_GAMMA = 3, /* gamma correction work mode */
+ DISP_LAYER_WORK_MODE_SCALER = 4, /* scaler work mode */
+} __disp_layer_work_mode_t;
+
+typedef struct {
+ int fd_fb;
+ int fd_disp;
+ int fd_g2d;
+ int fb_id; /* /dev/fb0 = 0, /dev/fb1 = 1 */
+
+ int xres, yres, bits_per_pixel;
+ uint8_t *framebuffer_addr; /* mmapped address */
+ uintptr_t framebuffer_paddr; /* physical address */
+ uint32_t framebuffer_size; /* total size of the framebuffer */
+ int framebuffer_height;/* virtual vertical resolution */
+ uint32_t gfx_layer_size; /* the size of the primary layer */
+
+ uint8_t *xserver_fbmem; /* framebuffer mapping done by xserver */
+
+ /* Hardware cursor support */
+ int cursor_enabled;
+ int cursor_x, cursor_y;
+
+ /* Layers support */
+ int gfx_layer_id;
+ int layer_id;
+ int layer_has_scaler;
+
+ int layer_buf_x, layer_buf_y, layer_buf_w, layer_buf_h;
+ int layer_win_x, layer_win_y;
+ int layer_scaler_is_enabled;
+ int layer_format;
+
+ /* G2D accelerated implementation of blt2d_i interface */
+ //blt2d_i blt2d;
+ /* Optional fallback interface to handle unsupported operations */
+ //blt2d_i *fallback_blt2d;
+} sunxi_disp_t;
+
+typedef struct {
+ /*
+ * The way these are treated today, these are physical addresses. Are
+ * there any actual userspace applications out there that use this?
+ * -- libv.
+ */
+ /*
+ * the contents of the frame buffer address for rgb type only addr[0]
+ * valid
+ */
+ __u32 addr[3];
+ __disp_rectsz_t size; /* unit is pixel */
+ __disp_pixel_fmt_t format;
+ __disp_pixel_seq_t seq;
+ __disp_pixel_mod_t mode;
+ /*
+ * blue red color swap flag, FALSE:RGB; TRUE:BGR,only used in rgb format
+ */
+ __bool br_swap;
+ __disp_cs_mode_t cs_mode; /* color space */
+ __bool b_trd_src; /* if 3d source, used for scaler mode layer */
+ /* source 3d mode, used for scaler mode layer */
+ __disp_3d_src_mode_t trd_mode;
+ __u32 trd_right_addr[3]; /* used when in frame packing 3d mode */
+} __disp_fb_t;
+
+typedef struct {
+ __disp_layer_work_mode_t mode; /* layer work mode */
+ __bool b_from_screen;
+ /*
+ * layer pipe,0/1,if in scaler mode, scaler0 must be pipe0,
+ * scaler1 must be pipe1
+ */
+ __u8 pipe;
+ /*
+ * layer priority,can get layer prio,but never set layer prio.
+ * From bottom to top, priority from low to high
+ */
+ __u8 prio;
+ __bool alpha_en; /* layer global alpha enable */
+ __u16 alpha_val; /* layer global alpha value */
+ __bool ck_enable; /* layer color key enable */
+ /* framebuffer source window,only care x,y if is not scaler mode */
+ __disp_rect_t src_win;
+ __disp_rect_t scn_win; /* screen window */
+ __disp_fb_t fb; /* framebuffer */
+ __bool b_trd_out; /* if output 3d mode, used for scaler mode layer */
+ /* output 3d mode, used for scaler mode layer */
+ __disp_3d_out_mode_t out_trd_mode;
+} __disp_layer_info_t;
+
+int sunxi_hw_cursor_hide(sunxi_disp_t *ctx)
+{
+ int result;
+ uint32_t tmp[4];
+ tmp[0] = ctx->fb_id;
+ result = ioctl(ctx->fd_disp, DISP_CMD_HWC_CLOSE, &tmp);
+ if (result >= 0)
+ ctx->cursor_enabled = 0;
+ return result;
+}
+
+/*****************************************************************************
+ * Support for scaled layers *
+ *****************************************************************************/
+
+static int sunxi_layer_change_work_mode(sunxi_disp_t *ctx, int new_mode)
+{
+ __disp_layer_info_t layer_info;
+ uint32_t tmp[4];
+
+ if (ctx->layer_id < 0)
+ return -1;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&layer_info;
+ if (ioctl(ctx->fd_disp, DISP_CMD_LAYER_GET_PARA, tmp) < 0)
+ return -1;
+
+ layer_info.mode = new_mode;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&layer_info;
+ return ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_PARA, tmp);
+}
+
+int sunxi_layer_reserve(sunxi_disp_t *ctx)
+{
+ __disp_layer_info_t layer_info;
+ uint32_t tmp[4];
+
+ /* try to allocate a layer */
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = DISP_LAYER_WORK_MODE_NORMAL;
+ ctx->layer_id = ioctl(ctx->fd_disp, DISP_CMD_LAYER_REQUEST, &tmp);
+ if (ctx->layer_id < 0)
+ return -1;
+
+ /* Initially set the layer configuration to something reasonable */
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&layer_info;
+ if (ioctl(ctx->fd_disp, DISP_CMD_LAYER_GET_PARA, tmp) < 0)
+ return -1;
+
+ /* the screen and overlay layers need to be in different pipes */
+ layer_info.pipe = 1;
+ layer_info.alpha_en = 1;
+ layer_info.alpha_val = 255;
+
+ layer_info.fb.addr[0] = ctx->framebuffer_paddr;
+ layer_info.fb.size.width = 1;
+ layer_info.fb.size.height = 1;
+ layer_info.fb.format = DISP_FORMAT_ARGB8888;
+ layer_info.fb.seq = DISP_SEQ_ARGB;
+ layer_info.fb.mode = DISP_MOD_INTERLEAVED;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&layer_info;
+ if (ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_PARA, tmp) < 0)
+ return -1;
+
+ /* Now probe the scaler mode to see if there is a free scaler available */
+ if (sunxi_layer_change_work_mode(ctx, DISP_LAYER_WORK_MODE_SCALER) == 0)
+ ctx->layer_has_scaler = 1;
+
+ /* Revert back to normal mode */
+ sunxi_layer_change_work_mode(ctx, DISP_LAYER_WORK_MODE_NORMAL);
+ ctx->layer_scaler_is_enabled = 0;
+ ctx->layer_format = DISP_FORMAT_ARGB8888;
+
+ return ctx->layer_id;
+}
+
+int sunxi_layer_set_output_window(sunxi_disp_t *ctx, int x, int y, int w, int h)
+{
+ __disp_rect_t buf_rect = {
+ ctx->layer_buf_x, ctx->layer_buf_y,
+ ctx->layer_buf_w, ctx->layer_buf_h
+ };
+ __disp_rect_t win_rect = { x, y, w, h };
+ uint32_t tmp[4];
+ int err;
+
+ if (ctx->layer_id < 0 || w <= 0 || h <= 0)
+ return -1;
+
+ /*
+ * Handle negative window Y coordinates (workaround a bug).
+ * The Allwinner A10/A13 display controller hardware is expected to
+ * support negative coordinates of the top left corners of the layers.
+ * But there is some bug either in the kernel driver or in the hardware,
+ * which messes up the picture on screen when the Y coordinate is negative
+ * for YUV layer. Negative X coordinates are not affected. RGB formats
+ * are not affected too.
+ *
+ * We fix this by just recalculating which part of the buffer in memory
+ * corresponds to Y=0 on screen and adjust the input buffer settings.
+ */
+ if (ctx->layer_format == DISP_FORMAT_YUV420 &&
+ (y < 0 || ctx->layer_win_y < 0)) {
+ if (win_rect.y < 0) {
+ int y_shift = -(double)y * buf_rect.height / win_rect.height;
+ buf_rect.y += y_shift;
+ buf_rect.height -= y_shift;
+ win_rect.height += win_rect.y;
+ win_rect.y = 0;
+ }
+
+ if (buf_rect.height <= 0 || win_rect.height <= 0) {
+ /* No part of the window is visible. Just construct a fake rectangle
+ * outside the screen as a window placement (but with a non-negative Y
+ * coordinate). Do this to avoid passing bogus negative heights to
+ * the kernel driver (who knows how it would react?) */
+ win_rect.x = -1;
+ win_rect.y = 0;
+ win_rect.width = 1;
+ win_rect.height = 1;
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&win_rect;
+ return ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_SCN_WINDOW, &tmp);
+ }
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&buf_rect;
+ if ((err = ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_SRC_WINDOW, &tmp)))
+ return err;
+ }
+ /* Save the new non-adjusted window position */
+ ctx->layer_win_x = x;
+ ctx->layer_win_y = y;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&win_rect;
+ return ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_SCN_WINDOW, &tmp);
+}
+
+int sunxi_layer_show(sunxi_disp_t *ctx)
+{
+ uint32_t tmp[4];
+
+ if (ctx->layer_id < 0)
+ return -1;
+
+ /* YUV formats need to use a scaler */
+ if (ctx->layer_format == DISP_FORMAT_YUV420 && !ctx->layer_scaler_is_enabled) {
+ if (sunxi_layer_change_work_mode(ctx, DISP_LAYER_WORK_MODE_SCALER) == 0)
+ ctx->layer_scaler_is_enabled = 1;
+ }
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ return ioctl(ctx->fd_disp, DISP_CMD_LAYER_OPEN, &tmp);
+}
+
+int sunxi_layer_release(sunxi_disp_t *ctx)
+{
+ int result;
+ uint32_t tmp[4];
+
+ if (ctx->layer_id < 0)
+ return -1;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ ioctl(ctx->fd_disp, DISP_CMD_LAYER_RELEASE, &tmp);
+
+ ctx->layer_id = -1;
+ ctx->layer_has_scaler = 0;
+ return 0;
+}
+
+
+int sunxi_layer_set_rgb_input_buffer(sunxi_disp_t *ctx,
+ int bpp,
+ uint32_t offset_in_framebuffer,
+ int width,
+ int height,
+ int stride)
+{
+ __disp_fb_t fb;
+ __disp_rect_t rect = { 0, 0, width, height };
+ uint32_t tmp[4];
+ memset(&fb, 0, sizeof(fb));
+
+ if (ctx->layer_id < 0)
+ return -1;
+
+ if (!ctx->layer_scaler_is_enabled) {
+ if (sunxi_layer_change_work_mode(ctx, DISP_LAYER_WORK_MODE_SCALER) == 0)
+ ctx->layer_scaler_is_enabled = 1;
+ else
+ return -1;
+ }
+
+ fb.addr[0] = ctx->framebuffer_paddr + offset_in_framebuffer;
+ fb.size.height = height;
+ if (bpp == 32) {
+ fb.format = DISP_FORMAT_ARGB8888;
+ fb.seq = DISP_SEQ_ARGB;
+ fb.mode = DISP_MOD_INTERLEAVED;
+ fb.size.width = stride;
+ } else if (bpp == 16) {
+ fb.format = DISP_FORMAT_RGB565;
+ fb.seq = DISP_SEQ_P10;
+ fb.mode = DISP_MOD_INTERLEAVED;
+ fb.size.width = stride * 2;
+ } else {
+ return -1;
+ }
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)&fb;
+ if (ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_FB, &tmp) < 0)
+ return -1;
+
+ ctx->layer_buf_x = rect.x;
+ ctx->layer_buf_y = rect.y;
+ ctx->layer_buf_w = rect.width;
+ ctx->layer_buf_h = rect.height;
+ ctx->layer_format = fb.format;
+
+ tmp[0] = ctx->fb_id;
+ tmp[1] = ctx->layer_id;
+ tmp[2] = (uintptr_t)▭
+ return ioctl(ctx->fd_disp, DISP_CMD_LAYER_SET_SRC_WINDOW, &tmp);
+}
+
+sunxi_disp_t *sunxi_disp_init(const char *device, void *xserver_fbmem)
+{
+ sunxi_disp_t *ctx = calloc(sizeof(sunxi_disp_t), 1);
+ struct fb_var_screeninfo fb_var;
+ struct fb_fix_screeninfo fb_fix;
+
+ int tmp, version;
+ int gfx_layer_size;
+ int ovl_layer_size;
+
+ /* use /dev/fb0 by default */
+ if (!device)
+ device = "/dev/fb0";
+
+ if (strcmp(device, "/dev/fb0") == 0) {
+ ctx->fb_id = 0;
+ }
+ else if (strcmp(device, "/dev/fb1") == 0) {
+ ctx->fb_id = 1;
+ }
+ else
+ {
+ free(ctx);
+ return NULL;
+ }
+
+ /* store the already existing mapping done by xserver */
+ ctx->xserver_fbmem = xserver_fbmem;
+
+ ctx->fd_disp = open("/dev/disp", O_RDWR);
+
+ /* maybe it's even not a sunxi hardware */
+ if (ctx->fd_disp < 0) {
+ free(ctx);
+ return NULL;
+ }
+
+ /* version check */
+ tmp = SUNXI_DISP_VERSION;
+ version = ioctl(ctx->fd_disp, DISP_CMD_VERSION, &tmp);
+ if (version < 0) {
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ ctx->fd_fb = open(device, O_RDWR);
+ if (ctx->fd_fb < 0) {
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ if (ioctl(ctx->fd_fb, FBIOGET_VSCREENINFO, &fb_var) < 0 ||
+ ioctl(ctx->fd_fb, FBIOGET_FSCREENINFO, &fb_fix) < 0)
+ {
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ ctx->xres = fb_var.xres;
+ ctx->yres = fb_var.yres;
+ ctx->bits_per_pixel = fb_var.bits_per_pixel;
+ ctx->framebuffer_paddr = fb_fix.smem_start;
+ ctx->framebuffer_size = fb_fix.smem_len;
+ ctx->framebuffer_height = ctx->framebuffer_size /
+ (ctx->xres * ctx->bits_per_pixel / 8);
+ ctx->gfx_layer_size = ctx->xres * ctx->yres * fb_var.bits_per_pixel / 8;
+
+ if (ctx->framebuffer_size < ctx->gfx_layer_size) {
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ if (ctx->xserver_fbmem) {
+ /* use already existing mapping */
+ ctx->framebuffer_addr = ctx->xserver_fbmem;
+ }
+ else {
+ /* mmap framebuffer memory */
+ ctx->framebuffer_addr = (uint8_t *)mmap(0, ctx->framebuffer_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, ctx->fd_fb, 0);
+ if (ctx->framebuffer_addr == MAP_FAILED) {
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+ }
+
+ ctx->cursor_enabled = 0;
+ ctx->cursor_x = -1;
+ ctx->cursor_y = -1;
+
+ /* Get the id of the screen layer */
+ if (ioctl(ctx->fd_fb,
+ ctx->fb_id == 0 ? FBIOGET_LAYER_HDL_0 : FBIOGET_LAYER_HDL_1,
+ &ctx->gfx_layer_id))
+ {
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ if (sunxi_layer_reserve(ctx) < 0)
+ {
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ free(ctx);
+ return NULL;
+ }
+
+ ctx->fd_g2d = open("/dev/g2d", O_RDWR);
+
+ return ctx;
+}
+
+int sunxi_disp_close(sunxi_disp_t *ctx)
+{
+ if (ctx->fd_disp >= 0) {
+ if (ctx->fd_g2d >= 0) {
+ close(ctx->fd_g2d);
+ }
+ /* release layer */
+ sunxi_layer_release(ctx);
+ /* disable cursor */
+ if (ctx->cursor_enabled)
+ sunxi_hw_cursor_hide(ctx);
+ /* close descriptors */
+ if (!ctx->xserver_fbmem)
+ munmap(ctx->framebuffer_addr, ctx->framebuffer_size);
+ close(ctx->fd_fb);
+ close(ctx->fd_disp);
+ ctx->fd_disp = -1;
+ free(ctx);
+ }
+ return 0;
+}
+
+int sunxi_wait_for_vsync(sunxi_disp_t *ctx)
+{
+ return ioctl(ctx->fd_fb, FBIO_WAITFORVSYNC, 0);
+}
+
+// END of lowlevel SunxiG2D functions block
+
+void pixman_composite_src_0565_8888_asm_neon(int width,
+ int height,
+ uint32_t *dst,
+ int dst_stride_pixels,
+ uint16_t *src,
+ int src_stride_pixels);
+
+void pixman_composite_src_8888_8888_asm_neon(int width,
+ int height,
+ uint32_t *dst,
+ int dst_stride_pixels,
+ uint16_t *src,
+ int src_stride_pixels);
+
+// Pointer to the blitting function. Will be asigned when we find out what bpp the core uses.
+void(*pixman_blit)();
+
+extern void *memcpy_neon(void *dst, const void *src, size_t n);
+
+static void *vsync_thread_func (void *arg);
+
+pthread_t vsync_thread;
+
+pthread_cond_t vsync_condition;
+pthread_mutex_t queue_mutex;
+pthread_mutex_t vsync_cond_mutex;
+
+sunxi_disp_t *disp;
+
+struct sunxi_page {
+ unsigned numpage;
+ unsigned offset;
+ unsigned yoffset;
+ bool used;
+ // Since each page has it's own used bool, it needs it's own mutex
+ // to isolate write access to that bool.
+ pthread_mutex_t page_used_mutex;
+ struct sunxi_page *next;
+};
+
+struct sunxi_video {
+ void *font;
+ const font_renderer_driver_t *font_driver;
+
+ uint8_t font_rgb[4];
+
+ /* current dimensions of the emulator fb */
+ unsigned src_width;
+ unsigned src_height;
+ unsigned src_pitch;
+ unsigned src_bpp;
+ unsigned src_bytes_per_pixel;
+ unsigned dst_pitch;
+ unsigned visible_width;
+ unsigned bytes_per_pixel;
+ unsigned numpages;
+
+ struct sunxi_page *pages;
+ struct sunxi_page *queue;
+ struct sunxi_page *current_page;
+ unsigned pageflip_pending;
+
+ // Keep the vsync while loop going. Set to false to exit.
+ bool keep_vsync;
+
+ // Variables to restore screen on exit
+ unsigned screensize;
+ char *screen_bck;
+
+ /* menu data */
+ unsigned menu_rotation;
+ bool menu_active;
+
+ bool aspect_changed;
+};
+
+int sunxi_wait_flip()
+{
+ // Wait for next vsync
+ return ioctl(disp->fd_fb, FBIO_WAITFORVSYNC, 0);
+}
+
+void queue_page (struct sunxi_page *page, void *data) {
+ struct sunxi_video *_dispvars = data;
+ struct sunxi_page *ppage = _dispvars->queue;
+ if (ppage == NULL)
+ _dispvars->queue = page;
+ else {
+ while (ppage->next != NULL) {
+ ppage = ppage->next;
+ }
+ ppage->next = page;
+ page->next = NULL;
+ }
+}
+
+struct sunxi_page *unqueue_page (void *data) {
+ struct sunxi_video *_dispvars = data;
+ struct sunxi_page *page;
+ page = _dispvars->queue;
+ if (page != NULL) {
+ _dispvars->queue = page->next;
+ page->next = NULL;
+ return page;
+ }
+ else return NULL;
+}
+
+/* Find a free page, clear it if necessary, and return the page. If *
+ * no free page is available when called, wait for a page flip. */
+static struct sunxi_page *sunxi_get_free_page(void *data)
+{
+ struct sunxi_video *_dispvars = data;
+ struct sunxi_page *page = NULL;
+ unsigned i;
+
+ /* Wait until a free page is available. */
+ while (page == NULL) {
+ for (i = 0; i < _dispvars->numpages; ++i) {
+ if (!_dispvars->pages[i].used){
+ page = &_dispvars->pages[i];
+ break;
+ }
+ }
+ if (page == NULL) {
+ pthread_mutex_lock (&vsync_cond_mutex);
+ pthread_cond_wait (&vsync_condition, &vsync_cond_mutex);
+ pthread_mutex_unlock (&vsync_cond_mutex);
+ }
+ }
+
+ pthread_mutex_lock(&page->page_used_mutex);
+ page->used = true;
+ pthread_mutex_unlock(&page->page_used_mutex);
+ return page;
+}
+
+void sunxi_blank_console (void *data) {
+ struct sunxi_video *_dispvars = data;
+
+ // Disable cursor blinking so it's not visible in RetroArch
+ system("setterm -cursor off");
+
+ // Figure out the size of the screen in bytes
+ _dispvars->screensize = disp->xres * disp->yres * disp->bits_per_pixel / 8;
+
+ // Backup screen contents
+ _dispvars->screen_bck = (char *) malloc (_dispvars->screensize * sizeof (char));
+ memcpy ((char*)_dispvars->screen_bck, (char*)disp->framebuffer_addr, _dispvars->screensize);
+
+ // Blank screen
+ memset ((char*)(disp->framebuffer_addr), 0x00, _dispvars->screensize);
+}
+
+void sunxi_unblank_console (void *data) {
+ struct sunxi_video *_dispvars = data;
+
+ system("setterm -cursor on");
+ //memcpy ((char*)disp->framebuffer_addr, (char*)_dispvars->screen_bck, _dispvars->screensize);
+ free (_dispvars->screen_bck);
+}
+
+static void *sunxi_gfx_init(const video_info_t *video, const input_driver_t **input, void **input_data) {
+ struct sunxi_video *_dispvars;
+ int i;
+
+ _dispvars = calloc(1, sizeof(struct sunxi_video));
+ _dispvars->src_bytes_per_pixel = video->rgb32 ? 4 : 2;
+
+ disp = sunxi_disp_init("/dev/fb0", NULL);
+
+ // Blank text console and disable cursor blinking
+ sunxi_blank_console(_dispvars);
+
+ _dispvars->numpages = 2;
+ _dispvars->pages = calloc (_dispvars->numpages, sizeof (struct sunxi_page));
+
+ _dispvars->dst_pitch = disp->xres * disp->bits_per_pixel / 8;
+ _dispvars->pageflip_pending = 0;
+ _dispvars->current_page = NULL;
+ _dispvars->queue = NULL;
+ _dispvars->keep_vsync = true;
+
+ _dispvars->src_bpp = video->rgb32 ? 32 : 16;
+ _dispvars->bytes_per_pixel = _dispvars->src_bpp / 8;
+
+ switch (_dispvars->src_bpp){
+ case 16:
+ pixman_blit = pixman_composite_src_0565_8888_asm_neon;
+ break;
+ case 32:
+ pixman_blit = pixman_composite_src_8888_8888_asm_neon;
+ break;
+ default:
+ return NULL;
+ }
+
+ pthread_mutex_init(&queue_mutex, NULL);
+ pthread_mutex_init(&vsync_cond_mutex, NULL);
+ pthread_cond_init(&vsync_condition, NULL);
+
+ for (i = 0; i < _dispvars->numpages; i++) {
+ pthread_mutex_init(&_dispvars->pages[i].page_used_mutex, NULL);
+ _dispvars->pages[i].numpage = i;
+ _dispvars->pages[i].next = NULL;
+ }
+
+ if (input && input_data) {
+ *input = NULL;
+ }
+
+ // Launching vsync thread
+ pthread_create(&vsync_thread, NULL, vsync_thread_func, _dispvars);
+
+ return _dispvars;
+}
+
+static void *vsync_thread_func (void *arg) {
+ struct sunxi_video *_dispvars = arg;
+ struct sunxi_page *page;
+ while (_dispvars->keep_vsync) {
+ sunxi_wait_flip(disp);
+
+ pthread_mutex_lock(&vsync_cond_mutex);
+ pthread_cond_signal (&vsync_condition);
+ pthread_mutex_unlock(&vsync_cond_mutex);
+
+ pthread_mutex_lock(&queue_mutex);
+ page = unqueue_page((void*)_dispvars);
+ //_dispvars->pageflip_pending--;
+ pthread_mutex_unlock(&queue_mutex);
+
+ // We mark as free the page that was visible until now.
+ if (_dispvars->current_page != NULL) {
+ pthread_mutex_lock (&_dispvars->current_page->page_used_mutex);
+ _dispvars->current_page->used = false;
+ pthread_mutex_unlock (&_dispvars->current_page->page_used_mutex);
+ }
+ // The page on which we just issued a flip becomes the visible one, with the only purpose that
+ // we can mark it as free next time we get here.
+ // This variable is only accessed from this same thread over and over, so
+ // it doesn't need to be isolated.
+ _dispvars->current_page = page;
+ }
+ return 0;
+}
+
+static void sunxi_gfx_free(void *data) {
+ struct sunxi_video *_dispvars = data;
+ int i;
+
+ // Stop the vsync thread
+ _dispvars->keep_vsync = false;
+ pthread_join(vsync_thread, NULL);
+
+ for (i = 0; i < _dispvars->numpages; i++)
+ {
+ pthread_mutex_destroy(&_dispvars->pages[i].page_used_mutex);
+ }
+ pthread_mutex_destroy(&queue_mutex);
+ pthread_mutex_destroy(&vsync_cond_mutex);
+ pthread_cond_destroy(&vsync_condition);
+
+ free(_dispvars->pages);
+
+ // Restore text console contents and reactivate cursor blinking
+ sunxi_unblank_console(_dispvars);
+ sunxi_disp_close(disp);
+ free(_dispvars);
+}
+
+
+
+void sunxi_blit_flip (struct sunxi_page *page, const void *frame, void *data) {
+ struct sunxi_video *_dispvars = data;
+
+ pixman_blit(
+ _dispvars->src_width,
+ _dispvars->src_height,
+ ((uint32_t*) disp->framebuffer_addr + (disp->yres + page->yoffset) * _dispvars->dst_pitch/4),
+ _dispvars->dst_pitch/4,
+ (uint16_t*)frame,
+ _dispvars->src_pitch/_dispvars->bytes_per_pixel
+ );
+
+ // We DO allow queue multiple page flips in this backend, that's why this is commented:
+ // since we have 2 pages, multiple flip issuing can not happen anyway: the game will have
+ // to wait the second time it completes a loop because one page is the one in the screen (used)
+ // and the other is the one the game used in the previous loop.
+ /*if (_dispvars->pageflip_pending > 0) {
+ pthread_mutex_lock(&vsync_cond_mutex);
+ pthread_cond_wait (&vsync_condition, &vsync_cond_mutex);
+ pthread_mutex_unlock(&vsync_cond_mutex);
+ }*/
+
+ // Issue pageflip. Will flip on next vsync.
+ sunxi_layer_set_rgb_input_buffer(disp, disp->bits_per_pixel, (disp->yres + page->yoffset) * disp->xres * 4,
+ _dispvars->src_width, _dispvars->src_height, disp->xres);
+
+ pthread_mutex_lock(&queue_mutex);
+ queue_page(page, (void *)_dispvars);
+ //_dispvars->pageflip_pending++;
+ pthread_mutex_unlock(&queue_mutex);
+}
+
+static bool sunxi_gfx_frame(void *data, const void *frame, unsigned width,
+ unsigned height, unsigned pitch, const char *msg) {
+
+ struct sunxi_video *_dispvars = data;
+
+ if (_dispvars->src_width != width || _dispvars->src_height != height) {
+ // Sanity check on new dimensions
+ if (width == 0 || height == 0) return true;
+ RARCH_LOG("video_sunxi: internal resolution changed by core: %ux%u -> %ux%u\n",
+ _dispvars->src_width, _dispvars->src_height, width, height);
+
+ _dispvars->src_width = width;
+ _dispvars->src_height = height;
+ // Total pitch, including things the cores render between "visible" scanlines
+ _dispvars->src_pitch = pitch;
+
+ // Incremental offset that sums up on each previous page offset.
+ // Total offset of each page has to be adjusted when internal resolution changes.
+ unsigned inc_yoffset = _dispvars->src_height;
+ int i;
+ for (i = 0; i < _dispvars->numpages; i++)
+ _dispvars->pages[i].yoffset = i * inc_yoffset;
+
+ float aspect;
+ switch (g_settings.video.aspect_ratio_idx){
+ case ASPECT_RATIO_4_3:
+ aspect = (float)4 / (float)3;
+ break;
+ case ASPECT_RATIO_16_9:
+ aspect = (float)16 / (float)9;
+ break;
+ case ASPECT_RATIO_16_10:
+ aspect = (float)16 / (float)10;
+ break;
+ case ASPECT_RATIO_16_15:
+ aspect = (float)16 / (float)15;
+ break;
+ case ASPECT_RATIO_CORE:
+ aspect = (float)_dispvars->src_width / (float)_dispvars->src_height;
+ break;
+ default:
+ aspect = (float)_dispvars->src_width / (float)_dispvars->src_height;
+ break;
+ }
+
+ unsigned visible_width = disp->yres * aspect;
+ unsigned xpos = (disp->xres - visible_width) / 2;
+
+ // setup layer window
+ sunxi_layer_set_output_window(disp, xpos, 0, visible_width, disp->yres);
+
+ // make the layer visible
+ sunxi_layer_show(disp);
+ }
+
+ struct sunxi_page *page = NULL;
+ page = sunxi_get_free_page((void*)_dispvars);
+ sunxi_blit_flip(page, frame, (void *)_dispvars);
+
+ return true;
+}
+
+static void sunxi_gfx_set_nonblock_state(void *data, bool state) {
+ struct sunxi_video *vid = data;
+
+ //vid->data->sync = !state;
+}
+
+static bool sunxi_gfx_alive(void *data) {
+ (void)data;
+ return true; /* always alive */
+}
+
+static bool sunxi_gfx_focus(void *data) {
+ (void)data;
+ return true; /* fb device always has focus */
+}
+
+static void sunxi_gfx_set_rotation(void *data, unsigned rotation) {
+ (void)data;
+}
+
+static bool sunxi_gfx_has_windowed(void *data)
+{
+ (void)data;
+
+ return false;
+}
+
+static bool sunxi_gfx_suppress_screensaver(void *data, bool enable)
+{
+ (void)data;
+ (void)enable;
+
+ return false;
+}
+
+static void sunxi_gfx_viewport_info(void *data, struct video_viewport *vp) {
+ struct sunxi_video *_dispvars = data;
+
+ vp->x = vp->y = 0;
+
+ vp->width = vp->full_width = _dispvars->src_width;
+ vp->height = vp->full_height = _dispvars->src_height;
+}
+
+static bool sunxi_gfx_set_shader(void *data,
+ enum rarch_shader_type type, const char *path)
+{
+ (void)data;
+ (void)type;
+ (void)path;
+
+ return false;
+}
+
+static const video_poke_interface_t sunxi_poke_interface = {
+ NULL, /* set_video_mode */
+ NULL, /* set_filtering */
+ NULL, /* get_video_output_size */
+ NULL, /* get_video_output_prev */
+ NULL, /* get_video_output_next */
+#ifdef HAVE_FBO
+ NULL, /* get_current_framebuffer */
+#endif
+ NULL, /* get_proc_address */
+ //sunxi_set_aspect_ratio,
+ //sunxi_apply_state_changes,
+#ifdef HAVE_MENU
+ //sunxi_set_texture_frame,
+ //sunxi_set_texture_enable,
+#endif
+ //sunxi_set_osd_msg,
+ //sunxi_show_mouse
+};
+
+static void sunxi_gfx_get_poke_interface(void *data,
+ const video_poke_interface_t **iface)
+{
+ (void)data;
+ *iface = &sunxi_poke_interface;
+}
+
+video_driver_t video_sunxi = {
+ sunxi_gfx_init,
+ sunxi_gfx_frame,
+ sunxi_gfx_set_nonblock_state,
+ sunxi_gfx_alive,
+ sunxi_gfx_focus,
+ sunxi_gfx_suppress_screensaver,
+ sunxi_gfx_has_windowed,
+ sunxi_gfx_set_shader,
+ sunxi_gfx_free,
+ "sunxi",
+ sunxi_gfx_set_rotation,
+ sunxi_gfx_viewport_info,
+ NULL, /* read_viewport */
+
+#ifdef HAVE_OVERLAY
+ NULL, /* overlay_interface */
+#endif
+ sunxi_gfx_get_poke_interface
+};
diff --git a/gfx/pixman/pixman-arm-asm.h b/gfx/pixman/pixman-arm-asm.h
new file mode 100644
index 0000000000..ee78541087
--- /dev/null
+++ b/gfx/pixman/pixman-arm-asm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+fname:
+.endm
diff --git a/gfx/pixman/pixman-arm-neon-asm.S b/gfx/pixman/pixman-arm-neon-asm.S
new file mode 100644
index 0000000000..7e949a38fd
--- /dev/null
+++ b/gfx/pixman/pixman-arm-neon-asm.S
@@ -0,0 +1,3627 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ * - pixman_composite_over_8888_0565_asm_neon
+ * - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+ .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+ .arm
+ .altmacro
+ .p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate.exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch intruduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fallback
+ * to simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommented to be the following:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0, d1, d2, d3 - contain loaded source pixel data
+ * d4, d5 - contain loaded destination pixels (they are needed)
+ * d28, d29 - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolitic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ * vuzp.8 d0, d1
+ * vuzp.8 d2, d3
+ * vuzp.8 d1, d3
+ * vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ * vzip.8 d0, d2
+ * vzip.8 d1, d3
+ * vzip.8 d2, d3
+ * vzip.8 d0, d1
+ *
+ * But pixel can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vmvn.8 d3, d3 /* invert source alpha */
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ * head
+ * while (...) {
+ * tail
+ * head
+ * }
+ * tail
+ *
+ * It may look a bit weird, but this setup allows to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ * pixman_composite_over_8888_0565_process_pixblock_tail
+ * vst1.16 {d28, d29}, [DST_W, :128]!
+ * vld1.16 {d4, d5}, [DST_R, :128]!
+ * vld4.32 {d0, d1, d2, d3}, [SRC]!
+ * pixman_composite_over_8888_0565_process_pixblock_head
+ * cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originaling from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ vqadd.u8 d16, d2, d20
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vqadd.u8 q9, q0, q11
+ vshrn.u16 d6, q2, #8
+ fetch_src_pixblock
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vshll.u8 q14, d16, #8
+ PF add PF_X, PF_X, #8
+ vshll.u8 q8, d19, #8
+ PF tst PF_CTL, #0xF
+ vsri.u8 d6, d6, #5
+ PF addne PF_X, PF_X, #8
+ vmvn.8 d3, d3
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q10, d3, d6
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vsri.u16 q14, q8, #5
+ PF cmp PF_X, ORIG_W
+ vshll.u8 q9, d18, #8
+ vrshr.u16 q13, q10, #8
+ PF subge PF_X, PF_X, ORIG_W
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ PF subges PF_CTL, PF_CTL, #0x10
+ vsri.u16 q14, q9, #11
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vraddhn.u16 d22, q12, q15
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+ pixman_composite_over_8888_0565_process_pixblock_tail
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ fetch_src_pixblock
+ pixman_composite_over_8888_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ * FLAG_DST_READWRITE - tells that the destination buffer is both read
+ * and written, for write-only buffer we would use
+ * FLAG_DST_WRITEONLY flag instead
+ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ * and separate color channels for 32bpp format.
+ * The next things are:
+ * - the number of pixels processed per iteration (8 in this case, because
+ * that's the maximum what can fit into four 64-bit NEON registers).
+ * - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ * prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+ /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ and put data into d6 - red, d7 - green, d30 - blue */
+ vshrn.u16 d6, q2, #8
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vshrn.u16 d30, q2, #2
+ /* now do alpha blending, storing results in 8-bit planar format
+ into d16 - red, d19 - green, d18 - blue */
+ vmull.u8 q10, d3, d6
+ vmull.u8 q11, d3, d7
+ vmull.u8 q12, d3, d30
+ vrshr.u16 q13, q10, #8
+ vrshr.u16 q3, q11, #8
+ vrshr.u16 q15, q12, #8
+ vraddhn.u16 d20, q10, q13
+ vraddhn.u16 d23, q11, q3
+ vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+ /* ... continue alpha blending */
+ vqadd.u8 d16, d2, d20
+ vqadd.u8 q9, q0, q11
+ /* convert the result to r5g6b5 and store it into {d28, d29} */
+ vshll.u8 q14, d16, #8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+ pixman_composite_over_n_0565_process_pixblock_tail
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ pixman_composite_over_n_0565_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+ vmvn.8 d3, d3 /* invert source alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_0565_init, \
+ default_cleanup, \
+ pixman_composite_over_n_0565_process_pixblock_head, \
+ pixman_composite_over_n_0565_process_pixblock_tail, \
+ pixman_composite_over_n_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q14, d2, #8
+ vshll.u8 q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+ vsri.u16 q14, q8, #5
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ fetch_src_pixblock
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vsri.u16 q14, q9, #11
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vshll.u8 q14, d2, #8
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vshll.u8 q9, d0, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+ vshrn.u16 d30, q0, #8
+ vshrn.u16 d29, q0, #3
+ vsli.u16 q0, q0, #5
+ vmov.u8 d31, #255
+ vsri.u8 d30, d30, #5
+ vsri.u8 d29, d29, #6
+ vshrn.u16 d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+ pixman_composite_src_0565_8888_process_pixblock_tail
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ fetch_src_pixblock
+ pixman_composite_src_0565_8888_process_pixblock_head
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #32
+ PF tst PF_CTL, #0xF
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #32
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ vqadd.u8 q14, q0, q2
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q15, q1, q3
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ fetch_src_pixblock
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+ /* deinterleaved source pixels in {d0, d1, d2, d3} */
+ /* inverted alpha in {d24} */
+ /* destination pixels in {d4, d5, d6, d7} */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q2, q10, #8
+ vrshr.u16 q3, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q2, q10
+ vraddhn.u16 d31, q3, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q2, q10, #8
+ vrshr.u16 q3, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q2, q10
+ vraddhn.u16 d31, q3, q11
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vqadd.u8 q14, q0, q14
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0x0F
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vqadd.u8 q15, q1, q15
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d4
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vmull.u8 q9, d24, d5
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d6
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d7
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+ vmvn.8 d24, d3 /* get inverted alpha */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
+ vmvn.8 d22, d3
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d22, d4
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d22, d5
+ vmull.u8 q10, d22, d6
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vmull.u8 q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d7[0]}, [DUMMY]
+ vdup.8 d4, d7[0]
+ vdup.8 d5, d7[1]
+ vdup.8 d6, d7[2]
+ vdup.8 d7, d7[3]
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_reverse_n_8888_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 4, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+ vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
+ vmull.u8 q1, d24, d9
+ vmull.u8 q6, d24, d10
+ vmull.u8 q7, d24, d11
+ vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */
+ vshrn.u16 d7, q2, #3
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vmull.u8 q8, d3, d6 /* now do alpha blending */
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+ /* 3 cycle bubble (after vmull.u8) */
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ /* 1 cycle bubble */
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8 /* convert to 16bpp */
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ /* 1 cycle bubble */
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vshrn.u16 d6, q2, #8
+ fetch_mask_pixblock
+ vshrn.u16 d7, q2, #3
+ fetch_src_pixblock
+ vmull.u8 q6, d24, d10
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q11, q9, #8
+ vrshr.u16 q15, q10, #8
+ vraddhn.u16 d16, q8, q13
+ vraddhn.u16 d27, q9, q11
+ vraddhn.u16 d26, q10, q15
+ vqadd.u8 d16, d2, d16
+ vmull.u8 q1, d24, d9
+ vqadd.u8 q9, q0, q13
+ vshll.u8 q14, d16, #8
+ vmull.u8 q0, d24, d8
+ vshll.u8 q8, d19, #8
+ vshll.u8 q9, d18, #8
+ vsri.u16 q14, q8, #5
+ vmull.u8 q7, d24, d11
+ vsri.u16 q14, q9, #11
+
+ cache_preload 8, 8
+
+ vsli.u16 q2, q2, #5
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q9, q1, #8
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q6, q10
+ vraddhn.u16 d3, q7, q11
+ vsri.u8 d6, d6, #5
+ vsri.u8 d7, d7, #6
+ vmvn.8 d3, d3
+ vshrn.u16 d30, q2, #2
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vmull.u8 q8, d3, d6
+ vmull.u8 q9, d3, d7
+ vmull.u8 q10, d3, d30
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_0565_init, \
+ pixman_composite_over_n_8_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d24[0]}, [DUMMY]
+ vdup.8 d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_0565_init, \
+ pixman_composite_over_8888_n_0565_cleanup, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 16, 16
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_0565_process_pixblock_head, \
+ pixman_composite_src_0565_0565_process_pixblock_tail, \
+ pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+ vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #8
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8_init, \
+ pixman_composite_src_n_8_cleanup, \
+ pixman_composite_src_n_8_process_pixblock_head, \
+ pixman_composite_src_n_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+ vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_0565_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #16
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 16, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_0565_init, \
+ pixman_composite_src_n_0565_cleanup, \
+ pixman_composite_src_n_0565_process_pixblock_head, \
+ pixman_composite_src_n_0565_process_pixblock_tail, \
+ pixman_composite_src_n_0565_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d0[0]}, [DUMMY]
+ vsli.u64 d0, d0, #32
+ vorr d1, d0, d0
+ vorr q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 0, /* prefetch distance */ \
+ pixman_composite_src_n_8888_init, \
+ pixman_composite_src_n_8888_cleanup, \
+ pixman_composite_src_n_8888_process_pixblock_head, \
+ pixman_composite_src_n_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_8888_process_pixblock_head, \
+ pixman_composite_src_8888_8888_process_pixblock_tail, \
+ pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+ vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+ fetch_src_pixblock
+ vorr q0, q0, q2
+ vorr q1, q1, q2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+ vmov.u8 q2, #0xFF
+ vshl.u32 q2, q2, #24
+.endm
+
+generate_composite_function \
+ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_x888_8888_init, \
+ default_cleanup, \
+ pixman_composite_src_x888_8888_process_pixblock_head, \
+ pixman_composite_src_x888_8888_process_pixblock_tail, \
+ pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d24, d1
+ vmull.u8 q10, d24, d2
+ vmull.u8 q11, d24, d3
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d16
+ vmull.u8 q1, d25, d16
+ vmull.u8 q2, d26, d16
+ vmull.u8 q3, d27, d16
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ vrshrn.u16 d28, q0, #8
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d16[0]}, [DUMMY]
+ vdup.8 d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+ /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q6, d24, d8
+ vmull.u8 q7, d24, d9
+ vmull.u8 q8, d24, d10
+ vmull.u8 q9, d24, d11
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vraddhn.u16 d0, q6, q10
+ vraddhn.u16 d1, q7, q11
+ vraddhn.u16 d2, q8, q12
+ vraddhn.u16 d3, q9, q13
+ vmvn.8 d25, d3 /* get inverted alpha */
+ /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
+ /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d25, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d25, d6
+ vmull.u8 q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q15, q9, #8
+ fetch_mask_pixblock
+ vrshr.u16 q6, q10, #8
+ PF add PF_X, PF_X, #8
+ vrshr.u16 q7, q11, #8
+ PF tst PF_CTL, #0x0F
+ vraddhn.u16 d28, q14, q8
+ PF addne PF_X, PF_X, #8
+ vraddhn.u16 d29, q15, q9
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d30, q6, q10
+ PF cmp PF_X, ORIG_W
+ vraddhn.u16 d31, q7, q11
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ vmull.u8 q6, d24, d8
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q7, d24, d9
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q8, d24, d10
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q9, d24, d11
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ vqadd.u8 q14, q0, q14
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vqadd.u8 q15, q1, q15
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q11, q7, #8
+ vrshr.u16 q12, q8, #8
+ vrshr.u16 q13, q9, #8
+ vraddhn.u16 d0, q6, q10
+ vraddhn.u16 d1, q7, q11
+ vraddhn.u16 d2, q8, q12
+ vraddhn.u16 d3, q9, q13
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vmvn.8 d25, d3
+ vmull.u8 q8, d25, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d25, d6
+ vmull.u8 q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8888_init, \
+ pixman_composite_over_n_8_8888_cleanup, \
+ pixman_composite_over_n_8_8888_process_pixblock_head, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail, \
+ pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d8
+ vmull.u8 q6, d26, d8
+ vmull.u8 q7, d27, d8
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vmvn.8 q12, q0
+ vmvn.8 q13, q1
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_tail
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d8[0]}, [DUMMY]
+ vdup.8 d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8_8_init, \
+ pixman_composite_over_n_8_8_cleanup, \
+ pixman_composite_over_n_8_8_process_pixblock_head, \
+ pixman_composite_over_n_8_8_process_pixblock_tail, \
+ pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11}
+ * dest in {d4, d5, d6, d7 }
+ * mask in {d24, d25, d26, d27}
+ * output: updated src in {d0, d1, d2, d3 }
+ * updated mask in {d24, d25, d26, d3 }
+ */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
+ vmull.u8 q7, d27, d11
+ vmull.u8 q9, d11, d25
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vrshr.u16 q10, q7, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ vraddhn.u16 d26, q13, q6
+ vraddhn.u16 d3, q7, q10
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in {d28, d29, d30, d31}
+ */
+ vmvn.8 q12, q12
+ vmvn.8 d26, d26
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d25, d5
+ vmvn.8 d27, d3
+ vmull.u8 q10, d26, d6
+ vmull.u8 q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrshr.u16 q6, q10, #8
+ vrshr.u16 q7, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q6, q10
+ vraddhn.u16 d31, q7, q11
+ fetch_mask_pixblock
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+ cache_preload 8, 8
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_8888_ca_init, \
+ pixman_composite_over_n_8888_8888_ca_cleanup, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
+ * mask in {d24, d25, d26} [B, G, R]
+ * output: updated src in {d0, d1, d2 } [B, G, R]
+ * updated mask in {d24, d25, d26} [B, G, R]
+ */
+ vmull.u8 q0, d24, d8
+ vmull.u8 q1, d25, d9
+ vmull.u8 q6, d26, d10
+ vmull.u8 q9, d11, d25
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ /*
+ * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+ * and put data into d16 - blue, d17 - green, d18 - red
+ */
+ vshrn.u16 d17, q2, #3
+ vshrn.u16 d18, q2, #8
+ vraddhn.u16 d26, q13, q6
+ vsli.u16 q2, q2, #5
+ vsri.u8 d18, d18, #5
+ vsri.u8 d17, d17, #6
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in d16 - blue, d17 - green, d18 - red
+ */
+ vmvn.8 q12, q12
+ vshrn.u16 d16, q2, #2
+ vmvn.8 d26, d26
+ vmull.u8 q6, d16, d24
+ vmull.u8 q7, d17, d25
+ vmull.u8 q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+ /* ... continue 'combine_over_ca' replacement */
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q14, q7, #8
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d16, q10, q6
+ vraddhn.u16 d17, q14, q7
+ vraddhn.u16 d18, q15, q11
+ vqadd.u8 q8, q0, q8
+ vqadd.u8 d18, d2, d18
+ /*
+ * convert the results in d16, d17, d18 to r5g6b5 and store
+ * them into {d28, d29}
+ */
+ vshll.u8 q14, d18, #8
+ vshll.u8 q10, d17, #8
+ vshll.u8 q15, d16, #8
+ vsri.u16 q14, q10, #5
+ vsri.u16 q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+ fetch_mask_pixblock
+ vrshr.u16 q10, q6, #8
+ vrshr.u16 q14, q7, #8
+ vld1.16 {d4, d5}, [DST_R, :128]!
+ vrshr.u16 q15, q11, #8
+ vraddhn.u16 d16, q10, q6
+ vraddhn.u16 d17, q14, q7
+ vraddhn.u16 d22, q15, q11
+ /* process_pixblock_head */
+ /*
+ * 'combine_mask_ca' replacement
+ *
+ * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
+ * mask in {d24, d25, d26} [B, G, R]
+ * output: updated src in {d0, d1, d2 } [B, G, R]
+ * updated mask in {d24, d25, d26} [B, G, R]
+ */
+ vmull.u8 q6, d26, d10
+ vqadd.u8 q8, q0, q8
+ vmull.u8 q0, d24, d8
+ vqadd.u8 d22, d2, d22
+ vmull.u8 q1, d25, d9
+ /*
+ * convert the result in d16, d17, d22 to r5g6b5 and store
+ * it into {d28, d29}
+ */
+ vshll.u8 q14, d22, #8
+ vshll.u8 q10, d17, #8
+ vshll.u8 q15, d16, #8
+ vmull.u8 q9, d11, d25
+ vsri.u16 q14, q10, #5
+ vmull.u8 q12, d11, d24
+ vmull.u8 q13, d11, d26
+ vsri.u16 q14, q15, #11
+ cache_preload 8, 8
+ vrshr.u16 q8, q0, #8
+ vrshr.u16 q10, q1, #8
+ vrshr.u16 q11, q6, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q10
+ vraddhn.u16 d2, q6, q11
+ vrshr.u16 q11, q12, #8
+ vrshr.u16 q8, q9, #8
+ vrshr.u16 q6, q13, #8
+ vraddhn.u16 d24, q12, q11
+ vraddhn.u16 d25, q9, q8
+ /*
+ * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+ * 8-bit format and put data into d16 - blue, d17 - green,
+ * d18 - red
+ */
+ vshrn.u16 d17, q2, #3
+ vshrn.u16 d18, q2, #8
+ vraddhn.u16 d26, q13, q6
+ vsli.u16 q2, q2, #5
+ vsri.u8 d17, d17, #6
+ vsri.u8 d18, d18, #5
+ /*
+ * 'combine_over_ca' replacement
+ *
+ * output: updated dest in d16 - blue, d17 - green, d18 - red
+ */
+ vmvn.8 q12, q12
+ vshrn.u16 d16, q2, #2
+ vmvn.8 d26, d26
+ vmull.u8 q7, d17, d25
+ vmull.u8 q6, d16, d24
+ vmull.u8 q11, d18, d26
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d8, d11[0]
+ vdup.8 d9, d11[1]
+ vdup.8 d10, d11[2]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_n_8888_0565_ca_init, \
+ pixman_composite_over_n_8888_0565_ca_cleanup, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* and destination data in {d4, d5, d6, d7} */
+ vmull.u8 q8, d4, d3
+ vmull.u8 q9, d5, d3
+ vmull.u8 q10, d6, d3
+ vmull.u8 q11, d7, d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q8, q14
+ vraddhn.u16 d29, q9, q15
+ vraddhn.u16 d30, q10, q12
+ vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+ pixman_composite_in_n_8_process_pixblock_tail
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ cache_preload 32, 32
+ pixman_composite_in_n_8_process_pixblock_head
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_in_n_8_init, \
+ pixman_composite_in_n_8_cleanup, \
+ pixman_composite_in_n_8_process_pixblock_head, \
+ pixman_composite_in_n_8_process_pixblock_tail, \
+ pixman_composite_in_n_8_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+ /* expecting source data in {d8, d9, d10, d11} */
+ /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+ /* and destination data in {d4, d5, d6, d7} */
+ /* mask is in d24, d25, d26, d27 */
+ vmull.u8 q0, d24, d11
+ vmull.u8 q1, d25, d11
+ vmull.u8 q6, d26, d11
+ vmull.u8 q7, d27, d11
+ vrshr.u16 q10, q0, #8
+ vrshr.u16 q11, q1, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q13, q7, #8
+ vraddhn.u16 d0, q0, q10
+ vraddhn.u16 d1, q1, q11
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d3, q7, q13
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+ pixman_composite_add_n_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vpush {d8-d15}
+ vld1.32 {d11[0]}, [DUMMY]
+ vdup.8 d11, d11[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8_init, \
+ pixman_composite_add_n_8_8_cleanup, \
+ pixman_composite_add_n_8_8_process_pixblock_head, \
+ pixman_composite_add_n_8_8_process_pixblock_tail, \
+ pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d25, d1
+ vmull.u8 q10, d26, d2
+ vmull.u8 q11, d27, d3
+ vrshr.u16 q0, q8, #8
+ vrshr.u16 q1, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d0, q0, q8
+ vraddhn.u16 d1, q1, q9
+ vraddhn.u16 d2, q12, q10
+ vraddhn.u16 d3, q13, q11
+ vqadd.u8 q14, q0, q2
+ vqadd.u8 q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+ pixman_composite_add_8_8_8_process_pixblock_tail
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ fetch_mask_pixblock
+ fetch_src_pixblock
+ cache_preload 32, 32
+ pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+ FLAG_DST_READWRITE, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8_8_8_init, \
+ pixman_composite_add_8_8_8_cleanup, \
+ pixman_composite_add_8_8_8_process_pixblock_head, \
+ pixman_composite_add_8_8_8_process_pixblock_tail, \
+ pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* mask in {d24, d25, d26, d27} */
+ vmull.u8 q8, d27, d0
+ vmull.u8 q9, d27, d1
+ vmull.u8 q10, d27, d2
+ vmull.u8 q11, d27, d3
+ /* 1 cycle bubble */
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+ /* 2 cycle bubble */
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+ vqadd.u8 q14, q2, q14
+ /* 1 cycle bubble */
+ vqadd.u8 q15, q3, q15
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ vrshrn.u16 d28, q8, #8
+ fetch_mask_pixblock
+ vrshrn.u16 d29, q9, #8
+ vmull.u8 q8, d27, d0
+ vrshrn.u16 d30, q10, #8
+ vmull.u8 q9, d27, d1
+ vrshrn.u16 d31, q11, #8
+ vmull.u8 q10, d27, d2
+ vqadd.u8 q14, q2, q14
+ vmull.u8 q11, d27, d3
+ vqadd.u8 q15, q3, q15
+ vrsra.u16 q8, q8, #8
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ vrsra.u16 q9, q9, #8
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q10, q10, #8
+
+ cache_preload 8, 8
+
+ vrsra.u16 q11, q11, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_n_8_8888_init, \
+ pixman_composite_add_n_8_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vld1.32 {d27[0]}, [DUMMY]
+ vdup.8 d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_add_8888_n_8888_init, \
+ pixman_composite_add_8888_n_8888_cleanup, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 27 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ /* expecting source data in {d0, d1, d2, d3} */
+ /* destination data in {d4, d5, d6, d7} */
+ /* solid mask is in d15 */
+
+ /* 'in' */
+ vmull.u8 q8, d15, d3
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q13, q8, #8
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d3, q8, q13
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+ vmvn.8 d24, d3 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+ vmull.u8 q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+ vqadd.u8 q14, q0, q14
+ vqadd.u8 q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+ add DUMMY, sp, #48
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_8888_n_8888_init, \
+ pixman_composite_over_8888_n_8888_cleanup, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+generate_composite_function_single_scanline \
+ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 12 /* mask_basereg */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ pixman_composite_over_8888_n_8888_process_pixblock_tail
+ fetch_src_pixblock
+ cache_preload 8, 8
+ fetch_mask_pixblock
+ pixman_composite_over_8888_n_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_n_8888_process_pixblock_head, \
+ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+ vst3.8 {d0, d1, d2}, [DST_W]!
+ fetch_src_pixblock
+ cache_preload 8, 8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0888_process_pixblock_head, \
+ pixman_composite_src_0888_0888_process_pixblock_tail, \
+ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+ vswp d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+ vst4.8 {d0, d1, d2, d3}, [DST_W]!
+ fetch_src_pixblock
+ vswp d0, d2
+ cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+ veor d3, d3, d3
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ pixman_composite_src_0888_8888_rev_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+ 0, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+ vshll.u8 q8, d1, #8
+ vshll.u8 q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+ vshll.u8 q14, d0, #8
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+ vshll.u8 q14, d0, #8
+ fetch_src_pixblock
+ vsri.u16 q14, q8, #5
+ vsri.u16 q14, q9, #11
+ vshll.u8 q8, d1, #8
+ vst1.16 {d28, d29}, [DST_W, :128]!
+ vshll.u8 q9, d2, #8
+.endm
+
+generate_composite_function \
+ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+ FLAG_DST_WRITEONLY, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d30, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d30, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d28, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ vraddhn.u16 d28, q11, q8
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+ vrshr.u16 q11, q8, #8
+ vswp d3, d31
+ vrshr.u16 q12, q9, #8
+ vrshr.u16 q13, q10, #8
+ fetch_src_pixblock
+ vraddhn.u16 d28, q11, q8
+ PF add PF_X, PF_X, #8
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vraddhn.u16 d29, q12, q9
+ vraddhn.u16 d30, q13, q10
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d3, d1
+ vmull.u8 q10, d3, d2
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ PF cmp PF_X, ORIG_W
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 10, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 0, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmvn.8 d7, d15
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vmull.u8 q8, d7, d4
+ vmull.u8 q9, d7, d5
+ vmull.u8 q13, d7, d6
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q13, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q13
+ vqadd.u8 q0, q0, q14
+ vqadd.u8 q1, q1, q15
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_over_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_over_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+ add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+ vpush {d8-d15}
+ vld1.32 {d15[0]}, [DUMMY]
+ vdup.8 d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+ vpop {d8-d15}
+.endm
+
+generate_composite_function \
+ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_over_0565_n_0565_init, \
+ pixman_composite_over_0565_n_0565_cleanup, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q4, d2, d1, d0
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* source pixel data is in {d0, d1, d2, XX} */
+ /* destination pixel data is in {d4, d5, d6, XX} */
+ vmull.u8 q6, d15, d2
+ vmull.u8 q5, d15, d1
+ vmull.u8 q4, d15, d0
+ vrshr.u16 q12, q6, #8
+ vrshr.u16 q11, q5, #8
+ vrshr.u16 q10, q4, #8
+ vraddhn.u16 d2, q6, q12
+ vraddhn.u16 d1, q5, q11
+ vraddhn.u16 d0, q4, q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+ vqadd.u8 q0, q0, q2
+ vqadd.u8 q1, q1, q3
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+ fetch_mask_pixblock
+ pixman_composite_add_0565_8_0565_process_pixblock_tail
+ fetch_src_pixblock
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_add_0565_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_add_0565_8_0565_process_pixblock_head, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+ /* mask is in d15 */
+ convert_0565_to_x888 q5, d6, d5, d4
+ /* destination pixel data is in {d4, d5, d6, xx} */
+ vmvn.8 d24, d15 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d24, d4
+ vmull.u8 q9, d24, d5
+ vmull.u8 q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vraddhn.u16 d0, q14, q8
+ vraddhn.u16 d1, q15, q9
+ vraddhn.u16 d2, q12, q10
+ /* 32bpp result is in {d0, d1, d2, XX} */
+ convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail
+ vld1.16 {d10, d11}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_0565_process_pixblock_head
+ vst1.16 {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 15, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+ /* src is in d0 */
+ /* destination pixel data is in {d4, d5, d6, d7} */
+ vmvn.8 d1, d0 /* get inverted alpha */
+ /* now do alpha blending */
+ vmull.u8 q8, d1, d4
+ vmull.u8 q9, d1, d5
+ vmull.u8 q10, d1, d6
+ vmull.u8 q11, d1, d7
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ vrshr.u16 q14, q8, #8
+ vrshr.u16 q15, q9, #8
+ vrshr.u16 q12, q10, #8
+ vrshr.u16 q13, q11, #8
+ vraddhn.u16 d28, q14, q8
+ vraddhn.u16 d29, q15, q9
+ vraddhn.u16 d30, q12, q10
+ vraddhn.u16 d31, q13, q11
+ /* 32bpp result is in {d28, d29, d30, d31} */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+ fetch_src_pixblock
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
+ cache_preload 8, 8
+ pixman_composite_out_reverse_8_8888_process_pixblock_head
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+ pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+ pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 0 /* mask_basereg */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_8888_process_pixblock_head, \
+ pixman_composite_over_8888_8888_process_pixblock_tail, \
+ pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_over_8888_0565_process_pixblock_head, \
+ pixman_composite_over_8888_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 0, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_8888_0565_process_pixblock_head, \
+ pixman_composite_src_8888_0565_process_pixblock_tail, \
+ pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init, \
+ default_cleanup, \
+ pixman_composite_src_0565_8888_process_pixblock_head, \
+ pixman_composite_src_0565_8888_process_pixblock_tail, \
+ pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 4, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 24 /* mask_basereg */
+
+generate_composite_function_nearest_scanline \
+ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+ FLAG_DST_READWRITE, \
+ 8, /* number of pixels, processed in a single block */ \
+ default_init_need_all_regs, \
+ default_cleanup_need_all_regs, \
+ pixman_composite_over_0565_8_0565_process_pixblock_head, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+ 28, /* dst_w_basereg */ \
+ 10, /* dst_r_basereg */ \
+ 8, /* src_basereg */ \
+ 15 /* mask_basereg */
+
+/******************************************************************************/
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {reg1}, [TMP1], STRIDE
+ vld1.32 {reg2}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ vld1.32 {reg2[0]}, [TMP1], STRIDE
+ vld1.32 {reg2[1]}, [TMP1]
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ bilinear_load_8888 reg3, reg4, tmp2
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {acc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {acc2lo[1]}, [TMP1]
+ vld1.32 {acc2hi[1]}, [TMP2]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ vzip.u8 reg1, reg3
+ vzip.u8 reg2, reg4
+ vzip.u8 reg3, reg4
+ vzip.u8 reg1, reg2
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
+ vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
+ vld1.32 {xacc2lo[1]}, [TMP1]
+ vld1.32 {xacc2hi[1]}, [TMP2]
+ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
+ vzip.u8 xreg1, xreg3
+ vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
+ vzip.u8 xreg2, xreg4
+ vld1.32 {yacc2lo[1]}, [TMP1]
+ vzip.u8 xreg3, xreg4
+ vld1.32 {yacc2hi[1]}, [TMP2]
+ vzip.u8 xreg1, xreg2
+ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+ vmull.u8 xacc1, xreg1, d28
+ vzip.u8 yreg1, yreg3
+ vmlal.u8 xacc1, xreg2, d29
+ vzip.u8 yreg2, yreg4
+ vmull.u8 xacc2, xreg3, d28
+ vzip.u8 yreg3, yreg4
+ vmlal.u8 xacc2, xreg4, d29
+ vzip.u8 yreg1, yreg2
+ vmull.u8 yacc1, yreg1, d28
+ vmlal.u8 yacc1, yreg2, d29
+ vmull.u8 yacc2, yreg3, d28
+ vmlal.u8 yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+ vst1.32 {d0, d1}, [OUT, :128]!
+.elseif numpix == 2
+ vst1.32 {d0}, [OUT, :64]!
+.elseif numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+.else
+ .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+ convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+ vst1.16 {d2}, [OUT, :64]!
+.elseif numpix == 2
+ vst1.32 {d2[0]}, [OUT, :32]!
+.elseif numpix == 1
+ vst1.16 {d2[0]}, [OUT, :16]!
+.else
+ .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+ bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d0, q0
+ bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
+ pld [TMP1, PF_OFFS]
+ sub TMP1, TMP1, STRIDE
+ vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
+ pld [TMP2, PF_OFFS]
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ vadd.u16 q12, q12, q13
+ bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.else
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4, 0
+.set BILINEAR_FLAG_UNROLL_8, 1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ src_bpp_shift, dst_bpp_shift, \
+ prefetch_distance, flags
+
+pixman_asm_function fname
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+ mul PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+.endif
+
+ sub STRIDE, BOTTOM, TOP
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f
+
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+
+ /* ensure good destination alignment */
+ cmp WIDTH, #1
+ blt 0f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ sub WIDTH, WIDTH, #1
+0:
+ vadd.u16 q13, q13, q13
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 0f
+ bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ sub WIDTH, WIDTH, #2
+0:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ sub WIDTH, WIDTH, #4
+0:
+ subs WIDTH, WIDTH, #8
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+ bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #8
+ blt 5f
+0:
+ bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #8
+ bge 0b
+5:
+ bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+ tst WIDTH, #4
+ beq 2f
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
+ subs WIDTH, WIDTH, #4
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #4
+ blt 5f
+0:
+ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 0b
+5:
+ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+1:
+/****************************************************/
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+ bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+.endif
+ pop {r4, r5, r6, r7, r8, r9}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vld1.32 {d22}, [TMP3], STRIDE
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmovn.u16 d6, q0
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmovn.u16 d7, q2
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vadd.u16 q12, q12, q13
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vst1.32 {d6, d7}, [OUT, :128]!
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q1, d18, d31
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_head
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vld1.32 {d20}, [TMP1], STRIDE
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q1, d18, d31
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vld1.32 {d22}, [TMP3], STRIDE
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmovn.u16 d8, q0
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmovn.u16 d9, q2
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vadd.u16 q12, q12, q13
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vmovn.u16 d10, q0
+ vmovn.u16 d11, q2
+ vadd.u16 q12, q12, q13
+
+ vuzp.u8 d8, d9
+ vuzp.u8 d10, d11
+ vuzp.u8 d9, d11
+ vuzp.u8 d8, d10
+ vshll.u8 q6, d9, #8
+ vshll.u8 q5, d10, #8
+ vshll.u8 q7, d8, #8
+ vsri.u16 q5, q6, #5
+ vsri.u16 q5, q7, #11
+ vst1.32 {d10, d11}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vuzp.u8 d8, d9
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vld1.32 {d22}, [TMP3], STRIDE
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmovn.u16 d10, q0
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmovn.u16 d11, q2
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vadd.u16 q12, q12, q13
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.u8 d10, d11
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vmlsl.u16 q1, d18, d31
+
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q1, d19, d31
+ vuzp.u8 d9, d11
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+ vuzp.u8 d8, d10
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshll.u8 q6, d9, #8
+ vshll.u8 q5, d10, #8
+ vshll.u8 q7, d8, #8
+ vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vsri.u16 q5, q6, #5
+ vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vsri.u16 q5, q7, #11
+ vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+ vadd.u16 q12, q12, q13
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vld1.32 {d22}, [TMP3], STRIDE
+ vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmovn.u16 d8, q0
+ vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+ vmovn.u16 d9, q2
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vadd.u16 q12, q12, q13
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+ vst1.32 {d10, d11}, [OUT, :128]!
+ vmlsl.u16 q1, d18, d31
+.endm
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+ 2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+ 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+ 1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+ 1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/gfx/pixman/pixman-arm-neon-asm.h b/gfx/pixman/pixman-arm-neon-asm.h
new file mode 100644
index 0000000000..bdcf6a9d47
--- /dev/null
+++ b/gfx/pixman/pixman-arm-neon-asm.h
@@ -0,0 +1,1184 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ * - handling of leading and trailing unaligned pixels
+ * - doing most of the work related to L2 cache preload
+ * - encourages the use of software pipelining for better instructions
+ * scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ * - try overlapped pixel method (from Ian Rickards) when processing
+ * exactly two blocks of pixels
+ * - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY, 0
+.set FLAG_DST_READWRITE, 1
+.set FLAG_DEINTERLEAVE_32BPP, 2
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET, 40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d®1}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d®1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d®1, d®2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+ op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]!
+.else
+ op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+ op&.&elem_size {d®1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+ op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+ op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+ pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+ pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+ pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+ pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+ .elseif elem_size == 16
+ pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+ pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+ .else
+ pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+ .endif
+.elseif numbytes == 2
+ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+ pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+ .else
+ pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+ pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+ .endif
+.elseif numbytes == 1
+ pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+ .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+ pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+ pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+ pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+ %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+ pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+ pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+ pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+ pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+ pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+ pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+ pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d®1&[0]}, [TMP1, :16]
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d®1&[1]}, [TMP2, :16]
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #1
+ vld1.16 {d®1&[2]}, [TMP1, :16]
+ vld1.16 {d®1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d®1&[0]}, [TMP1, :32]
+ vld1.32 {d®1&[1]}, [TMP2, :32]
+.else
+ .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* elem_size == 32 */
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, mem_operand, TMP1, asl #2
+ mov TMP2, VX, asr #16
+ sub VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d®1&[0]}, [TMP1, :32]
+ mov TMP1, VX, asr #16
+ add VX, VX, UNIT_X, asl #1
+ add TMP1, mem_operand, TMP1, asl #2
+ vld1.32 {d®2&[0]}, [TMP2, :32]
+ mov TMP2, VX, asr #16
+ add VX, VX, UNIT_X
+ add TMP2, mem_operand, TMP2, asl #2
+ vld1.32 {d®1&[1]}, [TMP1, :32]
+ vld1.32 {d®2&[1]}, [TMP2, :32]
+.else
+ pixld1_s elem_size, reg1, mem_operand
+ pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #1
+ vld1.16 {d®1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+ mov TMP1, VX, asr #16
+ adds VX, VX, UNIT_X
+5: subpls VX, VX, SRC_WIDTH_FIXED
+ bpl 5b
+ add TMP1, mem_operand, TMP1, asl #2
+ vld1.32 {d®1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+ pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+ pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+ pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+ pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+ pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+ .if elem_size == 32
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .elseif elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 4, mem_operand
+ pixld0_s elem_size, %(basereg+0), 5, mem_operand
+ pixld0_s elem_size, %(basereg+0), 6, mem_operand
+ pixld0_s elem_size, %(basereg+0), 7, mem_operand
+ .endif
+.elseif numbytes == 2
+ .if elem_size == 16
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+ .else
+ pixld0_s elem_size, %(basereg+0), 2, mem_operand
+ pixld0_s elem_size, %(basereg+0), 3, mem_operand
+ .endif
+.elseif numbytes == 1
+ pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+ .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+ pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+ vuzp.8 d®1, d®2
+.endm
+
+.macro vzip8 reg1, reg2
+ vzip.8 d®1, d®2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vuzp8 %(basereg+0), %(basereg+1)
+ vuzp8 %(basereg+2), %(basereg+3)
+ vuzp8 %(basereg+1), %(basereg+3)
+ vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+ vzip8 %(basereg+0), %(basereg+2)
+ vzip8 %(basereg+1), %(basereg+3)
+ vzip8 %(basereg+2), %(basereg+3)
+ vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+ a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+ PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+ PF add PF_X, PF_X, #std_increment
+.endif
+ PF tst PF_CTL, #0xF
+ PF addne PF_X, PF_X, #boost_increment
+ PF subne PF_CTL, PF_CTL, #1
+ PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+ PF subge PF_X, PF_X, ORIG_W
+ PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+ pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+ pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+ pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 16bpp format, this is unnecessary, or even pointless.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+.if dst_w_bpp != 24
+ tst DST_R, #0xF
+ beq 2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_R, #lowbit
+ beq 1f
+.endif
+ pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+ pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+ pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+ add DST_R, DST_R, #lowbit
+.endif
+ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+ sub W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ process_pixblock_tail
+
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+ tst DST_W, #lowbit
+ beq 1f
+.endif
+ pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows to suppress prefetch if
+ * set to 0
+ * dst_aligned_flag - selects whether destination buffer
+ * is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+ dst_aligned_flag, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ tst W, #(pixblock_size - 1)
+ beq 2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+ tst W, #chunk_size
+ beq 1f
+ pixld_src chunk_size, src_bpp, src_basereg, SRC
+ pixld chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+ pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+ pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+ PF add PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+ pixdeinterleave src_bpp, src_basereg
+ pixdeinterleave mask_bpp, mask_basereg
+ pixdeinterleave dst_r_bpp, dst_r_basereg
+
+ process_pixblock_head
+.if cache_preload_flag != 0
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+.endif
+ process_pixblock_tail
+ pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+ tst W, #chunk_size
+ beq 1f
+.if dst_aligned_flag != 0
+ pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+ pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+ ldrd W, [sp] /* load W and H (width and height) from stack */
+.else
+ mov W, ORIG_W
+.endif
+ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+ sub DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+ sub SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+ sub MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+ subs H, H, #1
+ mov DST_R, DST_W
+.if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+.endif
+ bge start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3 - reserved for loading source pixel data
+ * d4, d5, d6, d7 - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ prefetch_distance, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function fname
+
+ push {r4-r12, lr} /* save all registers */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+ ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set pixblock_size, pixblock_size_
+ .set dst_w_basereg, dst_w_basereg_
+ .set dst_r_basereg, dst_r_basereg_
+ .set src_basereg, src_basereg_
+ .set mask_basereg, mask_basereg_
+
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+/*
+ * Assign symbolic names to registers
+ */
+ W .req r0 /* width (is updated during processing) */
+ H .req r1 /* height (is updated during processing) */
+ DST_W .req r2 /* destination buffer pointer for writes */
+ DST_STRIDE .req r3 /* destination image stride */
+ SRC .req r4 /* source buffer pointer */
+ SRC_STRIDE .req r5 /* source image stride */
+ DST_R .req r6 /* destination buffer pointer for reads */
+
+ MASK .req r7 /* mask pointer */
+ MASK_STRIDE .req r8 /* mask stride */
+
+ PF_CTL .req r9 /* combined lines counter and prefetch */
+ /* distance increment counter */
+ PF_X .req r10 /* pixel index in a scanline for current */
+ /* pretetch position */
+ PF_SRC .req r11 /* pointer to source scanline start */
+ /* for prefetch purposes */
+ PF_DST .req r12 /* pointer to destination scanline start */
+ /* for prefetch purposes */
+ PF_MASK .req r14 /* pointer to mask scanline start */
+ /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+ ORIG_W .req r10 /* saved original width */
+ DUMMY .req r12 /* temporary register */
+ .set regs_shortage, 0
+.elseif mask_bpp == 0
+ ORIG_W .req r7 /* saved original width */
+ DUMMY .req r8 /* temporary register */
+ .set regs_shortage, 0
+.elseif src_bpp == 0
+ ORIG_W .req r4 /* saved original width */
+ DUMMY .req r5 /* temporary register */
+ .set regs_shortage, 0
+.else
+ ORIG_W .req r1 /* saved original width */
+ DUMMY .req r1 /* temporary register */
+ .set regs_shortage, 1
+.endif
+
+ .set mask_bpp_shift, -1
+.if src_bpp == 32
+ .set src_bpp_shift, 2
+.elseif src_bpp == 24
+ .set src_bpp_shift, 0
+.elseif src_bpp == 16
+ .set src_bpp_shift, 1
+.elseif src_bpp == 8
+ .set src_bpp_shift, 0
+.elseif src_bpp == 0
+ .set src_bpp_shift, -1
+.else
+ .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+ .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+ .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+ .set mask_bpp_shift, -1
+.else
+ .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+ .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+ .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+ .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+ .set dst_bpp_shift, 0
+.else
+ .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+ .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+ ldr SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+ ldr MASK, [sp, #48]
+.endif
+ PF mov PF_X, #0
+.if src_bpp > 0
+ ldr SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+ ldr MASK_STRIDE, [sp, #52]
+.endif
+ mov DST_R, DST_W
+
+.if src_bpp == 24
+ sub SRC_STRIDE, SRC_STRIDE, W
+ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+ sub MASK_STRIDE, MASK_STRIDE, W
+ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+ sub DST_STRIDE, DST_STRIDE, W
+ sub DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+ PF mov PF_SRC, SRC
+ PF mov PF_DST, DST_R
+ PF mov PF_MASK, MASK
+ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+ PF mov PF_CTL, H, lsl #4
+ PF add PF_CTL, #(prefetch_distance - 0x10)
+
+ init
+.if regs_shortage
+ push {r0, r1}
+.endif
+ subs H, H, #1
+.if regs_shortage
+ str H, [sp, #4] /* save updated height to stack */
+.else
+ mov ORIG_W, W
+.endif
+ blt 9f
+ cmp W, #(pixblock_size * 2)
+ blt 8f
+/*
+ * This is the start of the pipelined loop, which if optimized for
+ * long scanlines
+ */
+0:
+ ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ PF add PF_X, PF_X, #pixblock_size
+ process_pixblock_head
+ cache_preload 0, pixblock_size
+ cache_preload_simple
+ subs W, W, #(pixblock_size * 2)
+ blt 2f
+1:
+ process_pixblock_tail_head
+ cache_preload_simple
+ subs W, W, #pixblock_size
+ bge 1b
+2:
+ process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 1, 1, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ advance_to_next_scanline 0b
+
+.if regs_shortage
+ pop {r0, r1}
+.endif
+ cleanup
+ pop {r4-r12, pc} /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+ /* Process exactly pixblock_size pixels if needed */
+ tst W, #pixblock_size
+ beq 1f
+ pixld pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ process_pixblock_head
+ process_pixblock_tail
+ pixst pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+ /* Process the remaining trailing pixels in the scanline */
+ process_trailing_pixels 0, 0, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+ advance_to_next_scanline 8b
+9:
+.if regs_shortage
+ pop {r0, r1}
+.endif
+ cleanup
+ pop {r4-r12, pc} /* exit */
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq ORIG_W
+ .unreq W
+ .unreq H
+ .unreq SRC_STRIDE
+ .unreq DST_STRIDE
+ .unreq MASK_STRIDE
+ .unreq PF_CTL
+ .unreq PF_X
+ .unreq PF_SRC
+ .unreq PF_DST
+ .unreq PF_MASK
+ .unreq DUMMY
+ .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline use_nearest_scaling, \
+ fname, \
+ src_bpp_, \
+ mask_bpp_, \
+ dst_w_bpp_, \
+ flags, \
+ pixblock_size_, \
+ init, \
+ cleanup, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head, \
+ dst_w_basereg_ = 28, \
+ dst_r_basereg_ = 4, \
+ src_basereg_ = 0, \
+ mask_basereg_ = 24
+
+ pixman_asm_function fname
+
+ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set pixblock_size, pixblock_size_
+ .set dst_w_basereg, dst_w_basereg_
+ .set dst_r_basereg, dst_r_basereg_
+ .set src_basereg, src_basereg_
+ .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+ /*
+ * Assign symbolic names to registers for nearest scaling
+ */
+ W .req r0
+ DST_W .req r1
+ SRC .req r2
+ VX .req r3
+ UNIT_X .req ip
+ MASK .req lr
+ TMP1 .req r4
+ TMP2 .req r5
+ DST_R .req r6
+ SRC_WIDTH_FIXED .req r7
+
+ .macro pixld_src x:vararg
+ pixld_s x
+ .endm
+
+ ldr UNIT_X, [sp]
+ push {r4-r8, lr}
+ ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
+ .if mask_bpp != 0
+ ldr MASK, [sp, #(24 + 8)]
+ .endif
+.else
+ /*
+ * Assign symbolic names to registers
+ */
+ W .req r0 /* width (is updated during processing) */
+ DST_W .req r1 /* destination buffer pointer for writes */
+ SRC .req r2 /* source buffer pointer */
+ DST_R .req ip /* destination buffer pointer for reads */
+ MASK .req r3 /* mask pointer */
+
+ .macro pixld_src x:vararg
+ pixld x
+ .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+ .set dst_r_bpp, dst_w_bpp
+.else
+ .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+ .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+ .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+ .macro fetch_src_pixblock
+ pixld_src pixblock_size, src_bpp, \
+ (src_basereg - pixblock_size * src_bpp / 64), SRC
+ .endm
+
+ init
+ mov DST_R, DST_W
+
+ cmp W, #pixblock_size
+ blt 8f
+
+ ensure_destination_ptr_alignment process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ subs W, W, #pixblock_size
+ blt 7f
+
+ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+ pixld_a pixblock_size, dst_r_bpp, \
+ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+ fetch_src_pixblock
+ pixld pixblock_size, mask_bpp, \
+ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+ process_pixblock_head
+ subs W, W, #pixblock_size
+ blt 2f
+1:
+ process_pixblock_tail_head
+ subs W, W, #pixblock_size
+ bge 1b
+2:
+ process_pixblock_tail
+ pixst_a pixblock_size, dst_w_bpp, \
+ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+ /* Process the remaining trailing pixels in the scanline (dst aligned) */
+ process_trailing_pixels 0, 1, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ cleanup
+.if use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+.else
+ bx lr /* exit */
+.endif
+8:
+ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+ process_trailing_pixels 0, 0, \
+ process_pixblock_head, \
+ process_pixblock_tail, \
+ process_pixblock_tail_head
+
+ cleanup
+
+.if use_nearest_scaling != 0
+ pop {r4-r8, pc} /* exit */
+
+ .unreq DST_R
+ .unreq SRC
+ .unreq W
+ .unreq VX
+ .unreq UNIT_X
+ .unreq TMP1
+ .unreq TMP2
+ .unreq DST_W
+ .unreq MASK
+ .unreq SRC_WIDTH_FIXED
+
+.else
+ bx lr /* exit */
+
+ .unreq SRC
+ .unreq MASK
+ .unreq DST_R
+ .unreq DST_W
+ .unreq W
+.endif
+
+ .purgem fetch_src_pixblock
+ .purgem pixld_src
+
+ .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+ generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+ generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores d8-d15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+ vpush {d8-d15}
+.endm
+
+.macro default_cleanup_need_all_regs
+ vpop {d8-d15}
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ * value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+ vshrn.u16 out_r, in, #8
+ vshrn.u16 out_g, in, #3
+ vsli.u16 in, in, #5
+ vmov.u8 out_a, #255
+ vsri.u8 out_r, out_r, #5
+ vsri.u8 out_g, out_g, #6
+ vshrn.u16 out_b, in, #2
+.endm
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+ vshrn.u16 out_r, in, #8
+ vshrn.u16 out_g, in, #3
+ vsli.u16 in, in, #5
+ vsri.u8 out_r, out_r, #5
+ vsri.u8 out_g, out_g, #6
+ vshrn.u16 out_b, in, #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
+ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+ vshll.u8 tmp1, in_g, #8
+ vshll.u8 out, in_r, #8
+ vshll.u8 tmp2, in_b, #8
+ vsri.u16 out, tmp1, #5
+ vsri.u16 out, tmp2, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ vshl.u16 out0, in, #5 /* G top 6 bits */
+ vshl.u16 tmp, in, #11 /* B top 5 bits */
+ vsri.u16 in, in, #5 /* R is ready in top bits */
+ vsri.u16 out0, out0, #6 /* G is ready in top bits */
+ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+ vshr.u16 out1, in, #8 /* R is in place */
+ vsri.u16 out0, tmp, #8 /* G & B is in place */
+ vzip.u16 out0, out1 /* everything is in place */
+.endm
diff --git a/gfx/pixman/pixman-private.h b/gfx/pixman/pixman-private.h
new file mode 100644
index 0000000000..73108a01d3
--- /dev/null
+++ b/gfx/pixman/pixman-private.h
@@ -0,0 +1,1153 @@
+#include
+
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+/*
+ * The defines which are shared between C and assembly code
+ */
+
+/* bilinear interpolation precision (must be < 8) */
+#define BILINEAR_INTERPOLATION_BITS 7
+#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
+
+/*
+ * C specific part
+ */
+
+#ifndef __ASSEMBLER__
+
+#ifndef PACKAGE
+# error config.h must be included before pixman-private.h
+#endif
+
+#define PIXMAN_DISABLE_DEPRECATED
+#define PIXMAN_USE_INTERNAL_API
+
+#include "pixman.h"
+#include
+#include
+#include
+#include
+#include
+
+#include "pixman-compiler.h"
+
+/*
+ * Images
+ */
+typedef struct image_common image_common_t;
+typedef struct solid_fill solid_fill_t;
+typedef struct gradient gradient_t;
+typedef struct linear_gradient linear_gradient_t;
+typedef struct horizontal_gradient horizontal_gradient_t;
+typedef struct vertical_gradient vertical_gradient_t;
+typedef struct conical_gradient conical_gradient_t;
+typedef struct radial_gradient radial_gradient_t;
+typedef struct bits_image bits_image_t;
+typedef struct circle circle_t;
+
+typedef struct argb_t argb_t;
+
+struct argb_t
+{
+ float a;
+ float r;
+ float g;
+ float b;
+};
+
+typedef void (*fetch_scanline_t) (bits_image_t *image,
+ int x,
+ int y,
+ int width,
+ uint32_t *buffer,
+ const uint32_t *mask);
+
+typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image,
+ int x,
+ int y);
+
+typedef argb_t (*fetch_pixel_float_t) (bits_image_t *image,
+ int x,
+ int y);
+
+typedef void (*store_scanline_t) (bits_image_t * image,
+ int x,
+ int y,
+ int width,
+ const uint32_t *values);
+
+typedef enum
+{
+ BITS,
+ LINEAR,
+ CONICAL,
+ RADIAL,
+ SOLID
+} image_type_t;
+
+typedef void (*property_changed_func_t) (pixman_image_t *image);
+
+struct image_common
+{
+ image_type_t type;
+ int32_t ref_count;
+ pixman_region32_t clip_region;
+ int32_t alpha_count; /* How many times this image is being used as an alpha map */
+ pixman_bool_t have_clip_region; /* FALSE if there is no clip */
+ pixman_bool_t client_clip; /* Whether the source clip was
+ set by a client */
+ pixman_bool_t clip_sources; /* Whether the clip applies when
+ * the image is used as a source
+ */
+ pixman_bool_t dirty;
+ pixman_transform_t * transform;
+ pixman_repeat_t repeat;
+ pixman_filter_t filter;
+ pixman_fixed_t * filter_params;
+ int n_filter_params;
+ bits_image_t * alpha_map;
+ int alpha_origin_x;
+ int alpha_origin_y;
+ pixman_bool_t component_alpha;
+ property_changed_func_t property_changed;
+
+ pixman_image_destroy_func_t destroy_func;
+ void * destroy_data;
+
+ uint32_t flags;
+ pixman_format_code_t extended_format_code;
+};
+
+struct solid_fill
+{
+ image_common_t common;
+ pixman_color_t color;
+
+ uint32_t color_32;
+ argb_t color_float;
+};
+
+struct gradient
+{
+ image_common_t common;
+ int n_stops;
+ pixman_gradient_stop_t *stops;
+};
+
+struct linear_gradient
+{
+ gradient_t common;
+ pixman_point_fixed_t p1;
+ pixman_point_fixed_t p2;
+};
+
+struct circle
+{
+ pixman_fixed_t x;
+ pixman_fixed_t y;
+ pixman_fixed_t radius;
+};
+
+struct radial_gradient
+{
+ gradient_t common;
+
+ circle_t c1;
+ circle_t c2;
+
+ circle_t delta;
+ double a;
+ double inva;
+ double mindr;
+};
+
+struct conical_gradient
+{
+ gradient_t common;
+ pixman_point_fixed_t center;
+ double angle;
+};
+
+struct bits_image
+{
+ image_common_t common;
+ pixman_format_code_t format;
+ const pixman_indexed_t * indexed;
+ int width;
+ int height;
+ uint32_t * bits;
+ uint32_t * free_me;
+ int rowstride; /* in number of uint32_t's */
+
+ fetch_scanline_t fetch_scanline_32;
+ fetch_pixel_32_t fetch_pixel_32;
+ store_scanline_t store_scanline_32;
+
+ fetch_scanline_t fetch_scanline_float;
+ fetch_pixel_float_t fetch_pixel_float;
+ store_scanline_t store_scanline_float;
+
+ /* Used for indirect access to the bits */
+ pixman_read_memory_func_t read_func;
+ pixman_write_memory_func_t write_func;
+};
+
+union pixman_image
+{
+ image_type_t type;
+ image_common_t common;
+ bits_image_t bits;
+ gradient_t gradient;
+ linear_gradient_t linear;
+ conical_gradient_t conical;
+ radial_gradient_t radial;
+ solid_fill_t solid;
+};
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
+typedef void (* pixman_iter_write_back_t) (pixman_iter_t *iter);
+typedef void (* pixman_iter_fini_t) (pixman_iter_t *iter);
+
+typedef enum
+{
+ ITER_NARROW = (1 << 0),
+ ITER_WIDE = (1 << 1),
+
+ /* "Localized alpha" is when the alpha channel is used only to compute
+ * the alpha value of the destination. This means that the computation
+ * of the RGB values of the result is independent of the alpha value.
+ *
+ * For example, the OVER operator has localized alpha for the
+ * destination, because the RGB values of the result can be computed
+ * without knowing the destination alpha. Similarly, ADD has localized
+ * alpha for both source and destination because the RGB values of the
+ * result can be computed without knowing the alpha value of source or
+ * destination.
+ *
+ * When he destination is xRGB, this is useful knowledge, because then
+ * we can treat it as if it were ARGB, which means in some cases we can
+ * avoid copying it to a temporary buffer.
+ */
+ ITER_LOCALIZED_ALPHA = (1 << 2),
+ ITER_IGNORE_ALPHA = (1 << 3),
+ ITER_IGNORE_RGB = (1 << 4),
+
+ /* These indicate whether the iterator is for a source
+ * or a destination image
+ */
+ ITER_SRC = (1 << 5),
+ ITER_DEST = (1 << 6)
+} iter_flags_t;
+
+struct pixman_iter_t
+{
+ /* These are initialized by _pixman_implementation_{src,dest}_init */
+ pixman_image_t * image;
+ uint32_t * buffer;
+ int x, y;
+ int width;
+ int height;
+ iter_flags_t iter_flags;
+ uint32_t image_flags;
+
+ /* These function pointers are initialized by the implementation */
+ pixman_iter_get_scanline_t get_scanline;
+ pixman_iter_write_back_t write_back;
+ pixman_iter_fini_t fini;
+
+ /* These fields are scratch data that implementations can use */
+ void * data;
+ uint8_t * bits;
+ int stride;
+};
+
+typedef struct pixman_iter_info_t pixman_iter_info_t;
+typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter,
+ const pixman_iter_info_t *info);
+struct pixman_iter_info_t
+{
+ pixman_format_code_t format;
+ uint32_t image_flags;
+ iter_flags_t iter_flags;
+ pixman_iter_initializer_t initializer;
+ pixman_iter_get_scanline_t get_scanline;
+ pixman_iter_write_back_t write_back;
+};
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_image_init (pixman_image_t *image);
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t * image,
+ pixman_format_code_t format,
+ int width,
+ int height,
+ uint32_t * bits,
+ int rowstride,
+ pixman_bool_t clear);
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image);
+
+pixman_image_t *
+_pixman_image_allocate (void);
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t * gradient,
+ const pixman_gradient_stop_t *stops,
+ int n_stops);
+void
+_pixman_image_reset_clip_region (pixman_image_t *image);
+
+void
+_pixman_image_validate (pixman_image_t *image);
+
+#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul) \
+ do \
+ { \
+ uint32_t *__bits__; \
+ int __stride__; \
+ \
+ __bits__ = image->bits.bits; \
+ __stride__ = image->bits.rowstride; \
+ (out_stride) = \
+ __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type); \
+ (line) = \
+ ((type *) __bits__) + (out_stride) * (y) + (mul) * (x); \
+ } while (0)
+
+/*
+ * Gradient walker
+ */
+typedef struct
+{
+ float a_s, a_b;
+ float r_s, r_b;
+ float g_s, g_b;
+ float b_s, b_b;
+ pixman_fixed_48_16_t left_x;
+ pixman_fixed_48_16_t right_x;
+
+ pixman_gradient_stop_t *stops;
+ int num_stops;
+ pixman_repeat_t repeat;
+
+ pixman_bool_t need_reset;
+} pixman_gradient_walker_t;
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+ gradient_t * gradient,
+ pixman_repeat_t repeat);
+
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t pos);
+
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+ pixman_fixed_48_16_t x);
+
+/*
+ * Edges
+ */
+
+#define MAX_ALPHA(n) ((1 << (n)) - 1)
+#define N_Y_FRAC(n) ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1)
+#define N_X_FRAC(n) ((n) == 1 ? 1 : (1 << ((n) / 2)) + 1)
+
+#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n))
+#define STEP_Y_BIG(n) (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2)
+#define Y_FRAC_LAST(n) (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n))
+#define STEP_X_BIG(n) (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2)
+#define X_FRAC_LAST(n) (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define RENDER_SAMPLES_X(x, n) \
+ ((n) == 1? 0 : (pixman_fixed_frac (x) + \
+ X_FRAC_FIRST (n)) / STEP_X_SMALL (n))
+
+void
+pixman_rasterize_edges_accessors (pixman_image_t *image,
+ pixman_edge_t * l,
+ pixman_edge_t * r,
+ pixman_fixed_t t,
+ pixman_fixed_t b);
+
+/*
+ * Implementations
+ */
+typedef struct pixman_implementation_t pixman_implementation_t;
+
+typedef struct
+{
+ pixman_op_t op;
+ pixman_image_t * src_image;
+ pixman_image_t * mask_image;
+ pixman_image_t * dest_image;
+ int32_t src_x;
+ int32_t src_y;
+ int32_t mask_x;
+ int32_t mask_y;
+ int32_t dest_x;
+ int32_t dest_y;
+ int32_t width;
+ int32_t height;
+
+ uint32_t src_flags;
+ uint32_t mask_flags;
+ uint32_t dest_flags;
+} pixman_composite_info_t;
+
+#define PIXMAN_COMPOSITE_ARGS(info) \
+ MAYBE_UNUSED pixman_op_t op = info->op; \
+ MAYBE_UNUSED pixman_image_t * src_image = info->src_image; \
+ MAYBE_UNUSED pixman_image_t * mask_image = info->mask_image; \
+ MAYBE_UNUSED pixman_image_t * dest_image = info->dest_image; \
+ MAYBE_UNUSED int32_t src_x = info->src_x; \
+ MAYBE_UNUSED int32_t src_y = info->src_y; \
+ MAYBE_UNUSED int32_t mask_x = info->mask_x; \
+ MAYBE_UNUSED int32_t mask_y = info->mask_y; \
+ MAYBE_UNUSED int32_t dest_x = info->dest_x; \
+ MAYBE_UNUSED int32_t dest_y = info->dest_y; \
+ MAYBE_UNUSED int32_t width = info->width; \
+ MAYBE_UNUSED int32_t height = info->height
+
+typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp,
+ pixman_op_t op,
+ uint32_t * dest,
+ const uint32_t * src,
+ const uint32_t * mask,
+ int width);
+
+typedef void (*pixman_combine_float_func_t) (pixman_implementation_t *imp,
+ pixman_op_t op,
+ float * dest,
+ const float * src,
+ const float * mask,
+ int n_pixels);
+
+typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp,
+ pixman_composite_info_t *info);
+typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dest_x,
+ int dest_y,
+ int width,
+ int height);
+typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler);
+
+void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
+void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp);
+
+typedef struct
+{
+ pixman_op_t op;
+ pixman_format_code_t src_format;
+ uint32_t src_flags;
+ pixman_format_code_t mask_format;
+ uint32_t mask_flags;
+ pixman_format_code_t dest_format;
+ uint32_t dest_flags;
+ pixman_composite_func_t func;
+} pixman_fast_path_t;
+
+struct pixman_implementation_t
+{
+ pixman_implementation_t * toplevel;
+ pixman_implementation_t * fallback;
+ const pixman_fast_path_t * fast_paths;
+ const pixman_iter_info_t * iter_info;
+
+ pixman_blt_func_t blt;
+ pixman_fill_func_t fill;
+
+ pixman_combine_32_func_t combine_32[PIXMAN_N_OPERATORS];
+ pixman_combine_32_func_t combine_32_ca[PIXMAN_N_OPERATORS];
+ pixman_combine_float_func_t combine_float[PIXMAN_N_OPERATORS];
+ pixman_combine_float_func_t combine_float_ca[PIXMAN_N_OPERATORS];
+};
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+ pixman_image_t * image,
+ pixman_format_code_t format);
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *fallback,
+ const pixman_fast_path_t *fast_paths);
+
+void
+_pixman_implementation_lookup_composite (pixman_implementation_t *toplevel,
+ pixman_op_t op,
+ pixman_format_code_t src_format,
+ uint32_t src_flags,
+ pixman_format_code_t mask_format,
+ uint32_t mask_flags,
+ pixman_format_code_t dest_format,
+ uint32_t dest_flags,
+ pixman_implementation_t **out_imp,
+ pixman_composite_func_t *out_func);
+
+pixman_combine_32_func_t
+_pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_bool_t component_alpha,
+ pixman_bool_t wide);
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t *imp,
+ uint32_t * src_bits,
+ uint32_t * dst_bits,
+ int src_stride,
+ int dst_stride,
+ int src_bpp,
+ int dst_bpp,
+ int src_x,
+ int src_y,
+ int dest_x,
+ int dest_y,
+ int width,
+ int height);
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+ uint32_t * bits,
+ int stride,
+ int bpp,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint32_t filler);
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t *imp,
+ pixman_iter_t *iter,
+ pixman_image_t *image,
+ int x,
+ int y,
+ int width,
+ int height,
+ uint8_t *buffer,
+ iter_flags_t flags,
+ uint32_t image_flags);
+
+/* Specific implementations */
+pixman_implementation_t *
+_pixman_implementation_create_general (void);
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback);
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSE2
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSSE3
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_SIMD
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_MIPS_DSPR2
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_VMX
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback);
+#endif
+
+pixman_bool_t
+_pixman_implementation_disabled (const char *name);
+
+pixman_implementation_t *
+_pixman_x86_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_arm_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_ppc_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_mips_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_choose_implementation (void);
+
+pixman_bool_t
+_pixman_disabled (const char *name);
+
+
+/*
+ * Utilities
+ */
+pixman_bool_t
+_pixman_compute_composite_region32 (pixman_region32_t * region,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dest_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height);
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
+
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info);
+
+/* These "formats" all have depth 0, so they
+ * will never clash with any real ones
+ */
+#define PIXMAN_null PIXMAN_FORMAT (0, 0, 0, 0, 0, 0)
+#define PIXMAN_solid PIXMAN_FORMAT (0, 1, 0, 0, 0, 0)
+#define PIXMAN_pixbuf PIXMAN_FORMAT (0, 2, 0, 0, 0, 0)
+#define PIXMAN_rpixbuf PIXMAN_FORMAT (0, 3, 0, 0, 0, 0)
+#define PIXMAN_unknown PIXMAN_FORMAT (0, 4, 0, 0, 0, 0)
+#define PIXMAN_any PIXMAN_FORMAT (0, 5, 0, 0, 0, 0)
+
+#define PIXMAN_OP_any (PIXMAN_N_OPERATORS + 1)
+
+#define FAST_PATH_ID_TRANSFORM (1 << 0)
+#define FAST_PATH_NO_ALPHA_MAP (1 << 1)
+#define FAST_PATH_NO_CONVOLUTION_FILTER (1 << 2)
+#define FAST_PATH_NO_PAD_REPEAT (1 << 3)
+#define FAST_PATH_NO_REFLECT_REPEAT (1 << 4)
+#define FAST_PATH_NO_ACCESSORS (1 << 5)
+#define FAST_PATH_NARROW_FORMAT (1 << 6)
+#define FAST_PATH_COMPONENT_ALPHA (1 << 8)
+#define FAST_PATH_SAMPLES_OPAQUE (1 << 7)
+#define FAST_PATH_UNIFIED_ALPHA (1 << 9)
+#define FAST_PATH_SCALE_TRANSFORM (1 << 10)
+#define FAST_PATH_NEAREST_FILTER (1 << 11)
+#define FAST_PATH_HAS_TRANSFORM (1 << 12)
+#define FAST_PATH_IS_OPAQUE (1 << 13)
+#define FAST_PATH_NO_NORMAL_REPEAT (1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT (1 << 15)
+#define FAST_PATH_X_UNIT_POSITIVE (1 << 16)
+#define FAST_PATH_AFFINE_TRANSFORM (1 << 17)
+#define FAST_PATH_Y_UNIT_ZERO (1 << 18)
+#define FAST_PATH_BILINEAR_FILTER (1 << 19)
+#define FAST_PATH_ROTATE_90_TRANSFORM (1 << 20)
+#define FAST_PATH_ROTATE_180_TRANSFORM (1 << 21)
+#define FAST_PATH_ROTATE_270_TRANSFORM (1 << 22)
+#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST (1 << 23)
+#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR (1 << 24)
+#define FAST_PATH_BITS_IMAGE (1 << 25)
+#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER (1 << 26)
+
+#define FAST_PATH_PAD_REPEAT \
+ (FAST_PATH_NO_NONE_REPEAT | \
+ FAST_PATH_NO_NORMAL_REPEAT | \
+ FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NORMAL_REPEAT \
+ (FAST_PATH_NO_NONE_REPEAT | \
+ FAST_PATH_NO_PAD_REPEAT | \
+ FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NONE_REPEAT \
+ (FAST_PATH_NO_NORMAL_REPEAT | \
+ FAST_PATH_NO_PAD_REPEAT | \
+ FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_REFLECT_REPEAT \
+ (FAST_PATH_NO_NONE_REPEAT | \
+ FAST_PATH_NO_NORMAL_REPEAT | \
+ FAST_PATH_NO_PAD_REPEAT)
+
+#define FAST_PATH_STANDARD_FLAGS \
+ (FAST_PATH_NO_CONVOLUTION_FILTER | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define FAST_PATH_STD_DEST_FLAGS \
+ (FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define SOURCE_FLAGS(format) \
+ (FAST_PATH_STANDARD_FLAGS | \
+ ((PIXMAN_ ## format == PIXMAN_solid) ? \
+ 0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM)))
+
+#define MASK_FLAGS(format, extra) \
+ ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra))
+
+#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
+ PIXMAN_OP_ ## op, \
+ PIXMAN_ ## src, \
+ src_flags, \
+ PIXMAN_ ## mask, \
+ mask_flags, \
+ PIXMAN_ ## dest, \
+ dest_flags, \
+ func
+
+#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func) \
+ { FAST_PATH ( \
+ op, \
+ src, SOURCE_FLAGS (src), \
+ mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA), \
+ dest, FAST_PATH_STD_DEST_FLAGS, \
+ func) }
+
+#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func) \
+ { FAST_PATH ( \
+ op, \
+ src, SOURCE_FLAGS (src), \
+ mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA), \
+ dest, FAST_PATH_STD_DEST_FLAGS, \
+ func) }
+
+extern pixman_implementation_t *global_implementation;
+
+static force_inline pixman_implementation_t *
+get_implementation (void)
+{
+#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+ if (!global_implementation)
+ global_implementation = _pixman_choose_implementation ();
+#endif
+ return global_implementation;
+}
+
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT pixman_implementation_t *
+_pixman_internal_only_get_implementation (void);
+
+/* Memory allocation helpers */
+void *
+pixman_malloc_ab (unsigned int n, unsigned int b);
+
+void *
+pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
+
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c);
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b);
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b);
+
+/* Compositing utilities */
+void
+pixman_expand_to_float (argb_t *dst,
+ const uint32_t *src,
+ pixman_format_code_t format,
+ int width);
+
+void
+pixman_contract_from_float (uint32_t *dst,
+ const argb_t *src,
+ int width);
+
+/* Region Helpers */
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+ pixman_region16_t *src);
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+ pixman_region32_t *src);
+
+/* Doubly linked lists */
+typedef struct pixman_link_t pixman_link_t;
+struct pixman_link_t
+{
+ pixman_link_t *next;
+ pixman_link_t *prev;
+};
+
+typedef struct pixman_list_t pixman_list_t;
+struct pixman_list_t
+{
+ pixman_link_t *head;
+ pixman_link_t *tail;
+};
+
+static force_inline void
+pixman_list_init (pixman_list_t *list)
+{
+ list->head = (pixman_link_t *)list;
+ list->tail = (pixman_link_t *)list;
+}
+
+static force_inline void
+pixman_list_prepend (pixman_list_t *list, pixman_link_t *link)
+{
+ link->next = list->head;
+ link->prev = (pixman_link_t *)list;
+ list->head->prev = link;
+ list->head = link;
+}
+
+static force_inline void
+pixman_list_unlink (pixman_link_t *link)
+{
+ link->prev->next = link->next;
+ link->next->prev = link->prev;
+}
+
+static force_inline void
+pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link)
+{
+ pixman_list_unlink (link);
+ pixman_list_prepend (list, link);
+}
+
+/* Misc macros */
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+#ifndef TRUE
+# define TRUE 1
+#endif
+
+#ifndef MIN
+# define MIN(a, b) ((a < b) ? a : b)
+#endif
+
+#ifndef MAX
+# define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+/* Integer division that rounds towards -infinity */
+#define DIV(a, b) \
+ ((((a) < 0) == ((b) < 0)) ? (a) / (b) : \
+ ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))
+
+/* Modulus that produces the remainder wrt. DIV */
+#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
+
+#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v)))
+
+#define FLOAT_IS_ZERO(f) (-FLT_MIN < (f) && (f) < FLT_MIN)
+
+/* Conversion between 8888 and 0565 */
+
+static force_inline uint16_t
+convert_8888_to_0565 (uint32_t s)
+{
+ /* The following code can be compiled into just 4 instructions on ARM */
+ uint32_t a, b;
+ a = (s >> 3) & 0x1F001F;
+ b = s & 0xFC00;
+ a |= a >> 5;
+ a |= b >> 5;
+ return (uint16_t)a;
+}
+
+static force_inline uint32_t
+convert_0565_to_0888 (uint16_t s)
+{
+ return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
+ ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
+ ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+}
+
+static force_inline uint32_t
+convert_0565_to_8888 (uint16_t s)
+{
+ return convert_0565_to_0888 (s) | 0xff000000;
+}
+
+/* Trivial versions that are useful in macros */
+
+static force_inline uint32_t
+convert_8888_to_8888 (uint32_t s)
+{
+ return s;
+}
+
+static force_inline uint32_t
+convert_x888_to_8888 (uint32_t s)
+{
+ return s | 0xff000000;
+}
+
+static force_inline uint16_t
+convert_0565_to_0565 (uint16_t s)
+{
+ return s;
+}
+
+#define PIXMAN_FORMAT_IS_WIDE(f) \
+ (PIXMAN_FORMAT_A (f) > 8 || \
+ PIXMAN_FORMAT_R (f) > 8 || \
+ PIXMAN_FORMAT_G (f) > 8 || \
+ PIXMAN_FORMAT_B (f) > 8 || \
+ PIXMAN_FORMAT_TYPE (f) == PIXMAN_TYPE_ARGB_SRGB)
+
+#ifdef WORDS_BIGENDIAN
+# define SCREEN_SHIFT_LEFT(x,n) ((x) << (n))
+# define SCREEN_SHIFT_RIGHT(x,n) ((x) >> (n))
+#else
+# define SCREEN_SHIFT_LEFT(x,n) ((x) >> (n))
+# define SCREEN_SHIFT_RIGHT(x,n) ((x) << (n))
+#endif
+
+static force_inline uint32_t
+unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
+{
+ uint32_t result;
+
+ if (from_bits == 0)
+ return 0;
+
+ /* Delete any extra bits */
+ val &= ((1 << from_bits) - 1);
+
+ if (from_bits >= to_bits)
+ return val >> (from_bits - to_bits);
+
+ /* Start out with the high bit of val in the high bit of result. */
+ result = val << (to_bits - from_bits);
+
+ /* Copy the bits in result, doubling the number of bits each time, until
+ * we fill all to_bits. Unrolled manually because from_bits and to_bits
+ * are usually known statically, so the compiler can turn all of this
+ * into a few shifts.
+ */
+#define REPLICATE() \
+ do \
+ { \
+ if (from_bits < to_bits) \
+ { \
+ result |= result >> from_bits; \
+ \
+ from_bits *= 2; \
+ } \
+ } \
+ while (0)
+
+ REPLICATE();
+ REPLICATE();
+ REPLICATE();
+ REPLICATE();
+ REPLICATE();
+
+ return result;
+}
+
+uint16_t pixman_float_to_unorm (float f, int n_bits);
+float pixman_unorm_to_float (uint16_t u, int n_bits);
+
+/*
+ * Various debugging code
+ */
+
+#undef DEBUG
+
+#define COMPILE_TIME_ASSERT(x) \
+ do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
+
+/* Turn on debugging depending on what type of release this is
+ */
+#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
+
+/* Debugging gets turned on for development releases because these
+ * are the things that end up in bleeding edge distributions such
+ * as Rawhide etc.
+ *
+ * For performance reasons we don't turn it on for stable releases or
+ * random git checkouts. (Random git checkouts are often used for
+ * performance work).
+ */
+
+# define DEBUG
+
+#endif
+
+void
+_pixman_log_error (const char *function, const char *message);
+
+#define return_if_fail(expr) \
+ do \
+ { \
+ if (unlikely (!(expr))) \
+ { \
+ _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+ return; \
+ } \
+ } \
+ while (0)
+
+#define return_val_if_fail(expr, retval) \
+ do \
+ { \
+ if (unlikely (!(expr))) \
+ { \
+ _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+ return (retval); \
+ } \
+ } \
+ while (0)
+
+#define critical_if_fail(expr) \
+ do \
+ { \
+ if (unlikely (!(expr))) \
+ _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+ } \
+ while (0)
+
+/*
+ * Matrix
+ */
+
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+
+pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t *t,
+ const pixman_vector_48_16_t *v,
+ pixman_vector_48_16_t *result);
+
+void
+pixman_transform_point_31_16_3d (const pixman_transform_t *t,
+ const pixman_vector_48_16_t *v,
+ pixman_vector_48_16_t *result);
+
+void
+pixman_transform_point_31_16_affine (const pixman_transform_t *t,
+ const pixman_vector_48_16_t *v,
+ pixman_vector_48_16_t *result);
+
+/*
+ * Timers
+ */
+
+#ifdef PIXMAN_TIMERS
+
+static inline uint64_t
+oil_profile_stamp_rdtsc (void)
+{
+ uint32_t hi, lo;
+
+ __asm__ __volatile__ ("rdtsc\n" : "=a" (lo), "=d" (hi));
+
+ return lo | (((uint64_t)hi) << 32);
+}
+
+#define OIL_STAMP oil_profile_stamp_rdtsc
+
+typedef struct pixman_timer_t pixman_timer_t;
+
+struct pixman_timer_t
+{
+ int initialized;
+ const char * name;
+ uint64_t n_times;
+ uint64_t total;
+ pixman_timer_t *next;
+};
+
+extern int timer_defined;
+
+void pixman_timer_register (pixman_timer_t *timer);
+
+#define TIMER_BEGIN(tname) \
+ { \
+ static pixman_timer_t timer ## tname; \
+ uint64_t begin ## tname; \
+ \
+ if (!timer ## tname.initialized) \
+ { \
+ timer ## tname.initialized = 1; \
+ timer ## tname.name = # tname; \
+ pixman_timer_register (&timer ## tname); \
+ } \
+ \
+ timer ## tname.n_times++; \
+ begin ## tname = OIL_STAMP ();
+
+#define TIMER_END(tname) \
+ timer ## tname.total += OIL_STAMP () - begin ## tname; \
+ }
+
+#else
+
+#define TIMER_BEGIN(tname)
+#define TIMER_END(tname)
+
+#endif /* PIXMAN_TIMERS */
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* PIXMAN_PRIVATE_H */
diff --git a/gfx/video_driver.c b/gfx/video_driver.c
index 3ef9650a65..e7f14acc46 100644
--- a/gfx/video_driver.c
+++ b/gfx/video_driver.c
@@ -59,6 +59,9 @@ static const video_driver_t *video_drivers[] = {
#endif
#ifdef HAVE_EXYNOS
&video_exynos,
+#endif
+#ifdef HAVE_SUNXI
+ &video_sunxi,
#endif
&video_null,
NULL,
diff --git a/gfx/video_driver.h b/gfx/video_driver.h
index 054fbc1ee3..972454dfd4 100644
--- a/gfx/video_driver.h
+++ b/gfx/video_driver.h
@@ -186,6 +186,7 @@ extern video_driver_t video_sdl2;
extern video_driver_t video_vg;
extern video_driver_t video_omap;
extern video_driver_t video_exynos;
+extern video_driver_t video_sunxi;
extern video_driver_t video_null;
enum rarch_display_type
diff --git a/qb/config.libs.sh b/qb/config.libs.sh
index 2db9ab6050..c497f18892 100644
--- a/qb/config.libs.sh
+++ b/qb/config.libs.sh
@@ -348,6 +348,6 @@ add_define_make OS "$OS"
# Creates config.mk and config.h.
add_define_make GLOBAL_CONFIG_DIR "$GLOBAL_CONFIG_DIR"
-VARS="RGUI LAKKA GLUI XMB ALSA OSS OSS_BSD OSS_LIB AL RSOUND ROAR JACK COREAUDIO PULSE SDL SDL2 D3D9 DINPUT WINXINPUT DSOUND XAUDIO OPENGL EXYNOS OMAP GLES GLES3 VG EGL KMS GBM DRM DYLIB GETOPT_LONG THREADS CG LIBXML2 ZLIB DYNAMIC FFMPEG AVCODEC AVFORMAT AVUTIL SWSCALE FREETYPE XKBCOMMON XVIDEO X11 XEXT XF86VM XINERAMA WAYLAND MALI_FBDEV VIVANTE_FBDEV NETWORKING NETPLAY NETWORK_CMD STDIN_CMD COMMAND SOCKET_LEGACY FBO STRL STRCASESTR MMAP PYTHON FFMPEG_ALLOC_CONTEXT3 FFMPEG_AVCODEC_OPEN2 FFMPEG_AVIO_OPEN FFMPEG_AVFORMAT_WRITE_HEADER FFMPEG_AVFORMAT_NEW_STREAM FFMPEG_AVCODEC_ENCODE_AUDIO2 FFMPEG_AVCODEC_ENCODE_VIDEO2 BSV_MOVIE VIDEOCORE NEON FLOATHARD FLOATSOFTFP UDEV V4L2 AV_CHANNEL_LAYOUT 7ZIP PARPORT"
+VARS="RGUI LAKKA GLUI XMB ALSA OSS OSS_BSD OSS_LIB AL RSOUND ROAR JACK COREAUDIO PULSE SDL SDL2 D3D9 DINPUT WINXINPUT DSOUND XAUDIO OPENGL EXYNOS SUNXI OMAP GLES GLES3 VG EGL KMS GBM DRM DYLIB GETOPT_LONG THREADS CG LIBXML2 ZLIB DYNAMIC FFMPEG AVCODEC AVFORMAT AVUTIL SWSCALE FREETYPE XKBCOMMON XVIDEO X11 XEXT XF86VM XINERAMA WAYLAND MALI_FBDEV VIVANTE_FBDEV NETWORKING NETPLAY NETWORK_CMD STDIN_CMD COMMAND SOCKET_LEGACY FBO STRL STRCASESTR MMAP PYTHON FFMPEG_ALLOC_CONTEXT3 FFMPEG_AVCODEC_OPEN2 FFMPEG_AVIO_OPEN FFMPEG_AVFORMAT_WRITE_HEADER FFMPEG_AVFORMAT_NEW_STREAM FFMPEG_AVCODEC_ENCODE_AUDIO2 FFMPEG_AVCODEC_ENCODE_VIDEO2 BSV_MOVIE VIDEOCORE NEON FLOATHARD FLOATSOFTFP UDEV V4L2 AV_CHANNEL_LAYOUT 7ZIP PARPORT"
create_config_make config.mk $VARS
create_config_header config.h $VARS
diff --git a/qb/config.params.sh b/qb/config.params.sh
index a054396b41..c6bb3e6c67 100644
--- a/qb/config.params.sh
+++ b/qb/config.params.sh
@@ -25,6 +25,7 @@ HAVE_OMAP=no # Enable OMAP video support
HAVE_XINERAMA=auto # Disable Xinerama support.
HAVE_KMS=auto # Enable KMS context support
HAVE_EXYNOS=no # Enable Exynos video support
+HAVE_SUNXI=no # Enable Sunxi video support
HAVE_EGL=auto # Enable EGL context support
HAVE_VG=auto # Enable OpenVG support
HAVE_CG=auto # Enable Cg shader support
diff --git a/settings.c b/settings.c
index 553a05dd4c..cfc05ce225 100644
--- a/settings.c
+++ b/settings.c
@@ -155,6 +155,8 @@ const char *config_get_default_video(void)
return "omap";
case VIDEO_EXYNOS:
return "exynos";
+ case VIDEO_SUNXI:
+ return "sunxi";
default:
break;
}
diff --git a/settings_data.c b/settings_data.c
index 4abd61e121..ce64aeda0d 100644
--- a/settings_data.c
+++ b/settings_data.c
@@ -1969,6 +1969,12 @@ int setting_data_get_description(const char *label, char *msg,
" \n"
"Performance for software rendered cores \n"
"should be optimal.");
+ else if (!strcmp(g_settings.video.driver, "sunxi"))
+ snprintf(msg, sizeof_msg,
+ " -- Sunxi-G2D Video Driver. \n"
+ " \n"
+ "This is a low-level Sunxi video driver. \n"
+ "Uses the G2D block in Allwinner SoCs.");
else
snprintf(msg, sizeof_msg,
" -- Current Video driver.");