update to libvpx b46243d from repo https://chromium.googlesource.com/webm/libvpx
libs/libvpx/third_party/libyuv/source/scale_gcc.cc (vendored) | 493
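Note: the diff below renames the SSE2 row-scaling kernels to SSSE3 (they now set up pmaddubsw weights), adds AVX2 variants, and replaces ScaleAddRows with a single-row ScaleAddRow. As a rough scalar sketch (not part of the commit; types, names, and bodies are assumptions based on the libyuv C reference), the three 2:1 row kernels compute:

#include <stddef.h>
#include <stdint.h>

// ScaleRowDown2: keep the odd-indexed source pixel of every pair.
static void ScaleRowDown2_C_sketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];
  }
}

// ScaleRowDown2Linear: average each horizontal pair with rounding.
static void ScaleRowDown2Linear_C_sketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
  }
}

// ScaleRowDown2Box: average a 2x2 box across two rows with rounding.
static void ScaleRowDown2Box_C_sketch(const uint8_t* src, ptrdiff_t src_stride,
                                      uint8_t* dst, int dst_width) {
  const uint8_t* s = src;
  const uint8_t* t = src + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}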
@@ -9,6 +9,7 @@
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
@@ -16,7 +17,8 @@ extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -96,8 +98,8 @@ static uvec16 kScaleAb2 =
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@@ -118,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"

LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -145,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"

LABELALIGN
"1: \n"
@@ -162,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"psrlw $0x1,%%xmm0 \n"
"psrlw $0x1,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -186,7 +188,105 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
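The 2x2 box path above averages two rows and divides by 4 with rounding. One way to read the new instruction sequence in this hunk (my sketch, not libyuv code, assuming the all-ones pmaddubsw weights in xmm4 and a zeroed xmm5):

#include <stdint.h>

// Per-output-byte model: pmaddubsw with 1s sums each horizontal pair into a
// 16-bit lane, paddw adds the two rows, then psrlw and pavgw-with-zero give a
// rounded divide by 4.
static inline uint8_t Box2x2_pmaddubsw_model(uint8_t a, uint8_t b,
                                             uint8_t c, uint8_t d) {
  uint16_t row0 = (uint16_t)a + b;   // pmaddubsw %%xmm4 (weights are all 1)
  uint16_t row1 = (uint16_t)c + d;
  uint16_t sum = row0 + row1;        // paddw
  sum >>= 1;                         // psrlw $0x1
  return (uint8_t)((sum + 1) >> 1);  // pavgw with zero = rounding >>1
}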

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1"
);
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"

LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"

LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -214,12 +314,15 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}

void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stridex3 = 0;
intptr_t stridex3;
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0x8,%%xmm7 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
"psllw $0x3,%%xmm5 \n"
"lea " MEMLEA4(0x00,4,4,2) ",%3 \n"

LABELALIGN
@@ -228,31 +331,29 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm5,%%xmm3 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm7,%%xmm2 \n"
"pand %%xmm7,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm7,%%xmm2 \n"
"pavgw %%xmm2,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
@@ -260,13 +361,100 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(stridex3) // %3
"=&r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
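The rewritten 4:1 box filter sums four source rows per output byte and rounds with the add/shift at the end of the loop (xmm5 is built as 1<<3 per 16-bit lane, so the final "paddw %%xmm5" plus "psrlw $0x4" is a rounded divide by 16). A scalar sketch of that result, with names and the scalar form being my own:

#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown4Box_C_sketch(const uint8_t* src, ptrdiff_t stride,
                                      uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (int r = 0; r < 4; ++r) {
      const uint8_t* row = src + r * stride;
      sum += row[4 * x] + row[4 * x + 1] + row[4 * x + 2] + row[4 * x + 3];
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // add 8, shift 4: rounded /16
  }
}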


#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"

LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
"vpsrlw $0x4,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(src_stride * 3)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
@@ -574,61 +762,89 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
asm volatile (
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"

LABELALIGN
"1: \n"
"movdqu " MEMACCESS(3) ",%%xmm2 \n"
"add %6,%3 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"movdqa %%xmm3,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpckhbw %%xmm5,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n"
"jg 1b \n"

"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
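The ScaleAddRows_SSE2 -> ScaleAddRow_SSE2 change drops the internal height loop: the new function accumulates a single row into a uint16 destination (16 pixels per iteration via saturating paddusw) and the caller iterates over rows. A scalar sketch of the new contract, with the function name and loop form assumed by me:

#include <stdint.h>

static void ScaleAddRow_C_sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                 int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];  // widen byte to 16 bits and accumulate
  }
}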


#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"

LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
"vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
"vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
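kFsub80 and kFadd40 implement the signed-pixel trick the comments describe: the pixels are biased by -128 so they fit pmaddubsw's signed operand, and because the two filter weights sum to 128, adding 0x4040 (128*128 to undo the bias, plus 64 for rounding) before the 7-bit shift restores the unsigned result. A scalar model of one output pixel; the function name and scalar form are mine, not libyuv's:

#include <stdint.h>

static inline uint8_t FilterPixel_model(uint8_t p0, uint8_t p1, int f /* 0..127 */) {
  int w1 = f, w0 = 128 - f;                    // weights sum to 128
  int acc = (p0 - 128) * w0 + (p1 - 128) * w1; // psubb kFsub80, then pmaddubsw
  acc += 0x4040;                               // kFadd40 = 128*128 (undo bias) + 64 (round)
  return (uint8_t)(acc >> 7);                  // psrlw $0x7, packuswb
}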

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
intptr_t x0, x1, temp_pixel;
asm volatile (
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001

"pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n"
"jl 29f \n"
@@ -650,16 +866,19 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
"pxor %%xmm6,%%xmm1 \n"
"pmaddubsw %%xmm1,%%xmm0 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddw %9,%%xmm1 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n"
"sub $0x2,%5 \n"
"subl $0x2,%5 \n"
"jge 2b \n"

LABELALIGN
@@ -670,23 +889,37 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"paddusb %%xmm7,%%xmm2 \n"
"pmaddubsw %%xmm0,%%xmm2 \n"
"paddw %9,%%xmm2 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2," MEMACCESS(0) " \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+a"(temp_pixel), // %2
"+r"(x0), // %3
"+r"(x1), // %4
"+rm"(dst_width) // %5
: "rm"(x), // %6
"rm"(dx) // %7
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"=&a"(temp_pixel), // %2
"=&r"(x0), // %3
"=&r"(x1), // %4
#if defined(__x86_64__)
"+rm"(dst_width) // %5
#else
"+m"(dst_width) // %5
#endif
: "rm"(x), // %6
"rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
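A pattern repeated throughout this diff: scratch variables that used to be zero-initialized and passed as "+r" read-write operands (temp_pixel, x0, x1, stridex3, src_stepx_x12) are now left uninitialized and declared as "=&r" write-only, early-clobber outputs, since the asm writes them before reading any input. A toy sketch of the distinction (my example, not libyuv code):

#include <stdint.h>

// Returns n, computed through a scratch register the asm clears before use.
static inline uint64_t passthrough(uint64_t n) {
  uint64_t acc;                // no initialization needed for "=&r"
  asm("xor %0,%0 \n"           // %0 is written here, before %1 is read...
      "add %1,%0 \n"           // ...so '&' keeps %0 and %1 in different registers
      : "=&r"(acc)
      : "r"(n)
      : "cc");
  return acc;
}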

@@ -795,7 +1028,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0;
intptr_t src_stepx_x12;
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@@ -813,11 +1046,11 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"+r"(src_stepx_x12) // %4
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"=&r"(src_stepx_x12) // %4
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
@@ -829,7 +1062,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0;
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
@@ -858,12 +1091,12 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+rm"(dst_width), // %3
"+r"(src_stepx_x12), // %4
"+r"(row1) // %5
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(row1) // %5
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
@@ -871,7 +1104,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0;
intptr_t x0, x1;
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
@@ -924,8 +1157,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
"movd %%xmm0," MEMACCESS(2) " \n"
"99: \n"
: "+a"(x0), // %0
"+d"(x1), // %1
: "=&a"(x0), // %0
"=&d"(x1), // %1
"+r"(dst_argb), // %2
"+r"(src_argb), // %3
"+r"(dst_width) // %4
@@ -976,7 +1209,7 @@ static uvec8 kShuffleFractions = {
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0;
intptr_t x0, x1;
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
@@ -1039,8 +1272,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+rm"(dst_width), // %2
"+r"(x0), // %3
"+r"(x1) // %4
"=&r"(x0), // %3
"=&r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc", NACL_R14