This commit is contained in:
Michael Jerris
2016-02-23 14:32:34 -05:00
parent 1ffc801957
commit d57c6ec48a
154 changed files with 65122 additions and 0 deletions

View File

@@ -0,0 +1,340 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/compare.h"
#include <float.h>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
}
#endif
#if defined(HAS_HASHDJB2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
HashDjb2_SSE = HashDjb2_AVX2;
}
#endif
while (count >= (uint64)(kBlockSize)) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
}
remainder = (int)(count) & ~15;
if (remainder) {
seed = HashDjb2_SSE(src, remainder, seed);
src += remainder;
count -= remainder;
}
remainder = (int)(count) & 15;
if (remainder) {
seed = HashDjb2_C(src, remainder, seed);
}
return seed;
}
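// Illustrative sketch, not part of upstream libyuv: HashDjb2 folds the whole
// buffer with the classic djb2 recurrence hash = hash * 33 + byte, handing
// 32 KB blocks to the SSE4.1/AVX2 kernels when available, so hashing a frame
// is a single call from the recommended seed. The buffer name and length are
// hypothetical.
static uint32 HashFrameExample(const uint8* frame, uint64 frame_size) {
  return HashDjb2(frame, frame_size, 5381);
}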
static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
}
if (width & 1) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
}
return 0;
}
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
uint32 fourcc = 0;
int h;
// Coalesce rows.
if (stride_argb == width * 4) {
width *= height;
height = 1;
stride_argb = 0;
}
for (h = 0; h < height && fourcc == 0; ++h) {
fourcc = ARGBDetectRow_C(argb, width);
argb += stride_argb;
}
return fourcc;
}
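// Illustrative sketch, not part of upstream libyuv: ARGBDetect assumes the
// image is fully opaque and infers the byte order from which offsets hold a
// constant 0xFF alpha. A hypothetical caller might fall back to a default
// when the result is 0 (every inspected byte was 0xFF, so the order is
// ambiguous):
static uint32 DetectPixelOrderExample(const uint8* pixels, int stride,
                                      int width, int height) {
  uint32 fourcc = ARGBDetect(pixels, stride, width, height);
  return fourcc ? fourcc : FOURCC_ARGB;  // Fallback choice is hypothetical.
}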
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
int count) {
// Each squared difference is at most 255^2 = 65025.
// Up to 65536 of those can be summed and still fit in a uint32.
// After each block of 65536 pixels, accumulate into a uint64.
const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31;
uint64 sse = 0;
int i;
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
// Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2;
}
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+: sse)
#endif
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
}
src_a += count & ~(kBlockSize - 1);
src_b += count & ~(kBlockSize - 1);
if (remainder) {
sse += SumSquareError(src_a, src_b, remainder);
src_a += remainder;
src_b += remainder;
}
remainder = count & 31;
if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder);
}
return sse;
}
LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
uint64 sse = 0;
int h;
// Coalesce rows.
if (stride_a == width &&
stride_b == width) {
width *= height;
height = 1;
stride_a = stride_b = 0;
}
for (h = 0; h < height; ++h) {
sse += ComputeSumSquareError(src_a, src_b, width);
src_a += stride_a;
src_b += stride_b;
}
return sse;
}
LIBYUV_API
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
double psnr;
if (sse > 0) {
double mse = (double)(count) / (double)(sse);
psnr = 10.0 * log10(255.0 * 255.0 * mse);
} else {
psnr = kMaxPsnr; // Limit to prevent divide by 0
}
if (psnr > kMaxPsnr)
psnr = kMaxPsnr;
return psnr;
}
LIBYUV_API
double CalcFramePsnr(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
const uint64 samples = width * height;
const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
src_b, stride_b,
width, height);
return SumSquareErrorToPsnr(sse, samples);
}
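// Illustrative sketch, not part of upstream libyuv: PSNR of a single plane
// pair, e.g. two Y planes of the same size. An average squared error of 1.0
// per sample gives 10 * log10(255 * 255) ~= 48.13 dB, and identical planes
// return kMaxPsnr. Plane names and strides are hypothetical.
static double PlanePsnrExample(const uint8* plane_a, int stride_a,
                               const uint8* plane_b, int stride_b,
                               int width, int height) {
  return CalcFramePsnr(plane_a, stride_a, plane_b, stride_b, width, height);
}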
LIBYUV_API
double I420Psnr(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height) {
const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
src_y_b, stride_y_b,
width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
src_u_b, stride_u_b,
width_uv, height_uv);
const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
src_v_b, stride_v_b,
width_uv, height_uv);
const uint64 samples = width * height + 2 * (width_uv * height_uv);
const uint64 sse = sse_y + sse_u + sse_v;
return SumSquareErrorToPsnr(sse, samples);
}
static const int64 cc1 = 26634; // (64^2 * (.01*255)^2)
static const int64 cc2 = 239708; // (64^2 * (.03*255)^2)
static double Ssim8x8_C(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b) {
int64 sum_a = 0;
int64 sum_b = 0;
int64 sum_sq_a = 0;
int64 sum_sq_b = 0;
int64 sum_axb = 0;
int i;
for (i = 0; i < 8; ++i) {
int j;
for (j = 0; j < 8; ++j) {
sum_a += src_a[j];
sum_b += src_b[j];
sum_sq_a += src_a[j] * src_a[j];
sum_sq_b += src_b[j] * src_b[j];
sum_axb += src_a[j] * src_b[j];
}
src_a += stride_a;
src_b += stride_b;
}
{
const int64 count = 64;
// scale the constants by number of pixels
const int64 c1 = (cc1 * count * count) >> 12;
const int64 c2 = (cc2 * count * count) >> 12;
const int64 sum_a_x_sum_b = sum_a * sum_b;
const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
(2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
const int64 sum_a_sq = sum_a*sum_a;
const int64 sum_b_sq = sum_b*sum_b;
const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq +
count * sum_sq_b - sum_b_sq + c2);
if (ssim_d == 0.0) {
return DBL_MAX;
}
return ssim_n * 1.0 / ssim_d;
}
}
// We use an 8x8 moving window, with each window starting on the 4x4 pixel
// grid. This arrangement allows the windows to overlap block boundaries to
// penalize blocking artifacts.
LIBYUV_API
double CalcFrameSsim(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
int samples = 0;
double ssim_total = 0;
double (*Ssim8x8)(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b) = Ssim8x8_C;
// Sample points start at each 4x4 location.
int i;
for (i = 0; i < height - 8; i += 4) {
int j;
for (j = 0; j < width - 8; j += 4) {
ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
samples++;
}
src_a += stride_a * 4;
src_b += stride_b * 4;
}
ssim_total /= samples;
return ssim_total;
}
LIBYUV_API
double I420Ssim(const uint8* src_y_a, int stride_y_a,
const uint8* src_u_a, int stride_u_a,
const uint8* src_v_a, int stride_v_a,
const uint8* src_y_b, int stride_y_b,
const uint8* src_u_b, int stride_u_b,
const uint8* src_v_b, int stride_v_b,
int width, int height) {
const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
src_y_b, stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
src_u_b, stride_u_b,
width_uv, height_uv);
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
src_v_b, stride_v_b,
width_uv, height_uv);
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
}
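// Illustrative sketch, not part of upstream libyuv: SSIM between two frames
// stored as contiguous I420 buffers (a full Y plane followed by quarter-size
// U and V planes). Buffer names are hypothetical; callers that already track
// separate plane pointers and strides pass those directly.
static double I420BufferSsimExample(const uint8* buf_a, const uint8* buf_b,
                                    int width, int height) {
  int halfwidth = (width + 1) / 2;
  int halfheight = (height + 1) / 2;
  const uint8* u_a = buf_a + width * height;
  const uint8* v_a = u_a + halfwidth * halfheight;
  const uint8* u_b = buf_b + width * height;
  const uint8* v_b = u_b + halfwidth * halfheight;
  return I420Ssim(buf_a, width, u_a, halfwidth, v_a, halfwidth,
                  buf_b, width, u_b, halfwidth, v_b, halfwidth,
                  width, height);
}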
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,44 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse = 0u;
int i;
for (i = 0; i < count; ++i) {
int diff = src_a[i] - src_b[i];
sse += (uint32)(diff * diff);
}
return sse;
}
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
uint32 hash = seed;
int i;
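// Each iteration computes hash = hash * 33 + src[i]; the (hash << 5) + hash
// form below is just 33 * hash written with a shift.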
for (i = 0; i < count; ++i) {
hash += (hash << 5) + src[i];
}
return hash;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,151 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm2 \n"
"pmaddwd %%xmm1,%%xmm1 \n"
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
return sse;
}
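// Illustrative note, not part of upstream libyuv: the psubusb/por pair above
// computes |a - b| per byte without a signed subtract, because the two
// saturating differences are zero on the "wrong" side. A scalar sketch of the
// same trick:
static inline uint8 AbsDiffByteExample(uint8 a, uint8 b) {
  uint8 d0 = (uint8)(a > b ? a - b : 0);  // saturating a - b
  uint8 d1 = (uint8)(b > a ? b - a : 0);  // saturating b - a
  return (uint8)(d0 | d1);                // at most one is nonzero
}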
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
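// Illustrative note, not part of upstream libyuv: the tables above let each
// iteration of HashDjb2_SSE41 below advance the djb2 hash by 16 bytes at
// once, using
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15],
// which is the 16-step unrolling of hash = hash * 33 + byte. kHash16x33
// holds 33^16 (mod 2^32) and kHashMul0..kHashMul3 hold 33^15 down to 33^0.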
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash;
asm volatile (
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
"pmulld %%xmm6,%%xmm0 \n"
"movdqa %5,%%xmm5 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm7,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm7,%%xmm3 \n"
"pmulld %%xmm5,%%xmm3 \n"
"movdqa %6,%%xmm5 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpckhwd %%xmm7,%%xmm4 \n"
"pmulld %%xmm5,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"punpckhbw %%xmm7,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm7,%%xmm2 \n"
"pmulld %%xmm5,%%xmm2 \n"
"movdqa %8,%%xmm5 \n"
"punpckhwd %%xmm7,%%xmm1 \n"
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
"=g"(hash) // %3
: "m"(kHash16x33), // %4
"m"(kHashMul0), // %5
"m"(kHashMul1), // %6
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
return hash;
}
#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,66 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,64 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,222 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
pxor xmm0, xmm0
pxor xmm5, xmm5
wloop:
movdqu xmm1, [eax]
lea eax, [eax + 16]
movdqu xmm2, [edx]
lea edx, [edx + 16]
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
por xmm1, xmm2
movdqa xmm2, xmm1
punpcklbw xmm1, xmm5
punpckhbw xmm2, xmm5
pmaddwd xmm1, xmm1
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
paddd xmm0, xmm1
pshufd xmm1, xmm0, 0x01
paddd xmm0, xmm1
movd eax, xmm0
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
mov ecx, [esp + 12] // count
vpxor ymm0, ymm0, ymm0 // sum
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
vpunpckhbw ymm1, ymm1, ymm5
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
vpaddd ymm0, ymm0, ymm1
vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0
vzeroupper
ret
}
}
#endif // _MSC_VER >= 1700
uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
ret
}
}
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
sub ecx, 16
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,392 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/convert.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#ifdef HAVE_JPEG
struct I420Buffers {
uint8* y;
int y_stride;
uint8* u;
int u_stride;
uint8* v;
int v_stride;
int w;
int h;
};
static void JpegCopyI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I420Copy(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
static void JpegI422ToI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I422ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
static void JpegI444ToI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I444ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
static void JpegI411ToI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I411ToI420(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
static void JpegI400ToI420(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
I400ToI420(data[0], strides[0],
dest->y, dest->y_stride,
dest->u, dest->u_stride,
dest->v, dest->v_stride,
dest->w, rows);
dest->y += rows * dest->y_stride;
dest->u += ((rows + 1) >> 1) * dest->u_stride;
dest->v += ((rows + 1) >> 1) * dest->v_stride;
dest->h -= rows;
}
// Query size of MJPG in pixels.
LIBYUV_API
int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height) {
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret) {
*width = mjpeg_decoder.GetWidth();
*height = mjpeg_decoder.GetHeight();
}
mjpeg_decoder.UnloadFrame();
return ret ? 0 : -1; // -1 for runtime failure.
}
// MJPG (Motion JPeg) to I420
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int w, int h,
int dw, int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
// YUV411
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
}
}
return ret ? 0 : 1;
}
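// Illustrative sketch, not part of upstream libyuv: a typical caller first
// queries the JPEG dimensions with MJPGSize, then decodes into a contiguous
// I420 buffer it has allocated (w * h luma bytes followed by two quarter-size
// chroma planes). The buffer handling is hypothetical; return codes are
// passed through (0 means success).
static int DecodeMjpegToI420Example(const uint8* jpeg, size_t jpeg_size,
                                    uint8* i420, int* out_width,
                                    int* out_height) {
  int w = 0;
  int h = 0;
  if (MJPGSize(jpeg, jpeg_size, &w, &h) != 0) {
    return -1;  // Could not parse the JPEG header.
  }
  int halfw = (w + 1) / 2;
  int halfh = (h + 1) / 2;
  uint8* y = i420;
  uint8* u = y + w * h;
  uint8* v = u + halfw * halfh;
  *out_width = w;
  *out_height = h;
  return MJPGToI420(jpeg, jpeg_size, y, w, u, halfw, v, halfw, w, h, w, h);
}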
#ifdef HAVE_JPEG
struct ARGBBuffers {
uint8* argb;
int argb_stride;
int w;
int h;
};
static void JpegI420ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I420ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
static void JpegI422ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I422ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
static void JpegI444ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I444ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
static void JpegI411ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I411ToARGB(data[0], strides[0],
data[1], strides[1],
data[2], strides[2],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
static void JpegI400ToARGB(void* opaque,
const uint8* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
I400ToARGB(data[0], strides[0],
dest->argb, dest->argb_stride,
dest->w, rows);
dest->argb += rows * dest->argb_stride;
dest->h -= rows;
}
// MJPG (Motion JPeg) to ARGB
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
uint8* argb, int argb_stride,
int w, int h,
int dw, int dh) {
if (sample_size == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
if (ret && (mjpeg_decoder.GetWidth() != w ||
mjpeg_decoder.GetHeight() != h)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
ARGBBuffers bufs = { argb, argb_stride, dw, dh };
// YUV420
if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
// YUV411
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
}
}
return ret ? 0 : 1;
}
#endif
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,306 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
int ConvertToARGB(const uint8* sample, size_t sample_size,
uint8* crop_argb, int argb_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
// and then rotate the ARGB to the final destination buffer.
// For in-place conversion, if the destination crop_argb is the same as the
// source sample, the temporary buffer is also used.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
uint8* tmp_argb = crop_argb;
int tmp_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (crop_argb == NULL || sample == NULL ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
if (need_buf) {
int argb_size = crop_width * abs_crop_height * 4;
rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
crop_argb = rotate_buffer;
argb_stride = crop_width * 4;  // ARGB is 4 bytes per pixel.
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToARGB(src, aligned_src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToARGB(src, src_width * 3,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToARGB(src, src_width * 4,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToARGB(src, src_width * 2,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
r = NV12ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
r = NV21ToARGB(src, src_width,
src_uv, aligned_src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_J420: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
case FOURCC_YV24: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u;
const uint8* src_v;
if (format == FOURCC_YV24) {
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
} else {
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToARGB(src_y, src_width,
src_u, src_width,
src_v, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I411: {
int quarterwidth = (src_width + 3) / 4;
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u = sample + src_width * abs_src_height +
quarterwidth * crop_y + crop_x / 4;
const uint8* src_v = sample + src_width * abs_src_height +
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
r = I411ToARGB(src_y, src_width,
src_u, quarterwidth,
src_v, quarterwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size,
crop_argb, argb_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
r = -1; // unknown fourcc - return failure code.
}
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
tmp_argb, tmp_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
}
return r;
}
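// Illustrative sketch, not part of upstream libyuv: converting a packed YUY2
// camera frame to ARGB with no cropping and no rotation. Buffer names are
// hypothetical; kRotate0 and FOURCC_YUY2 are the library's rotation and
// fourcc enums.
static int Yuy2FrameToArgbExample(const uint8* yuy2, size_t yuy2_size,
                                  uint8* argb, int width, int height) {
  return ConvertToARGB(yuy2, yuy2_size,
                       argb, width * 4,  // ARGB stride in bytes
                       0, 0,             // no crop offset
                       width, height,    // source size
                       width, height,    // crop size = full frame
                       kRotate0, FOURCC_YUY2);
}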
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,339 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
int ConvertToI420(const uint8* sample,
size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int crop_x, int crop_y,
int src_width, int src_height,
int crop_width, int crop_height,
enum RotationMode rotation,
uint32 fourcc) {
uint32 format = CanonicalFourCC(fourcc);
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
int tmp_y_stride = y_stride;
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if the destination y plane is the same as the
// source sample, the temporary buffer is also used.
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
y = rotate_buffer;
u = y + y_size;
v = u + uv_size;
y_stride = crop_width;
u_stride = v_stride = ((crop_width + 1) / 2);
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = YUY2ToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
r = UYVYToI420(src, aligned_src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB1555ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
r = ARGB4444ToI420(src, src_width * 2,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RGB24ToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
r = RAWToI420(src, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ARGBToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = BGRAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
r = ABGRToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
r = RGBAToI420(src, src_width * 4,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
v, v_stride,
u, u_stride,
crop_width, inv_crop_height, rotation);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
r = M420ToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
src_v = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420Rotate(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height, rotation);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_u = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
} else {
src_u = sample + src_width * abs_src_height +
halfwidth * crop_y + crop_x / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToI420(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
case FOURCC_YV24: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u;
const uint8* src_v;
if (format == FOURCC_YV24) {
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
} else {
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToI420(src_y, src_width,
src_u, src_width,
src_v, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I411: {
int quarterwidth = (src_width + 3) / 4;
const uint8* src_y = sample + src_width * crop_y + crop_x;
const uint8* src_u = sample + src_width * abs_src_height +
quarterwidth * crop_y + crop_x / 4;
const uint8* src_v = sample + src_width * abs_src_height +
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
r = I411ToI420(src_y, src_width,
src_u, quarterwidth,
src_v, quarterwidth,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToI420(sample, sample_size,
y, y_stride,
u, u_stride,
v, v_stride,
src_width, abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
r = -1; // unknown fourcc - return failure code.
}
if (need_buf) {
if (!r) {
r = I420Rotate(y, y_stride,
u, u_stride,
v, v_stride,
tmp_y, tmp_y_stride,
tmp_u, tmp_u_stride,
tmp_v, tmp_v_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
}
return r;
}
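// Illustrative sketch, not part of upstream libyuv: the common camera case of
// converting a full NV21 preview frame to I420. Plane pointers are
// hypothetical caller-allocated buffers.
static int Nv21FrameToI420Example(const uint8* nv21, size_t nv21_size,
                                  uint8* y, uint8* u, uint8* v,
                                  int width, int height) {
  int halfwidth = (width + 1) / 2;
  // For kRotate90/kRotate270 the destination becomes height x width, so the
  // destination strides would be based on height instead.
  return ConvertToI420(nv21, nv21_size,
                       y, width,
                       u, halfwidth,
                       v, halfwidth,
                       0, 0,            // no crop offset
                       width, height,   // source size
                       width, height,   // crop size = full frame
                       kRotate0, FOURCC_NV21);
}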
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,299 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/cpu_id.h"
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
#if !defined(__native_client__)
#include <stdlib.h> // For getenv()
#endif
// For ArmCpuCaps(), which is unit tested on all platforms.
#include <stdio.h>
#include <string.h>
#include "libyuv/basic_types.h" // For CPU_X86
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid the additional check.
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
mov edi, cpu_info
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
}
#else // Visual C but not x86
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER) && !defined(__clang__)
uint32 info_ebx, info_edx;
asm volatile (
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D" (info_ebx),
#else
"cpuid \n"
: "=b" (info_ebx),
#endif // defined( __i386__) && defined(__PIC__)
"+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
cpu_info[0] = info_eax;
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER) && !defined(__clang__)
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
// For VS2010 and earlier, emit can be used:
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
// __asm {
// xor ecx, ecx // xcr 0
// xgetbv
// mov xcr0, eax
// }
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
// https://code.google.com/p/libyuv/issues/detail?id=529
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of the ymm registers.
int GetXCR0() {
uint32 xcr0 = 0u;
#if (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
if (p && (p[6] == ' ' || p[6] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
}
}
fclose(f);
return 0;
}
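// Illustrative note, not part of upstream libyuv: on a 32 bit ARM Linux
// device the matching line typically looks something like
//   Features : half thumb fastmult vfp edsp neon vfpv3 tls
// while aarch64 kernels spell the flag "asimd" instead of "neon", which is
// why both substrings are searched for above.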
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
// Test an environment variable used to disable CPU features. Any non-zero
// value disables the feature. Zero is ignored, to make it easy to toggle the
// variable on and off.
#if !defined(__native_client__) && !defined(_M_ARM)
static LIBYUV_BOOL TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
return LIBYUV_TRUE;
}
}
return LIBYUV_FALSE;
}
#else // nacl does not support getenv().
static LIBYUV_BOOL TestEnv(const char*) {
return LIBYUV_FALSE;
}
#endif
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
  // AVX requires that the CPU supports AVX, XSAVE and OSXSAVE, for xgetbv.
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
}
#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
}
#endif
#if defined(__mips__) && defined(__linux__)
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
}
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info = kCpuHasNEON;
// For aarch64 (arm64), the /proc/cpuinfo feature list is incomplete, e.g. it
// has no neon flag, so Neon is hard coded on for aarch64 below.
#endif
#if defined(__aarch64__)
cpu_info = kCpuHasNEON;
#else
  // On Linux ARM, parse /proc/cpuinfo to detect Neon.
cpu_info = ArmCpuCaps("/proc/cpuinfo");
#endif
cpu_info |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info &= ~kCpuHasNEON;
}
#endif  // defined(__arm__) || defined(__aarch64__)
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
}
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
}
// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;
}
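// Illustrative usage (not part of the library): callers typically gate a SIMD
// path on these flags; the MyRowFunc_* names are hypothetical.
//   if (TestCpuFlag(kCpuHasSSSE3)) {
//     MyRowFunc_SSSE3(src, dst, width);
//   } else {
//     MyRowFunc_C(src, dst, width);
//   }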
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,570 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/mjpeg_decoder.h"
#ifdef HAVE_JPEG
#include <assert.h>
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif
struct FILE; // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
extern "C" {
#endif
#include <jpeglib.h>
#ifdef __cplusplus
} // extern "C"
#endif
#include "libyuv/planar_functions.h" // For CopyPlane().
namespace libyuv {
#ifdef HAVE_SETJMP
struct SetJmpErrorMgr {
jpeg_error_mgr base; // Must be at the top
jmp_buf setjmp_buffer;
};
#endif
const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
// Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
MJpegDecoder::MJpegDecoder()
: has_scanline_padding_(LIBYUV_FALSE),
num_outbufs_(0),
scanlines_(NULL),
scanlines_sizes_(NULL),
databuf_(NULL),
databuf_strides_(NULL) {
decompress_struct_ = new jpeg_decompress_struct;
source_mgr_ = new jpeg_source_mgr;
#ifdef HAVE_SETJMP
error_mgr_ = new SetJmpErrorMgr;
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
// Override standard exit()-based error handler.
error_mgr_->base.error_exit = &ErrorHandler;
#endif
decompress_struct_->client_data = NULL;
source_mgr_->init_source = &init_source;
source_mgr_->fill_input_buffer = &fill_input_buffer;
source_mgr_->skip_input_data = &skip_input_data;
source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
source_mgr_->term_source = &term_source;
jpeg_create_decompress(decompress_struct_);
decompress_struct_->src = source_mgr_;
buf_vec_.buffers = &buf_;
buf_vec_.len = 1;
}
MJpegDecoder::~MJpegDecoder() {
jpeg_destroy_decompress(decompress_struct_);
delete decompress_struct_;
delete source_mgr_;
#ifdef HAVE_SETJMP
delete error_mgr_;
#endif
DestroyOutputBuffers();
}
LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
if (!ValidateJpeg(src, src_len)) {
return LIBYUV_FALSE;
}
buf_.data = src;
buf_.len = static_cast<int>(src_len);
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called jpeg_read_header, it experienced an error, and we called
// longjmp() and rewound the stack to here. Return error.
return LIBYUV_FALSE;
}
#endif
if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
// ERROR: Bad MJPEG header
return LIBYUV_FALSE;
}
AllocOutputBuffers(GetNumComponents());
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_size = GetComponentScanlinesPerImcuRow(i);
if (scanlines_sizes_[i] != scanlines_size) {
if (scanlines_[i]) {
        delete [] scanlines_[i];
}
scanlines_[i] = new uint8* [scanlines_size];
scanlines_sizes_[i] = scanlines_size;
}
// We allocate padding for the final scanline to pad it up to DCTSIZE bytes
// to avoid memory errors, since jpeglib only reads full MCUs blocks. For
// the preceding scanlines, the padding is not needed/wanted because the
// following addresses will already be valid (they are the initial bytes of
// the next scanline) and will be overwritten when jpeglib writes out that
// next scanline.
int databuf_stride = GetComponentStride(i);
int databuf_size = scanlines_size * databuf_stride;
if (databuf_strides_[i] != databuf_stride) {
if (databuf_[i]) {
        delete [] databuf_[i];
}
databuf_[i] = new uint8[databuf_size];
databuf_strides_[i] = databuf_stride;
}
if (GetComponentStride(i) != GetComponentWidth(i)) {
has_scanline_padding_ = LIBYUV_TRUE;
}
}
return LIBYUV_TRUE;
}
static int DivideAndRoundUp(int numerator, int denominator) {
return (numerator + denominator - 1) / denominator;
}
static int DivideAndRoundDown(int numerator, int denominator) {
return numerator / denominator;
}
// Returns width of the last loaded frame.
int MJpegDecoder::GetWidth() {
return decompress_struct_->image_width;
}
// Returns height of the last loaded frame.
int MJpegDecoder::GetHeight() {
return decompress_struct_->image_height;
}
// Returns format of the last loaded frame. The return value is one of the
// kColorSpace* constants.
int MJpegDecoder::GetColorSpace() {
return decompress_struct_->jpeg_color_space;
}
// Number of color components in the color space.
int MJpegDecoder::GetNumComponents() {
return decompress_struct_->num_components;
}
// Sample factors of the n-th component.
int MJpegDecoder::GetHorizSampFactor(int component) {
return decompress_struct_->comp_info[component].h_samp_factor;
}
int MJpegDecoder::GetVertSampFactor(int component) {
return decompress_struct_->comp_info[component].v_samp_factor;
}
int MJpegDecoder::GetHorizSubSampFactor(int component) {
return decompress_struct_->max_h_samp_factor /
GetHorizSampFactor(component);
}
int MJpegDecoder::GetVertSubSampFactor(int component) {
return decompress_struct_->max_v_samp_factor /
GetVertSampFactor(component);
}
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
return decompress_struct_->max_v_samp_factor * DCTSIZE;
}
int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
int vs = GetVertSubSampFactor(component);
return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
}
int MJpegDecoder::GetComponentWidth(int component) {
int hs = GetHorizSubSampFactor(component);
return DivideAndRoundUp(GetWidth(), hs);
}
int MJpegDecoder::GetComponentHeight(int component) {
int vs = GetVertSubSampFactor(component);
return DivideAndRoundUp(GetHeight(), vs);
}
// Get width in bytes padded out to a multiple of DCTSIZE
int MJpegDecoder::GetComponentStride(int component) {
return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
}
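// Illustrative arithmetic (not part of the library): with DCTSIZE == 8, a
// component width of 161 gives a stride of (161 + 7) & ~7 == 168, i.e. the
// width rounded up to the next multiple of DCTSIZE.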
int MJpegDecoder::GetComponentSize(int component) {
return GetComponentWidth(component) * GetComponentHeight(component);
}
LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called jpeg_abort_decompress, it experienced an error, and we called
// longjmp() and rewound the stack to here. Return error.
return LIBYUV_FALSE;
}
#endif
jpeg_abort_decompress(decompress_struct_);
return LIBYUV_TRUE;
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
uint8** planes, int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called into jpeglib, it experienced an error sometime during this
// function call, and we called longjmp() and rewound the stack to here.
// Return error.
return LIBYUV_FALSE;
}
#endif
if (!StartDecode()) {
return LIBYUV_FALSE;
}
SetScanlinePointers(databuf_);
int lines_left = dst_height;
  // Compute the number of lines to skip to implement vertical crop.
  // TODO(fbarchard): Ensure skip is a multiple of the maximum component
  // subsample, i.e. 2.
int skip = (GetHeight() - dst_height) / 2;
if (skip > 0) {
// There is no API to skip lines in the output data, so we read them
// into the temp buffer.
while (skip >= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
skip -= GetImageScanlinesPerImcuRow();
}
if (skip > 0) {
// Have a partial iMCU row left over to skip. Must read it and then
// copy the parts we want into the destination.
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
assert(skip % GetVertSubSampFactor(i) == 0);
int rows_to_skip =
DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
rows_to_skip;
int data_to_skip = rows_to_skip * GetComponentStride(i);
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
}
}
// Read full MCUs but cropped horizontally
for (; lines_left > GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
if (lines_left > 0) {
// Have a partial iMCU row left over to decode.
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
int scanlines_to_copy =
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
CopyPlane(databuf_[i], GetComponentStride(i),
planes[i], GetComponentWidth(i),
GetComponentWidth(i), scanlines_to_copy);
planes[i] += scanlines_to_copy * GetComponentWidth(i);
}
}
return FinishDecode();
}
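// Illustrative usage (not part of the library): decoding into caller-allocated
// planes sized GetComponentWidth(i) x GetComponentHeight(i); y_plane, u_plane
// and v_plane are hypothetical buffers for a 3-component YCbCr frame.
//   uint8* planes[3] = { y_plane, u_plane, v_plane };
//   decoder.DecodeToBuffers(planes, decoder.GetWidth(), decoder.GetHeight());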
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
int dst_width, int dst_height) {
if (dst_width != GetWidth() ||
dst_height > GetHeight()) {
// ERROR: Bad dimensions
return LIBYUV_FALSE;
}
#ifdef HAVE_SETJMP
if (setjmp(error_mgr_->setjmp_buffer)) {
// We called into jpeglib, it experienced an error sometime during this
// function call, and we called longjmp() and rewound the stack to here.
// Return error.
return LIBYUV_FALSE;
}
#endif
if (!StartDecode()) {
return LIBYUV_FALSE;
}
SetScanlinePointers(databuf_);
int lines_left = dst_height;
  // TODO(fbarchard): Compute the number of lines to skip to implement vertical crop.
int skip = (GetHeight() - dst_height) / 2;
if (skip > 0) {
while (skip >= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
skip -= GetImageScanlinesPerImcuRow();
}
if (skip > 0) {
// Have a partial iMCU row left over to skip.
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
for (int i = 0; i < num_outbufs_; ++i) {
// TODO(fbarchard): Compute skip to avoid this
assert(skip % GetVertSubSampFactor(i) == 0);
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int data_to_skip = rows_to_skip * GetComponentStride(i);
// Change our own data buffer pointers so we can pass them to the
// callback.
databuf_[i] += data_to_skip;
}
int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
(*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
// Now change them back.
for (int i = 0; i < num_outbufs_; ++i) {
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
int data_to_skip = rows_to_skip * GetComponentStride(i);
databuf_[i] -= data_to_skip;
}
lines_left -= scanlines_to_copy;
}
}
// Read full MCUs until we get to the crop point.
for (; lines_left >= GetImageScanlinesPerImcuRow();
lines_left -= GetImageScanlinesPerImcuRow()) {
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
(*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
}
if (lines_left > 0) {
// Have a partial iMCU row left over to decode.
if (!DecodeImcuRow()) {
FinishDecode();
return LIBYUV_FALSE;
}
(*fn)(opaque, databuf_, databuf_strides_, lines_left);
}
return FinishDecode();
}
void init_source(j_decompress_ptr cinfo) {
fill_input_buffer(cinfo);
}
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
return FALSE;
}
cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
++buf_vec->pos;
return TRUE;
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}
void term_source(j_decompress_ptr cinfo) {
// Nothing to do.
}
#ifdef HAVE_SETJMP
void ErrorHandler(j_common_ptr cinfo) {
// This is called when a jpeglib command experiences an error. Unfortunately
// jpeglib's error handling model is not very flexible, because it expects the
// error handler to not return--i.e., it wants the program to terminate. To
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
// A formatted message can be output, but is unsafe for release.
#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
// ERROR: Error in jpeglib: buf
#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
}
#endif
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
if (num_outbufs != num_outbufs_) {
// We could perhaps optimize this case to resize the output buffers without
// necessarily having to delete and recreate each one, but it's not worth
// it.
DestroyOutputBuffers();
scanlines_ = new uint8** [num_outbufs];
scanlines_sizes_ = new int[num_outbufs];
databuf_ = new uint8* [num_outbufs];
databuf_strides_ = new int[num_outbufs];
for (int i = 0; i < num_outbufs; ++i) {
scanlines_[i] = NULL;
scanlines_sizes_[i] = 0;
databuf_[i] = NULL;
databuf_strides_[i] = 0;
}
num_outbufs_ = num_outbufs;
}
}
void MJpegDecoder::DestroyOutputBuffers() {
for (int i = 0; i < num_outbufs_; ++i) {
delete [] scanlines_[i];
delete [] databuf_[i];
}
delete [] scanlines_;
delete [] databuf_;
delete [] scanlines_sizes_;
delete [] databuf_strides_;
scanlines_ = NULL;
databuf_ = NULL;
scanlines_sizes_ = NULL;
databuf_strides_ = NULL;
num_outbufs_ = 0;
}
// JDCT_IFAST and do_block_smoothing improve performance substantially.
LIBYUV_BOOL MJpegDecoder::StartDecode() {
decompress_struct_->raw_data_out = TRUE;
decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
decompress_struct_->dither_mode = JDITHER_NONE;
// Not applicable to 'raw':
decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
// Only for buffered mode:
decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
// Blocky but fast:
decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
if (!jpeg_start_decompress(decompress_struct_)) {
// ERROR: Couldn't start JPEG decompressor";
return LIBYUV_FALSE;
}
return LIBYUV_TRUE;
}
LIBYUV_BOOL MJpegDecoder::FinishDecode() {
// jpeglib considers it an error if we finish without decoding the whole
// image, so we call "abort" rather than "finish".
jpeg_abort_decompress(decompress_struct_);
return LIBYUV_TRUE;
}
void MJpegDecoder::SetScanlinePointers(uint8** data) {
for (int i = 0; i < num_outbufs_; ++i) {
uint8* data_i = data[i];
for (int j = 0; j < scanlines_sizes_[i]; ++j) {
scanlines_[i][j] = data_i;
data_i += GetComponentStride(i);
}
}
}
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
jpeg_read_raw_data(decompress_struct_,
scanlines_,
GetImageScanlinesPerImcuRow());
}
// Helper function that recognizes the JPEG sub-sampling type.
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
int* subsample_x, int* subsample_y, int number_of_components) {
if (number_of_components == 3) { // Color images.
if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 2 &&
subsample_x[2] == 2 && subsample_y[2] == 2) {
return kJpegYuv420;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 2 && subsample_y[1] == 1 &&
subsample_x[2] == 2 && subsample_y[2] == 1) {
return kJpegYuv422;
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
subsample_x[1] == 1 && subsample_y[1] == 1 &&
subsample_x[2] == 1 && subsample_y[2] == 1) {
return kJpegYuv444;
}
} else if (number_of_components == 1) { // Grey-scale images.
if (subsample_x[0] == 1 && subsample_y[0] == 1) {
return kJpegYuv400;
}
}
return kJpegUnknown;
}
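// Illustrative example (not part of the library): for a typical YUV 4:2:0
// frame the per-component sub-sample factors are
//   int sx[3] = { 1, 2, 2 };
//   int sy[3] = { 1, 2, 2 };
// and the helper above maps them to kJpegYuv420; {1,2,2} in x with {1,1,1} in
// y maps to kJpegYuv422, and all ones to kJpegYuv444.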
} // namespace libyuv
#endif // HAVE_JPEG


@@ -0,0 +1,71 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/mjpeg_decoder.h"
#include <string.h> // For memchr.
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
if (sample_size >= 2) {
const uint8* end = sample + sample_size - 1;
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
}
}
  // ERROR: Invalid jpeg, end code (EOI) not found. Size: sample_size
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
}
if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
}
// Step over SOI marker and scan for EOI.
return ScanEOI(sample + 2, sample_size - 2);
}
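// Illustrative usage (not part of the library): validate before decoding.
//   if (ValidateJpeg(sample, sample_size)) {
//     MJpegDecoder decoder;
//     if (decoder.LoadFrame(sample, sample_size)) {
//       // ... DecodeToBuffers() or DecodeToCallback() ...
//       decoder.UnloadFrame();
//     }
//   }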
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large.


@@ -0,0 +1,491 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
}
#endif
#if defined(HAS_TRANSPOSEWX8_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
}
}
#endif
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_Fast_SSSE3;
}
}
#endif
#if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_DSPR2;
} else {
TransposeWx8 = TransposeWx8_DSPR2;
}
}
#endif
// Work across the source in 8x8 tiles
while (i >= 8) {
TransposeWx8(src, src_stride, dst, dst_stride, width);
src += 8 * src_stride; // Go down 8 rows.
dst += 8; // Move over 8 columns.
i -= 8;
}
if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
}
LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);
int half_height = (height + 1) >> 1;
int y;
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
// TODO(fbarchard): Make MirrorRow on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_DSPR2;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
MirrorRow(src, row, width); // Mirror first row into a buffer
src += src_stride;
MirrorRow(src_bot, dst, width); // Mirror last row into first row
dst += dst_stride;
CopyRow(row, dst_bot, width); // Copy first mirrored row into last
src_bot -= src_stride;
dst_bot -= dst_stride;
}
free_aligned_buffer_64(row);
}
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i = height;
void (*TransposeUVWx8)(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_DSPR2;
}
#endif
// Work through the source in 8x8 tiles.
while (i >= 8) {
TransposeUVWx8(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width);
src += 8 * src_stride; // Go down 8 rows.
dst_a += 8; // Move over 8 columns.
dst_b += 8; // Move over 8 columns.
i -= 8;
}
if (i > 0) {
TransposeUVWxH_C(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, i);
}
}
LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
src += src_stride * (height - 1);
src_stride = -src_stride;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
}
LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
dst_a += dst_stride_a * (width - 1);
dst_b += dst_stride_b * (width - 1);
dst_stride_a = -dst_stride_a;
dst_stride_b = -dst_stride_b;
TransposeUV(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, height);
}
// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorUVRow = MirrorUVRow_NEON;
}
#endif
#if defined(HAS_MIRRORUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorUVRow = MirrorUVRow_SSSE3;
}
#endif
#if defined(HAS_MIRRORUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorUVRow = MirrorUVRow_DSPR2;
}
#endif
dst_a += dst_stride_a * (height - 1);
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
MirrorUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
}
}
LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height,
enum RotationMode mode) {
if (!src || width <= 0 || height == 0 || !dst) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src = src + (height - 1) * src_stride;
src_stride = -src_stride;
}
switch (mode) {
case kRotate0:
// copy frame
CopyPlane(src, src_stride,
dst, dst_stride,
width, height);
return 0;
case kRotate90:
RotatePlane90(src, src_stride,
dst, dst_stride,
width, height);
return 0;
case kRotate270:
RotatePlane270(src, src_stride,
dst, dst_stride,
width, height);
return 0;
case kRotate180:
RotatePlane180(src, src_stride,
dst, dst_stride,
width, height);
return 0;
default:
break;
}
return -1;
}
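// Illustrative usage (not part of the library): rotating a single 8-bit plane
// by 90 degrees swaps the destination dimensions, so dst is height x width and
// its stride is at least the source height.
//   RotatePlane(src, width, dst, height, width, height, kRotate90);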
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
switch (mode) {
case kRotate0:
// copy frame
return I420Copy(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane90(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane90(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane270(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane270(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotatePlane180(src_u, src_stride_u,
dst_u, dst_stride_u,
halfwidth, halfheight);
RotatePlane180(src_v, src_stride_v,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
default:
break;
}
return -1;
}
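// Illustrative usage (not part of the library): rotating a tightly packed I420
// frame by 90 degrees; destination strides follow the rotated (height x width)
// layout, with chroma strides halved and rounded up.
//   I420Rotate(src_y, width, src_u, (width + 1) / 2, src_v, (width + 1) / 2,
//              dst_y, height, dst_u, (height + 1) / 2, dst_v, (height + 1) / 2,
//              width, height, kRotate90);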
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || width <= 0 || height == 0 ||
!dst_y || !dst_u || !dst_v) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_uv = src_uv + (halfheight - 1) * src_stride_uv;
src_stride_y = -src_stride_y;
src_stride_uv = -src_stride_uv;
}
switch (mode) {
case kRotate0:
// copy frame
return NV12ToI420(src_y, src_stride_y,
src_uv, src_stride_uv,
dst_y, dst_stride_y,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV90(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV270(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y,
dst_y, dst_stride_y,
width, height);
RotateUV180(src_uv, src_stride_uv,
dst_u, dst_stride_u,
dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
default:
break;
}
return -1;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,80 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
}
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#endif
#undef TANY
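// Illustrative expansion (not part of the library): TANY(TransposeWx8_Any_NEON,
// TransposeWx8_NEON, 7) generates a wrapper that runs the SIMD kernel on the
// multiple-of-8 part of the width and TransposeWx8_C on the 0..7 leftover
// columns, roughly:
//   int r = width & 7;   // leftover columns
//   int n = width - r;   // SIMD-sized portion
//   if (n > 0) TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
//   TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);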
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst_a, int dst_stride_a, \
uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, \
dst_a + n * dst_stride_a, dst_stride_a, \
dst_b + n * dst_stride_b, dst_stride_b, r); \
}
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
#undef TUVANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,205 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, uint8* dst_ptr, int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
for (i = 0; i < width; ++i) { // column of source to row of dest.
ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
dst += dst_stride;
src += 4;
}
}
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
  // Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
  // Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);
int half_height = (height + 1) >> 1;
int y;
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
src += src_stride;
dst += dst_stride;
src_bot -= src_stride;
dst_bot -= dst_stride;
}
free_aligned_buffer_64(row);
}
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, int width, int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
switch (mode) {
case kRotate0:
// copy frame
return ARGBCopy(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
case kRotate90:
ARGBRotate90(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
return 0;
case kRotate270:
ARGBRotate270(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
return 0;
case kRotate180:
ARGBRotate180(src_argb, src_stride_argb,
dst_argb, dst_stride_argb,
width, height);
return 0;
default:
break;
}
return -1;
}
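// Illustrative usage (not part of the library): ARGB strides are in bytes, so
// a tightly packed source uses width * 4 and a 90-degree destination height * 4.
//   ARGBRotate(src_argb, width * 4, dst_argb, height * 4, width, height,
//              kRotate90);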
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,92 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
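// Illustrative example (not part of the library): TransposeWxH_C writes
// dst[x * dst_stride + y] = src[y * src_stride + x], so a 3-wide, 2-high source
//   a b c
//   d e f
// becomes the 2-wide, 3-high destination
//   a d
//   b e
//   c f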
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,368 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,484 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"sw $s0, 0(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"sw $s1, 4(%[dst]) \n"
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"swr $s0, 0(%[dst]) \n"
"swl $s0, 3(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"srl $AT, %[width], 0x2 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"
"sw $s4, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $s6, 0($s0) \n"
"sw $t8, 4($s0) \n"
"sw $s5, 0($s1) \n"
"sw $t1, 4($s1) \n"
"sw $s7, 0($s2) \n"
"sw $t9, 4($s2) \n"
"addiu $AT, -1 \n"
"addiu %[src], 4 \n"
"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"
// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |
"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"
// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |
"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"
// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |
"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"
// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |
"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"
// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |
"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"
"swr $s4, 0(%[dst]) \n"
"swl $s4, 3(%[dst]) \n"
"swr $t0, 4(%[dst]) \n"
"swl $t0, 7(%[dst]) \n"
"swr $s6, 0($s0) \n"
"swl $s6, 3($s0) \n"
"swr $t8, 4($s0) \n"
"swl $t8, 7($s0) \n"
"swr $s5, 0($s1) \n"
"swl $s5, 3($s1) \n"
"swr $t1, 4($s1) \n"
"swl $t1, 7($s1) \n"
"swr $s7, 0($s2) \n"
"swl $s7, 3($s2) \n"
"swr $t9, 4($s2) \n"
"swl $t9, 7($s2) \n"
"addiu $AT, -1 \n"
"addiu %[src], 4 \n"
"bnez $AT, 11b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
);
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"
// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
"andi $t8, %[dst_stride_a], 0x3 \n"
"andi $s5, %[dst_stride_b], 0x3 \n"
"or $t8, $t8, $s5 \n"
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"sw $s3, 4(%[dst_a]) \n"
"sw $s4, 4(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 1b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"b 2f \n"
" nop \n"
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"swr $s3, 4(%[dst_a]) \n"
"swl $s3, 7(%[dst_a]) \n"
"swr $s4, 4(%[dst_b]) \n"
"swl $s4, 7(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 11b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,533 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
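// e.g. width 20: the counter starts at 12, two 8x8 passes take it to -4,
// and adding 8 back leaves 4 residual columns for the 4x8 path below.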
"sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
"subs %5, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %5, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %5, #2 \n"
"blt 3f \n"
"cmp %5, #4 \n"
"blt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.32 {d3[1]}, [%0] \n"
"mov %0, %3 \n"
MEMACCESS(6)
"vld1.8 {q3}, [%6] \n"
"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"vst1.32 {d4[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d4[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d5[1]}, [%0] \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d0[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d0[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d1[1]}, [%0] \n"
"add %1, #4 \n" // src += 4
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
"subs %5, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %5, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.16 {d0[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d0[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld1.16 {d1[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0] \n"
"add %1, #2 \n" // src += 2
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
"subs %5, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld1.8 {d0[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld1.8 {d0[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
"+r"(dst_stride), // %4
"+r"(width) // %5
: "r"(&kVTbl4x4Transpose) // %6
: "memory", "cc", "q0", "q1", "q2", "q3"
);
}
static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2
"add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %7, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %7, #8 \n"
"beq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %7, #2 \n"
"blt 3f \n"
"cmp %7, #4 \n"
"blt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"vld1.64 {d0}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d1}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d2}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d3}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d4}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d5}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d6}, [%0], %2 \n"
MEMACCESS(0)
"vld1.64 {d7}, [%0] \n"
MEMACCESS(8)
"vld1.8 {q15}, [%8] \n"
"vtrn.8 q0, q1 \n"
"vtrn.8 q2, q3 \n"
"vtbl.8 d16, {d0, d1}, d30 \n"
"vtbl.8 d17, {d0, d1}, d31 \n"
"vtbl.8 d18, {d2, d3}, d30 \n"
"vtbl.8 d19, {d2, d3}, d31 \n"
"vtbl.8 d20, {d4, d5}, d30 \n"
"vtbl.8 d21, {d4, d5}, d31 \n"
"vtbl.8 d22, {d6, d7}, d30 \n"
"vtbl.8 d23, {d6, d7}, d31 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.32 {d16[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d16[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d17[1]}, [%0], %4 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"vst1.32 {d20[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d20[1]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[0]}, [%0], %4 \n"
MEMACCESS(0)
"vst1.32 {d21[1]}, [%0] \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.32 {d18[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d18[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d19[1]}, [%0], %6 \n"
"add %0, %5, #4 \n"
MEMACCESS(0)
"vst1.32 {d22[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d22[1]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[0]}, [%0], %6 \n"
MEMACCESS(0)
"vst1.32 {d23[1]}, [%0] \n"
"add %1, #4*2 \n" // src += 4 * 2
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %7, #4 \n" // w -= 4
"beq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %7, #2 \n"
"blt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
MEMACCESS(0)
"vld2.16 {d1[3], d3[3]}, [%0] \n"
"vtrn.8 d0, d1 \n"
"vtrn.8 d2, d3 \n"
"mov %0, %3 \n"
MEMACCESS(0)
"vst1.64 {d0}, [%0], %4 \n"
MEMACCESS(0)
"vst1.64 {d2}, [%0] \n"
"mov %0, %5 \n"
MEMACCESS(0)
"vst1.64 {d1}, [%0], %6 \n"
MEMACCESS(0)
"vst1.64 {d3}, [%0] \n"
"add %1, #2*2 \n" // src += 2 * 2
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %7, #2 \n" // w -= 2
"beq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
MEMACCESS(1)
"vld2.8 {d0[7], d1[7]}, [%1] \n"
MEMACCESS(3)
"vst1.64 {d0}, [%3] \n"
MEMACCESS(5)
"vst1.64 {d1}, [%5] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3
"+r"(dst_stride_a), // %4
"+r"(dst_b), // %5
"+r"(dst_stride_b), // %6
"+r"(width) // %7
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,543 @@
/*
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %3, %3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"trn1 v17.8b, v0.8b, v1.8b \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"trn1 v19.8b, v2.8b, v3.8b \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"trn1 v21.8b, v4.8b, v5.8b \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"trn1 v23.8b, v6.8b, v7.8b \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"trn1 v1.4h, v17.4h, v19.4h \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"trn1 v0.4h, v16.4h, v18.4h \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"trn1 v5.4h, v21.4h, v23.4h \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
"trn1 v17.2s, v1.2s, v5.2s \n"
"trn2 v20.2s, v0.2s, v4.2s \n"
"trn1 v16.2s, v0.2s, v4.2s \n"
"trn2 v23.2s, v3.2s, v7.2s \n"
"trn1 v19.2s, v3.2s, v7.2s \n"
"trn2 v22.2s, v2.2s, v6.2s \n"
"trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %3, %3, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %3, %3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %3, #2 \n"
"b.lt 3f \n"
"cmp %3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %3, %3, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %3, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %3, %3, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width64) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}
static uint8 kVTbl4x4TransposeDi[32] =
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %4, %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v20.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v20.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %4, %4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, %4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2 \n"
"b.lt 3f \n"
"cmp %4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
MEMACCESS(8)
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v18.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v17.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"st1 {v19.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %4, %4, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %4, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v4.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v5.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %4, %4, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.d}[0], [%2] \n"
MEMACCESS(3)
"st1 {v1.d}[0], [%3] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width64) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v30", "v31"
);
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,247 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
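// Each pass of the loop below transposes one 8x8 byte block: the punpcklbw,
// punpcklwd and punpckldq rounds interleave bytes, 16-bit words and dwords,
// leaving two transposed output rows per xmm register (written out as the
// low and high qwords via movq/palignr).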
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
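// The esp adjustment above reserves a 16-byte aligned scratch slot, used
// below to spill xmm5 and xmm6, and keeps the original esp at [esp + 16];
// w is read through the copy of the original esp still held in ecx.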
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif


@@ -0,0 +1,818 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include <string.h> // For memset.
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Subsampled source needs to be increased by 1 if not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
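// SS() rounds up, e.g. SS(5, 1) == 3 and SS(5, 2) == 2, so the temp copies
// below always include the extra subsampled sample for odd widths.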
// Any 4 planes to 1 with yuvconstants
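// Each ANY* wrapper runs the SIMD row function on the largest multiple of
// (MASK + 1) pixels, copies the leftover pixels into aligned temp buffers,
// runs the same function once more on a full MASK + 1 block, and copies
// back only the valid remainder of the output.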
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
const uint8* a_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_AVX2
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
#undef ANY41C
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
#undef ANY31
// Note that odd-width replication also applies to 444 formats because the
// ARM implementation subsamples 444 to 422 internally.
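// The replication below copies the last U and V sample one slot further so
// the row function, which always processes MASK + 1 pixels, reads a valid
// chroma value for the padded odd pixel.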
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I411TOARGBROW_SSSE3
ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#undef ANY31C
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
#endif
#ifdef HAS_MERGEUVROW_NEON
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBADDROW_SSE2
ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBSUBTRACTROW_SSE2
ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBMULTIPLYROW_AVX2
ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_AVX2
ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_AVX2
ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBMULTIPLYROW_NEON
ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_NEON
ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_NEON
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELROW_NEON
ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
#ifdef HAS_SOBELTOPLANEROW_NEON
ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELXYROW_NEON
ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#endif
#undef ANY21
// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_AVX2
ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV21TOARGBROW_NEON
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
#undef ANY21C
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_COPYROW_AVX
ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
#endif
#ifdef HAS_COPYROW_SSE2
ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
#endif
#ifdef HAS_COPYROW_NEON
ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif
#if defined(HAS_J400TOARGBROW_SSE2)
ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#endif
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
#if defined(HAS_I400TOARGBROW_SSE2)
ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
#endif
#if defined(HAS_I400TOARGBROW_AVX2)
ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif
#ifdef HAS_YUY2TOYROW_AVX2
ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_AVX2
ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
#undef ANY11
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
#endif
#if defined(HAS_YUY2TOARGBROW_NEON)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
#undef ANY11C
// Any 1 to 1 blended.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
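// The blended wrappers stage both the source tail (at temp) and the current
// destination tail (at temp + 128) before calling the row function, because
// ARGBCopyAlphaRow and ARGBCopyYToAlphaRow replace only the alpha channel and
// must preserve the existing BGR bytes of the destination; the blended block
// is then copied back over the last r destination pixels.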
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#endif
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#undef ANY11B
// Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
T shuffler, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
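// The extra T parameter is forwarded unchanged, so one macro covers both the
// 32 bit dither parameter of ARGBToRGB565DitherRow and the shuffle table
// pointer of ARGBShuffleRow.  64 bytes per side of temp is enough because the
// final call always processes MASK + 1 pixels, at most 16 x 4 = 64 bytes.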
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
const uint32, 4, 2, 3)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
const uint32, 4, 2, 7)
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
const uint32, 4, 2, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSE2
ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
#endif
#ifdef HAS_ARGBSHUFFLEROW_NEON
ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif
#undef ANY11P
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
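// InterpolateRow blends two source rows, so the wrapper stages the tail of the
// first row at temp and the tail of the second row (src_ptr + src_stride_ptr)
// at temp + 64, then calls the row function with a stride of 64 so it reads the
// staged pair; the blended output lands at temp + 128 and r bytes are copied out.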
#ifdef HAS_INTERPOLATEROW_AVX2
ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_DSPR2
ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#endif
#undef ANY11T
// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
} \
memcpy(temp, src_ptr, r * BPP); \
ANY_SIMD(temp, temp + 64, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
}
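// Mirroring reverses pixel order, so the SIMD call skips the first r source
// pixels and writes the mirror of the remaining n pixels to the start of dst.
// The leading r pixels are mirrored through temp; their output ends up in the
// last r slots of the 64 byte block, which is why the copy-back starts at
// temp + 64 + (MASK + 1 - r) * BPP.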
#ifdef HAS_MIRRORROW_AVX2
ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
#endif
#ifdef HAS_MIRRORROW_SSSE3
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_SSE2
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
#endif
#undef ANY11M
// Any 1 plane. (memset)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8 temp[64]); \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, v32, n); \
} \
ANY_SIMD(temp, v32, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp, r * BPP); \
}
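// SetRow has no source to stage, so the wrapper simply fills n pixels of dst
// directly, fills a full MASK + 1 pixels into temp, and copies out only the
// remaining r pixels.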
#ifdef HAS_SETROW_X86
ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
SIMD_ALIGNED(uint8 temp[128 * 3]); \
memset(temp, 0, 128); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
/* repeat last 4 bytes for 422 subsampler */ \
if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
/* repeat last 4 - 12 bytes for 411 subsampler */ \
if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
} \
if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
} \
if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
}
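// SS() rounds up before shifting, so a trailing odd pixel still yields one
// U/V output.  The replication memcpys duplicate the last staged group so the
// 422 (DUVSHIFT 1) and 411 (DUVSHIFT 2) subsamplers, which combine 2 or 4
// source pixels per U/V sample, never read an incomplete group when width is
// not a multiple of the subsample factor.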
#ifdef HAS_SPLITUVROW_SSE2
ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
#endif
#ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif
#undef ANY12
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// A 128 byte temp row allows for 32 AVX ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \
uint8* dst_u, uint8* dst_v, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
}
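// Same layout as ANY12, but these row functions average two source rows, so
// the tails of both rows are staged at temp and temp + 128 and the final call
// passes 128 as the source stride.  The odd-width replication keeps the 2x2
// box sampler from reading past the staged pixels on either row.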
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
#endif
#ifdef HAS_YUY2TOUVROW_AVX2
ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
#endif
#ifdef HAS_YUY2TOUVROW_SSE2
ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
#undef ANY12S
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,782 @@
/*
* Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
__asm__ __volatile__ (
".set noreorder \n"
".set noat \n"
"slti $at, %[count], 8 \n"
"bne $at ,$zero, $last8 \n"
"xor $t8, %[src], %[dst] \n"
"andi $t8, $t8, 0x3 \n"
"bne $t8, $zero, unaligned \n"
"negu $a3, %[dst] \n"
// make dst/src aligned
"andi $a3, $a3, 0x3 \n"
"beq $a3, $zero, $chk16w \n"
// word-aligned; now count is the remaining byte count
"subu %[count], %[count], $a3 \n"
"lwr $t8, 0(%[src]) \n"
"addu %[src], %[src], $a3 \n"
"swr $t8, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// Now the dst/src are mutually word-aligned with word-aligned addresses
"$chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, chk8w \n"
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the remainder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n"
// t0 is the "past the end" address
// When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
// the "t0-32" address
// This means: for x=128 the last "safe" a1 address is "t0-160"
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
// we will use "pref 30,128(a1)", so "t0-160" is the limit
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$loop16w: \n"
"pref 0, 96(%[src]) \n"
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
"pref 30, 96(%[dst]) \n" // continue
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
"pref 30, 128(%[dst]) \n" // set dest, addr 128
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
"lw $t4, 48(%[src]) \n"
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst], %[dst], 64 \n" // adding 64 to dest
"sgtu $v1, %[dst], $t9 \n"
"bne %[dst], $a3, $loop16w \n"
" addiu %[src], %[src], 64 \n" // adding 64 to src
"move %[count], $t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count past 32-bytes
"beq %[count], $t8, chk1w \n"
// count=t8,no 32-byte chunk
" nop \n"
"lw $t0, 0(%[src]) \n"
"lw $t1, 4(%[src]) \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the remainder past 1w chunks
"beq %[count], $t8, $last8 \n"
" subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$wordCopy_loop: \n"
"lw $t3, 0(%[src]) \n"
// the first t3 may be equal t0 ... optimize?
"addiu %[src], %[src],4 \n"
"addiu %[dst], %[dst],4 \n"
"bne %[dst], $a3,$wordCopy_loop \n"
" sw $t3, -4(%[dst]) \n"
// For the last (<8) bytes
"$last8: \n"
"blez %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 -last dst address
"$last8loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst], $a3, $last8loop \n"
" sb $v1, -1(%[dst]) \n"
"leave: \n"
" j $ra \n"
" nop \n"
//
// UNALIGNED case
//
"unaligned: \n"
// got here with a3="negu a1"
"andi $a3, $a3, 0x3 \n" // a1 is word aligned?
"beqz $a3, $ua_chk16w \n"
" subu %[count], %[count], $a3 \n"
// bytes left after initial a3 bytes
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
"swr $v1, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// below the dst will be word aligned (NOTE1)
"$ua_chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, ua_chk8w \n"
// if a2==t8, no 64-byte chunks
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the remainder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// safe, as we have at least 64 bytes ahead
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $ua_loop16w \n"
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$ua_loop16w: \n"
"pref 0, 96(%[src]) \n"
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
"pref 30, 96(%[dst]) \n"
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lwr $t0, 32(%[src]) \n"
"lwl $t0, 35(%[src]) \n"
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
"pref 30, 128(%[dst]) \n"
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
"lwr $t2, 40(%[src]) \n"
"lwl $t2, 43(%[src]) \n"
"lwr $t3, 44(%[src]) \n"
"lwl $t3, 47(%[src]) \n"
"lwr $t4, 48(%[src]) \n"
"lwl $t4, 51(%[src]) \n"
"lwr $t5, 52(%[src]) \n"
"lwl $t5, 55(%[src]) \n"
"lwr $t6, 56(%[src]) \n"
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst],%[dst],64 \n" // adding 64 to dest
"sgtu $v1,%[dst],$t9 \n"
"bne %[dst],$a3,$ua_loop16w \n"
" addiu %[src],%[src],64 \n" // adding 64 to src
"move %[count],$t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"ua_chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// t8 is the remainder count
"beq %[count], $t8, $ua_chk1w \n"
// when count==t8, no 32-byte chunk
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"lwl $t1, 7(%[src]) \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"$ua_chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the remainder past 1w chunks
"beq %[count], $t8, ua_smallCopy \n"
"subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$ua_wordCopy_loop: \n"
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addiu %[src], %[src], 4 \n"
"addiu %[dst], %[dst], 4 \n"
// note: dst=a1 is word aligned here, see NOTE1
"bne %[dst], $a3, $ua_wordCopy_loop \n"
" sw $v1,-4(%[dst]) \n"
// Now less than 4 bytes (value in count) left to copy
"ua_smallCopy: \n"
"beqz %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 = last dst address
"$ua_smallCopy_loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst],$a3,$ua_smallCopy_loop \n"
" sb $v1, -1(%[dst]) \n"
"j $ra \n"
" nop \n"
".set at \n"
".set reorder \n"
: [dst] "+r" (dst), [src] "+r" (src)
: [count] "r" (count)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
"t8", "t9", "a3", "v1", "at"
);
}
#endif // HAS_COPYROW_MIPS
// DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
"addiu %[src_uv], %[src_uv], 32 \n"
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"sw $t9, 0(%[dst_v]) \n"
"sw $t0, 0(%[dst_u]) \n"
"sw $t1, 4(%[dst_v]) \n"
"sw $t2, 4(%[dst_u]) \n"
"sw $t3, 8(%[dst_v]) \n"
"sw $t5, 8(%[dst_u]) \n"
"sw $t6, 12(%[dst_v]) \n"
"sw $t7, 12(%[dst_u]) \n"
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"
"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"
"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
);
}
// Convert 4 Y and 2 U/V samples (I422) and arrange the RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
"1: \n"
YUVTORGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
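// Each output byte is (src_ptr[i] * (256 - source_y_fraction) +
// src_ptr1[i] * source_y_fraction) >> 8, computed 8 bytes per loop iteration
// with the paired DSP multiplies below.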
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"
"lw $t4, 4(%[src_ptr]) \n"
"lw $t5, 4(%[src_ptr1]) \n"
"muleu_s.ph.qbl $t6, $t2, $t0 \n"
"muleu_s.ph.qbr $t7, $t2, $t0 \n"
"muleu_s.ph.qbl $t8, $t3, $t1 \n"
"muleu_s.ph.qbr $t9, $t3, $t1 \n"
"muleu_s.ph.qbl $t2, $t4, $t0 \n"
"muleu_s.ph.qbr $t3, $t4, $t0 \n"
"muleu_s.ph.qbl $t4, $t5, $t1 \n"
"muleu_s.ph.qbr $t5, $t5, $t1 \n"
"addq.ph $t6, $t6, $t8 \n"
"addq.ph $t7, $t7, $t9 \n"
"addq.ph $t2, $t2, $t4 \n"
"addq.ph $t3, $t3, $t5 \n"
"shra.ph $t6, $t6, 8 \n"
"shra.ph $t7, $t7, 8 \n"
"shra.ph $t2, $t2, 8 \n"
"shra.ph $t3, $t3, 8 \n"
"precr.qb.ph $t6, $t6, $t7 \n"
"precr.qb.ph $t2, $t2, $t3 \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[src_ptr1], %[src_ptr1], 8 \n"
"addiu %[dst_width], %[dst_width], -8 \n"
"sw $t6, 0(%[dst_ptr]) \n"
"sw $t2, 4(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[dst_ptr], %[dst_ptr], 8 \n"
".set pop \n"
: [dst_ptr] "+r" (dst_ptr),
[src_ptr1] "+r" (src_ptr1),
[src_ptr] "+r" (src_ptr),
[dst_width] "+r" (dst_width)
: [source_y_fraction] "r" (source_y_fraction),
[y0_fraction] "r" (y0_fraction),
[src_stride] "r" (src_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
#endif // __mips_dsp_rev >= 2
#endif // defined(__mips__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

1672
libs/libyuv/source/scale.cc Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,221 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
int dst_width, int x, int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, \
dst_width & MASK, x + n * dx, dx); \
}
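// The column scalers carry 16.16 fixed point state, so the C tail must resume
// at the exact source position the SIMD loop reached: x advances by n * dx for
// the n pixels already produced.  e.g. ScaleFilterCols_Any_NEON with dst_width
// 100 runs 96 pixels in NEON and the last 4 in C starting at x + 96 * dx.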
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C, 4, 3)
#endif
#undef CANY
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
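// The remainder uses an unsigned modulo rather than a mask because MASK + 1 is
// not always a power of two (24, 12 and 6 appear below for the 3/4 and 3/8
// scalers).  FACTOR is passed as the textual expression 4 / 3 or 8 / 3, so
// (n * FACTOR) expands to (n * 4 / 3), which evaluates left to right as
// (n * 4) / 3 and keeps the source offset exact.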
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
4, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C, 2, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C, 2, 4, 7)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
src_stepx, dst_ptr + n * BPP, r); \
}
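// src_stepx is a whole-pixel column step (the callers pass dx >> 16), so the
// C tail starts n * src_stepx source pixels into the row, mirroring the
// SDANY offset math above.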
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
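// ScaleAddRow accumulates bytes into uint16 column sums for the box filter;
// both pointers advance by n elements and no temp staging is needed, the C
// version just finishes the last src_width & MASK columns.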
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,859 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyARGB
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
// ScaleARGB ARGB, 1/2
// This is an optimized version for scaling down an ARGB image to 1/2 of
// its original size.
static void ScaleARGBDown2(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
filtering == kFilterNone ? ScaleARGBRowDown2_C :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
ScaleARGBRowDown2Box_C);
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
} else {
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
}
#endif
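// The _Any_ variants above tolerate any dst_width by finishing the remainder
// in C; when dst_width is already a multiple of the SIMD block size the
// unwrapped row function is selected instead.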
if (filtering == kFilterLinear) {
src_stride = 0;
}
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
}
}
// ScaleARGB ARGB, 1/4
// This is an optimized version for scaling down an ARGB image to 1/4 of
// its original size.
static void ScaleARGBDown4Box(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
row + kRowSize, dst_width * 2);
ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
}
free_aligned_buffer_64(row);
}
// ScaleARGB ARGB Even
// This is an optimized version for scaling down an ARGB image to an even
// multiple of its original size.
static void ScaleARGBDownEven(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
int src_step, uint8* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
}
#endif
if (filtering == kFilterLinear) {
src_stride = 0;
}
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
src_argb += row_stride;
dst_argb += dst_stride;
}
}
// Scale ARGB down with bilinear interpolation.
static void ScaleARGBBilinearDown(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
int64 xlast = x + (int64)(dst_width - 1) * dx;
int64 xl = (dx >= 0) ? x : xlast;
int64 xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
if (xr > src_width) {
xr = src_width;
}
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
align_buffer_64(row, clip_src_width * 4);
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(row, src, src_stride, clip_src_width, yf);
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
}
dst_argb += dst_stride;
y += dy;
if (y > max_y) {
y = max_y;
}
}
free_aligned_buffer_64(row);
}
}
// Scale ARGB up with bilinear interpolation.
static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
if (src_width >= 32768) {
ScaleARGBFilterCols = filtering ?
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
}
if (y > max_y) {
y = max_y;
}
{
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
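// rowptr/rowstride form a two row ring buffer: when the source row advances,
// the stale row is overwritten in place and rowstride flips sign, so rowptr
// always names the upper row and rowptr + rowstride the lower row for
// InterpolateRow.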
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
if (src_height > 1) {
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
src += src_stride;
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
src = src_argb + yi * src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src += src_stride;
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride;
y += dy;
}
free_aligned_buffer_64(row);
}
}
#ifdef YUVSCALEUP
// Scale YUV to ARGB up with bilinear interpolation.
static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride_y,
int src_stride_u,
int src_stride_v,
int dst_stride_argb,
const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int x, int dx, int y, int dy,
enum FilterMode filtering) {
int j;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
}
#endif
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
ScaleARGBFilterCols = filtering ?
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
}
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
}
const int max_y = (src_height - 1) << 16;
if (y > max_y) {
y = max_y;
}
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
int yi = y >> 16;
int uv_yi = yi >> kYShift;
const uint8* src_row_y = src_y + yi * src_stride_y;
const uint8* src_row_u = src_u + uv_yi * src_stride_u;
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion.
align_buffer_64(argb_row, src_width * 4);
uint8* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
// TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
if (src_height > 1) {
src_row_y += src_stride_y;
if (yi & 1) {
src_row_u += src_stride_u;
src_row_v += src_stride_v;
}
}
ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
if (src_height > 2) {
src_row_y += src_stride_y;
if (!(yi & 1)) {
src_row_u += src_stride_u;
src_row_v += src_stride_v;
}
}
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
if (y > max_y) {
y = max_y;
yi = y >> 16;
uv_yi = yi >> kYShift;
src_row_y = src_y + yi * src_stride_y;
src_row_u = src_u + uv_yi * src_stride_u;
src_row_v = src_v + uv_yi * src_stride_v;
}
if (yi != lasty) {
// TODO(fbarchard): Convert the clipped region of row.
I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
src_row_y += src_stride_y;
if (yi & 1) {
src_row_u += src_stride_u;
src_row_v += src_stride_v;
}
}
}
if (filtering == kFilterLinear) {
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
} else {
int yf = (y >> 8) & 255;
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
}
dst_argb += dst_stride_argb;
y += dy;
}
free_aligned_buffer_64(row);
  free_aligned_buffer_64(argb_row);
}
#endif
// Scale ARGB to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx are the integer part of the source position and
// the lower 16 bits are the fractional part.
static void ScaleARGBSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int dx, int y, int dy) {
int j;
void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
}
for (j = 0; j < dst_height; ++j) {
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
dst_width, x, dx);
dst_argb += dst_stride;
y += dy;
}
}
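// Illustrative sketch (not part of libyuv): a plain-C nearest-neighbor column
// scaler equivalent in spirit to ScaleARGBCols_C, showing how the 16.16
// fixed-point x/dx pair selects source pixels: the upper 16 bits of x index
// the source row, and adding dx per destination pixel advances that index by
// the source-to-destination ratio.
#if 0  // Example only; never compiled.
static void ScaleARGBColsExample_C(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // Integer part picks the source ARGB pixel.
    x += dx;                // Fractional part accumulates in the low 16 bits.
  }
}
#endif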
// Scale an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
static void ScaleARGB(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
int dx = 0;
int dy = 0;
// ARGB does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
if (clip_x) {
int64 clipf = (int64)(clip_x) * dx;
x += (clipf & 0xffff);
src += (clipf >> 16) * 4;
dst += clip_x * 4;
}
if (clip_y) {
int64 clipf = (int64)(clip_y) * dy;
y += (clipf & 0xffff);
src += (clipf >> 16) * src_stride;
dst += clip_y * dst_stride;
}
// Special case for integer step values.
if (((dx | dy) & 0xffff) == 0) {
if (!dx || !dy) { // 1 pixel wide and/or tall.
filtering = kFilterNone;
} else {
      // Optimized even scale down, i.e. 2x, 4x, 6x, 8x, 10x.
if (!(dx & 0x10000) && !(dy & 0x10000)) {
if (dx == 0x20000) {
// Optimized 1/2 downsample.
ScaleARGBDown2(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
}
if (dx == 0x40000 && filtering == kFilterBox) {
// Optimized 1/4 box downsample.
ScaleARGBDown4Box(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy);
return;
}
ScaleARGBDownEven(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
}
      // Optimized odd scale down, i.e. 3x, 5x, 7x, 9x.
if ((dx & 0x10000) && (dy & 0x10000)) {
filtering = kFilterNone;
if (dx == 0x10000 && dy == 0x10000) {
// Straight copy.
ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
dst, dst_stride, clip_width, clip_height);
return;
}
}
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
    // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, y, dy, 4, filtering);
return;
}
if (filtering && dy < 65536) {
ScaleARGBBilinearUp(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
}
if (filtering) {
ScaleARGBBilinearDown(src_width, src_height,
clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
}
ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy);
}
LIBYUV_API
int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height, filtering);
return 0;
}
// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
int dst_width, int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
dst_argb, dst_stride_argb, dst_width, dst_height,
0, 0, dst_width, dst_height, filtering);
return 0;
}
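// Usage sketch (illustrative only; buffer names and sizes are hypothetical):
// downscale a 640x360 ARGB frame to 320x180 with bilinear filtering. Strides
// are in bytes, so a packed ARGB row is width * 4.
#if 0  // Example only; never compiled.
static int HalveFrameExample(const uint8* src_argb, uint8* dst_argb) {
  return ARGBScale(src_argb, 640 * 4, 640, 360,
                   dst_argb, 320 * 4, 320, 180,
                   kFilterBilinear);
}
#endif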
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,644 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t0, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
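// Reference sketch in plain C (illustrative, mirrors the assembly above and is
// not a drop-in replacement): decimate by keeping every second source byte,
// i.e. the even pixels, as noted in the TODO.
#if 0  // Example only; never compiled.
static void ScaleRowDown2Example_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst, int dst_width) {
  int x;
  (void)src_stride;  // Unused: pure decimation reads a single row.
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // Even source pixel.
  }
}
#endif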
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst), [t] "+r" (t)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
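// Reference sketch in plain C (illustrative only): the box variant averages
// each 2x2 block of source pixels with rounding, which is what the register
// shuffling plus shra_r.w (shift right with rounding) above computes.
#if 0  // Example only; never compiled.
static void ScaleRowDown2BoxExample_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // Rounded mean.
    s += 2;
    t += 2;
  }
}
#endif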
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n"
"beqz $t9, 2f \n"
" nop \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t5, 4(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"add $t4, $t4, $t5 \n"
"add $t6, $t6, $t7 \n"
"add $t4, $t4, $t6 \n"
"shra_r.w $t0, $t0, 4 \n"
"shra_r.w $t4, $t4, 4 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[s3], %[s3], 8 \n"
"addiu $t9, $t9, -1 \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 2 \n"
"beqz $t8, 2f \n"
" nop \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"shra_r.w $t0, $t0, 4 \n"
"sb $t0, 0(%[dst]) \n"
"2: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
"addiu %[dst_width], %[dst_width], -24 \n"
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
"sw $t1, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t3, 8(%[dst]) \n"
"sw $t5, 12(%[dst]) \n"
"sw $t9, 16(%[dst]) \n"
"sw $t7, 20(%[dst]) \n"
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
"andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t2, $t2, $t4 \n"
"addu.ph $t6, $t6, $t5 \n"
"sll $t5, $t0, 1 \n"
"add $t0, $t5, $t0 \n"
"shra_r.ph $t2, $t2, 2 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shll.ph $t4, $t2, 1 \n"
"addq.ph $t4, $t4, $t2 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.w $t0, $t0, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"shra_r.ph $t6, $t6, 2 \n"
"srl $t1, $t6, 16 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
"andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t4, $t4, $t3 \n"
"addu.ph $t6, $t6, $t5 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shra_r.ph $t4, $t4, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.ph $t6, $t6, 1 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"shra_r.w $t0, $t0, 1 \n"
"srl $t1, $t6, 16 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t6, $t6 \n" // |26|27|24|25|
"srl $t0, $t0, 8 \n" // |X|2|3|0|
"srl $t3, $t3, 16 \n" // |X|X|15|14|
"srl $t5, $t5, 16 \n" // |X|X|23|22|
"srl $t7, $t7, 16 \n" // |X|X|31|30|
"ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
"ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
"ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
"ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
"prepend $t2, $t3, 24 \n" // |X|15|14|11|
"ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
"ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu %[dst_width], %[dst_width], -12 \n"
"addiu $t8,%[dst_width], -12 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t4, 4(%[dst]) \n"
"sw $t6, 8(%[dst]) \n"
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
"packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
"raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
"precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
"srl $t4, $t4, 2 \n" // t4 / 4
"srl $t6, $t6, 16 \n" // |0|0|S3|T3|
"raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
"addu $t6, $t5, $t6 \n"
"mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
"raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
"addu $t0, $t0, $t2 \n"
"mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[t], %[t], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t4, -1(%[dst_ptr]) \n"
"sb $t6, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[t] "+r" (t),
[dst_width] "+r" (dst_width)
: [c] "r" (c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
);
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
const uint8* s2 = src_ptr + stride;
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
"lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
"lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
"raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
"packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
"sll $t8, $t5, 16 \n" // |R5|R4|0|0|
"raddu.w.qb $t8, $t8 \n" // R5+R4
"addu $t7, $t7, $t8 \n"
"srl $t8, $t5, 16 \n" // |0|0|R7|R6|
"raddu.w.qb $t8, $t8 \n" // R7 + R6
"addu $t6, $t6, $t8 \n"
"mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
"precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
"srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
"raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
"addu $t7, $t7, $t8 \n"
"mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t2, $t2 \n"
"raddu.w.qb $t4, $t4 \n"
"addu $t0, $t0, $t2 \n"
"addu $t0, $t0, $t4 \n"
"mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t7, $t7, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t6, -1(%[dst_ptr]) \n"
"sb $t7, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[s1] "+r" (s1),
[s2] "+r" (s2),
[dst_width] "+r" (dst_width)
: [c1] "r" (c1), [c2] "r" (c2)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
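// Note on the multiplier constants above (explanatory, not from the original
// source): 0x2AAA and 0x1C71 are 16-bit fixed-point reciprocals that replace
// integer division when box-averaging 6 or 9 source samples:
//   0x2AAA = 10922 ~= 65536 / 6, so (sum_of_6 * 0x2AAA) >> 16 ~= sum_of_6 / 6
//   0x1C71 = 7281  ~= 65536 / 9, so (sum_of_9 * 0x1C71) >> 16 ~= sum_of_9 / 9
// For example, a 6-sample sum of 600 yields (600 * 10922) >> 16 = 99, against
// an exact 100; the worst-case truncation error is small for 8-bit pixels.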
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -0,0 +1,64 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
struct FourCCAliasEntry {
uint32 alias;
uint32 canonical;
};
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},
{FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
{FOURCC_HDYC, FOURCC_UYVY},
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
{FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
{FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
{FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
{FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
LIBYUV_API
uint32 CanonicalFourCC(uint32 fourcc) {
int i;
for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
if (kFourCCAliases[i].alias == fourcc) {
return kFourCCAliases[i].canonical;
}
}
// Not an alias, so return it as-is.
return fourcc;
}
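// Usage note (illustrative): callers typically canonicalize a capture fourcc
// before dispatching on it, e.g. CanonicalFourCC(FOURCC_YUYV) returns
// FOURCC_YUY2, while a non-alias such as FOURCC_I420 is returned unchanged.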
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif