mirror of
https://github.com/signalwire/freeswitch.git
synced 2025-08-13 01:26:58 +00:00
import libyuv at hash 38d37a5 from https://chromium.googlesource.com/libyuv/libyuv/
This commit is contained in:
340
libs/libyuv/source/compare.cc
Normal file
340
libs/libyuv/source/compare.cc
Normal file
@@ -0,0 +1,340 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/compare.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// hash seed of 5381 recommended.
|
||||
LIBYUV_API
|
||||
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
|
||||
const int kBlockSize = 1 << 15; // 32768;
|
||||
int remainder;
|
||||
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
|
||||
HashDjb2_C;
|
||||
#if defined(HAS_HASHDJB2_SSE41)
|
||||
if (TestCpuFlag(kCpuHasSSE41)) {
|
||||
HashDjb2_SSE = HashDjb2_SSE41;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HASHDJB2_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HashDjb2_SSE = HashDjb2_AVX2;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (count >= (uint64)(kBlockSize)) {
|
||||
seed = HashDjb2_SSE(src, kBlockSize, seed);
|
||||
src += kBlockSize;
|
||||
count -= kBlockSize;
|
||||
}
|
||||
remainder = (int)(count) & ~15;
|
||||
if (remainder) {
|
||||
seed = HashDjb2_SSE(src, remainder, seed);
|
||||
src += remainder;
|
||||
count -= remainder;
|
||||
}
|
||||
remainder = (int)(count) & 15;
|
||||
if (remainder) {
|
||||
seed = HashDjb2_C(src, remainder, seed);
|
||||
}
|
||||
return seed;
|
||||
}
|
||||
|
||||
static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
argb += 8;
|
||||
}
|
||||
if (width & 1) {
|
||||
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Scan an opaque argb image and return fourcc based on alpha offset.
|
||||
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
|
||||
LIBYUV_API
|
||||
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
|
||||
uint32 fourcc = 0;
|
||||
int h;
|
||||
|
||||
// Coalesce rows.
|
||||
if (stride_argb == width * 4) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
stride_argb = 0;
|
||||
}
|
||||
for (h = 0; h < height && fourcc == 0; ++h) {
|
||||
fourcc = ARGBDetectRow_C(argb, width);
|
||||
argb += stride_argb;
|
||||
}
|
||||
return fourcc;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Refactor into row function.
|
||||
LIBYUV_API
|
||||
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
|
||||
int count) {
|
||||
// SumSquareError returns values 0 to 65535 for each squared difference.
|
||||
// Up to 65536 of those can be summed and remain within a uint32.
|
||||
// After each block of 65536 pixels, accumulate into a uint64.
|
||||
const int kBlockSize = 65536;
|
||||
int remainder = count & (kBlockSize - 1) & ~31;
|
||||
uint64 sse = 0;
|
||||
int i;
|
||||
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
|
||||
SumSquareError_C;
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
// Note only used for multiples of 16 so count is not checked.
|
||||
SumSquareError = SumSquareError_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// Note only used for multiples of 32 so count is not checked.
|
||||
SumSquareError = SumSquareError_AVX2;
|
||||
}
|
||||
#endif
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+: sse)
|
||||
#endif
|
||||
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
|
||||
sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
|
||||
}
|
||||
src_a += count & ~(kBlockSize - 1);
|
||||
src_b += count & ~(kBlockSize - 1);
|
||||
if (remainder) {
|
||||
sse += SumSquareError(src_a, src_b, remainder);
|
||||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
remainder = count & 31;
|
||||
if (remainder) {
|
||||
sse += SumSquareError_C(src_a, src_b, remainder);
|
||||
}
|
||||
return sse;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b,
|
||||
int width, int height) {
|
||||
uint64 sse = 0;
|
||||
int h;
|
||||
// Coalesce rows.
|
||||
if (stride_a == width &&
|
||||
stride_b == width) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
stride_a = stride_b = 0;
|
||||
}
|
||||
for (h = 0; h < height; ++h) {
|
||||
sse += ComputeSumSquareError(src_a, src_b, width);
|
||||
src_a += stride_a;
|
||||
src_b += stride_b;
|
||||
}
|
||||
return sse;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
|
||||
double psnr;
|
||||
if (sse > 0) {
|
||||
double mse = (double)(count) / (double)(sse);
|
||||
psnr = 10.0 * log10(255.0 * 255.0 * mse);
|
||||
} else {
|
||||
psnr = kMaxPsnr; // Limit to prevent divide by 0
|
||||
}
|
||||
|
||||
if (psnr > kMaxPsnr)
|
||||
psnr = kMaxPsnr;
|
||||
|
||||
return psnr;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double CalcFramePsnr(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b,
|
||||
int width, int height) {
|
||||
const uint64 samples = width * height;
|
||||
const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
|
||||
src_b, stride_b,
|
||||
width, height);
|
||||
return SumSquareErrorToPsnr(sse, samples);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double I420Psnr(const uint8* src_y_a, int stride_y_a,
|
||||
const uint8* src_u_a, int stride_u_a,
|
||||
const uint8* src_v_a, int stride_v_a,
|
||||
const uint8* src_y_b, int stride_y_b,
|
||||
const uint8* src_u_b, int stride_u_b,
|
||||
const uint8* src_v_b, int stride_v_b,
|
||||
int width, int height) {
|
||||
const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
|
||||
src_y_b, stride_y_b,
|
||||
width, height);
|
||||
const int width_uv = (width + 1) >> 1;
|
||||
const int height_uv = (height + 1) >> 1;
|
||||
const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
|
||||
src_u_b, stride_u_b,
|
||||
width_uv, height_uv);
|
||||
const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
|
||||
src_v_b, stride_v_b,
|
||||
width_uv, height_uv);
|
||||
const uint64 samples = width * height + 2 * (width_uv * height_uv);
|
||||
const uint64 sse = sse_y + sse_u + sse_v;
|
||||
return SumSquareErrorToPsnr(sse, samples);
|
||||
}
|
||||
|
||||
static const int64 cc1 = 26634; // (64^2*(.01*255)^2
|
||||
static const int64 cc2 = 239708; // (64^2*(.03*255)^2
|
||||
|
||||
static double Ssim8x8_C(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b) {
|
||||
int64 sum_a = 0;
|
||||
int64 sum_b = 0;
|
||||
int64 sum_sq_a = 0;
|
||||
int64 sum_sq_b = 0;
|
||||
int64 sum_axb = 0;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < 8; ++i) {
|
||||
int j;
|
||||
for (j = 0; j < 8; ++j) {
|
||||
sum_a += src_a[j];
|
||||
sum_b += src_b[j];
|
||||
sum_sq_a += src_a[j] * src_a[j];
|
||||
sum_sq_b += src_b[j] * src_b[j];
|
||||
sum_axb += src_a[j] * src_b[j];
|
||||
}
|
||||
|
||||
src_a += stride_a;
|
||||
src_b += stride_b;
|
||||
}
|
||||
|
||||
{
|
||||
const int64 count = 64;
|
||||
// scale the constants by number of pixels
|
||||
const int64 c1 = (cc1 * count * count) >> 12;
|
||||
const int64 c2 = (cc2 * count * count) >> 12;
|
||||
|
||||
const int64 sum_a_x_sum_b = sum_a * sum_b;
|
||||
|
||||
const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
|
||||
(2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
|
||||
|
||||
const int64 sum_a_sq = sum_a*sum_a;
|
||||
const int64 sum_b_sq = sum_b*sum_b;
|
||||
|
||||
const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
|
||||
(count * sum_sq_a - sum_a_sq +
|
||||
count * sum_sq_b - sum_b_sq + c2);
|
||||
|
||||
if (ssim_d == 0.0) {
|
||||
return DBL_MAX;
|
||||
}
|
||||
return ssim_n * 1.0 / ssim_d;
|
||||
}
|
||||
}
|
||||
|
||||
// We are using a 8x8 moving window with starting location of each 8x8 window
|
||||
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
|
||||
// block boundaries to penalize blocking artifacts.
|
||||
LIBYUV_API
|
||||
double CalcFrameSsim(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b,
|
||||
int width, int height) {
|
||||
int samples = 0;
|
||||
double ssim_total = 0;
|
||||
double (*Ssim8x8)(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b) = Ssim8x8_C;
|
||||
|
||||
// sample point start with each 4x4 location
|
||||
int i;
|
||||
for (i = 0; i < height - 8; i += 4) {
|
||||
int j;
|
||||
for (j = 0; j < width - 8; j += 4) {
|
||||
ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
|
||||
samples++;
|
||||
}
|
||||
|
||||
src_a += stride_a * 4;
|
||||
src_b += stride_b * 4;
|
||||
}
|
||||
|
||||
ssim_total /= samples;
|
||||
return ssim_total;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
double I420Ssim(const uint8* src_y_a, int stride_y_a,
|
||||
const uint8* src_u_a, int stride_u_a,
|
||||
const uint8* src_v_a, int stride_v_a,
|
||||
const uint8* src_y_b, int stride_y_b,
|
||||
const uint8* src_u_b, int stride_u_b,
|
||||
const uint8* src_v_b, int stride_v_b,
|
||||
int width, int height) {
|
||||
const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
|
||||
src_y_b, stride_y_b, width, height);
|
||||
const int width_uv = (width + 1) >> 1;
|
||||
const int height_uv = (height + 1) >> 1;
|
||||
const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
|
||||
src_u_b, stride_u_b,
|
||||
width_uv, height_uv);
|
||||
const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
|
||||
src_v_b, stride_v_b,
|
||||
width_uv, height_uv);
|
||||
return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
44
libs/libyuv/source/compare_common.cc
Normal file
44
libs/libyuv/source/compare_common.cc
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse = 0u;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
int diff = src_a[i] - src_b[i];
|
||||
sse += (uint32)(diff * diff);
|
||||
}
|
||||
return sse;
|
||||
}
|
||||
|
||||
// hash seed of 5381 recommended.
|
||||
// Internal C version of HashDjb2 with int sized count for efficiency.
|
||||
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
||||
uint32 hash = seed;
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
hash += (hash << 5) + src[i];
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
151
libs/libyuv/source/compare_gcc.cc
Normal file
151
libs/libyuv/source/compare_gcc.cc
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse;
|
||||
asm volatile (
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x10, 0) ",%0 \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
|
||||
"lea " MEMLEA(0x10, 1) ",%1 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"psubusb %%xmm2,%%xmm1 \n"
|
||||
"psubusb %%xmm3,%%xmm2 \n"
|
||||
"por %%xmm2,%%xmm1 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm5,%%xmm1 \n"
|
||||
"punpckhbw %%xmm5,%%xmm2 \n"
|
||||
"pmaddwd %%xmm1,%%xmm1 \n"
|
||||
"pmaddwd %%xmm2,%%xmm2 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"paddd %%xmm2,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"pshufd $0xee,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"pshufd $0x1,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"movd %%xmm0,%3 \n"
|
||||
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=g"(sse) // %3
|
||||
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
||||
);
|
||||
return sse;
|
||||
}
|
||||
|
||||
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
||||
static uvec32 kHashMul0 = {
|
||||
0x0c3525e1, // 33 ^ 15
|
||||
0xa3476dc1, // 33 ^ 14
|
||||
0x3b4039a1, // 33 ^ 13
|
||||
0x4f5f0981, // 33 ^ 12
|
||||
};
|
||||
static uvec32 kHashMul1 = {
|
||||
0x30f35d61, // 33 ^ 11
|
||||
0x855cb541, // 33 ^ 10
|
||||
0x040a9121, // 33 ^ 9
|
||||
0x747c7101, // 33 ^ 8
|
||||
};
|
||||
static uvec32 kHashMul2 = {
|
||||
0xec41d4e1, // 33 ^ 7
|
||||
0x4cfa3cc1, // 33 ^ 6
|
||||
0x025528a1, // 33 ^ 5
|
||||
0x00121881, // 33 ^ 4
|
||||
};
|
||||
static uvec32 kHashMul3 = {
|
||||
0x00008c61, // 33 ^ 3
|
||||
0x00000441, // 33 ^ 2
|
||||
0x00000021, // 33 ^ 1
|
||||
0x00000001, // 33 ^ 0
|
||||
};
|
||||
|
||||
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
uint32 hash;
|
||||
asm volatile (
|
||||
"movd %2,%%xmm0 \n"
|
||||
"pxor %%xmm7,%%xmm7 \n"
|
||||
"movdqa %4,%%xmm6 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x10, 0) ",%0 \n"
|
||||
"pmulld %%xmm6,%%xmm0 \n"
|
||||
"movdqa %5,%%xmm5 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm7,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"punpcklwd %%xmm7,%%xmm3 \n"
|
||||
"pmulld %%xmm5,%%xmm3 \n"
|
||||
"movdqa %6,%%xmm5 \n"
|
||||
"movdqa %%xmm2,%%xmm4 \n"
|
||||
"punpckhwd %%xmm7,%%xmm4 \n"
|
||||
"pmulld %%xmm5,%%xmm4 \n"
|
||||
"movdqa %7,%%xmm5 \n"
|
||||
"punpckhbw %%xmm7,%%xmm1 \n"
|
||||
"movdqa %%xmm1,%%xmm2 \n"
|
||||
"punpcklwd %%xmm7,%%xmm2 \n"
|
||||
"pmulld %%xmm5,%%xmm2 \n"
|
||||
"movdqa %8,%%xmm5 \n"
|
||||
"punpckhwd %%xmm7,%%xmm1 \n"
|
||||
"pmulld %%xmm5,%%xmm1 \n"
|
||||
"paddd %%xmm4,%%xmm3 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"paddd %%xmm3,%%xmm1 \n"
|
||||
"pshufd $0xe,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"pshufd $0x1,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%1 \n"
|
||||
"jg 1b \n"
|
||||
"movd %%xmm0,%3 \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(count), // %1
|
||||
"+rm"(seed), // %2
|
||||
"=g"(hash) // %3
|
||||
: "m"(kHash16x33), // %4
|
||||
"m"(kHashMul0), // %5
|
||||
"m"(kHashMul1), // %6
|
||||
"m"(kHashMul2), // %7
|
||||
"m"(kHashMul3) // %8
|
||||
: "memory", "cc"
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
return hash;
|
||||
}
|
||||
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
66
libs/libyuv/source/compare_neon.cc
Normal file
66
libs/libyuv/source/compare_neon.cc
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
volatile uint32 sse;
|
||||
asm volatile (
|
||||
"vmov.u8 q8, #0 \n"
|
||||
"vmov.u8 q10, #0 \n"
|
||||
"vmov.u8 q9, #0 \n"
|
||||
"vmov.u8 q11, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n"
|
||||
"vsubl.u8 q2, d0, d2 \n"
|
||||
"vsubl.u8 q3, d1, d3 \n"
|
||||
"vmlal.s16 q8, d4, d4 \n"
|
||||
"vmlal.s16 q9, d6, d6 \n"
|
||||
"vmlal.s16 q10, d5, d5 \n"
|
||||
"vmlal.s16 q11, d7, d7 \n"
|
||||
"bgt 1b \n"
|
||||
|
||||
"vadd.u32 q8, q8, q9 \n"
|
||||
"vadd.u32 q10, q10, q11 \n"
|
||||
"vadd.u32 q11, q8, q10 \n"
|
||||
"vpaddl.u32 q1, q11 \n"
|
||||
"vadd.u64 d0, d2, d3 \n"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
return sse;
|
||||
}
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
64
libs/libyuv/source/compare_neon64.cc
Normal file
64
libs/libyuv/source/compare_neon64.cc
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
volatile uint32 sse;
|
||||
asm volatile (
|
||||
"eor v16.16b, v16.16b, v16.16b \n"
|
||||
"eor v18.16b, v18.16b, v18.16b \n"
|
||||
"eor v17.16b, v17.16b, v17.16b \n"
|
||||
"eor v19.16b, v19.16b, v19.16b \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"usubl v2.8h, v0.8b, v1.8b \n"
|
||||
"usubl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"smlal v16.4s, v2.4h, v2.4h \n"
|
||||
"smlal v17.4s, v3.4h, v3.4h \n"
|
||||
"smlal2 v18.4s, v2.8h, v2.8h \n"
|
||||
"smlal2 v19.4s, v3.8h, v3.8h \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"add v16.4s, v16.4s, v17.4s \n"
|
||||
"add v18.4s, v18.4s, v19.4s \n"
|
||||
"add v19.4s, v16.4s, v18.4s \n"
|
||||
"addv s0, v19.4s \n"
|
||||
"fmov %w3, s0 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(sse)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
|
||||
return sse;
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
222
libs/libyuv/source/compare_win.cc
Normal file
222
libs/libyuv/source/compare_win.cc
Normal file
@@ -0,0 +1,222 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#include "libyuv/compare_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86 and clangcl
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
__declspec(naked)
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm5, xmm5
|
||||
|
||||
wloop:
|
||||
movdqu xmm1, [eax]
|
||||
lea eax, [eax + 16]
|
||||
movdqu xmm2, [edx]
|
||||
lea edx, [edx + 16]
|
||||
movdqa xmm3, xmm1 // abs trick
|
||||
psubusb xmm1, xmm2
|
||||
psubusb xmm2, xmm3
|
||||
por xmm1, xmm2
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm1, xmm5
|
||||
punpckhbw xmm2, xmm5
|
||||
pmaddwd xmm1, xmm1
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm0, xmm2
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
pshufd xmm1, xmm0, 0xee
|
||||
paddd xmm0, xmm1
|
||||
pshufd xmm1, xmm0, 0x01
|
||||
paddd xmm0, xmm1
|
||||
movd eax, xmm0
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if _MSC_VER >= 1700
|
||||
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
||||
#pragma warning(disable: 4752)
|
||||
__declspec(naked)
|
||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
vpxor ymm0, ymm0, ymm0 // sum
|
||||
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
|
||||
sub edx, eax
|
||||
|
||||
wloop:
|
||||
vmovdqu ymm1, [eax]
|
||||
vmovdqu ymm2, [eax + edx]
|
||||
lea eax, [eax + 32]
|
||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||
vpsubusb ymm2, ymm2, ymm1
|
||||
vpor ymm1, ymm2, ymm3
|
||||
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
|
||||
vpunpckhbw ymm1, ymm1, ymm5
|
||||
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
|
||||
vpmaddwd ymm1, ymm1, ymm1
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm0, ymm0, ymm2
|
||||
sub ecx, 32
|
||||
jg wloop
|
||||
|
||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpermq ymm1, ymm0, 0x02 // high + low lane.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vmovd eax, xmm0
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // _MSC_VER >= 1700
|
||||
|
||||
uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
||||
uvec32 kHashMul0 = {
|
||||
0x0c3525e1, // 33 ^ 15
|
||||
0xa3476dc1, // 33 ^ 14
|
||||
0x3b4039a1, // 33 ^ 13
|
||||
0x4f5f0981, // 33 ^ 12
|
||||
};
|
||||
uvec32 kHashMul1 = {
|
||||
0x30f35d61, // 33 ^ 11
|
||||
0x855cb541, // 33 ^ 10
|
||||
0x040a9121, // 33 ^ 9
|
||||
0x747c7101, // 33 ^ 8
|
||||
};
|
||||
uvec32 kHashMul2 = {
|
||||
0xec41d4e1, // 33 ^ 7
|
||||
0x4cfa3cc1, // 33 ^ 6
|
||||
0x025528a1, // 33 ^ 5
|
||||
0x00121881, // 33 ^ 4
|
||||
};
|
||||
uvec32 kHashMul3 = {
|
||||
0x00008c61, // 33 ^ 3
|
||||
0x00000441, // 33 ^ 2
|
||||
0x00000021, // 33 ^ 1
|
||||
0x00000001, // 33 ^ 0
|
||||
};
|
||||
|
||||
__declspec(naked)
|
||||
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
movd xmm0, [esp + 12] // seed
|
||||
|
||||
pxor xmm7, xmm7 // constant 0 for unpck
|
||||
movdqa xmm6, xmmword ptr kHash16x33
|
||||
|
||||
wloop:
|
||||
movdqu xmm1, [eax] // src[0-15]
|
||||
lea eax, [eax + 16]
|
||||
pmulld xmm0, xmm6 // hash *= 33 ^ 16
|
||||
movdqa xmm5, xmmword ptr kHashMul0
|
||||
movdqa xmm2, xmm1
|
||||
punpcklbw xmm2, xmm7 // src[0-7]
|
||||
movdqa xmm3, xmm2
|
||||
punpcklwd xmm3, xmm7 // src[0-3]
|
||||
pmulld xmm3, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul1
|
||||
movdqa xmm4, xmm2
|
||||
punpckhwd xmm4, xmm7 // src[4-7]
|
||||
pmulld xmm4, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul2
|
||||
punpckhbw xmm1, xmm7 // src[8-15]
|
||||
movdqa xmm2, xmm1
|
||||
punpcklwd xmm2, xmm7 // src[8-11]
|
||||
pmulld xmm2, xmm5
|
||||
movdqa xmm5, xmmword ptr kHashMul3
|
||||
punpckhwd xmm1, xmm7 // src[12-15]
|
||||
pmulld xmm1, xmm5
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm1, xmm3
|
||||
|
||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
paddd xmm1, xmm2
|
||||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if _MSC_VER >= 1700
|
||||
__declspec(naked)
|
||||
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src
|
||||
mov ecx, [esp + 8] // count
|
||||
vmovd xmm0, [esp + 12] // seed
|
||||
|
||||
wloop:
|
||||
vpmovzxbd xmm3, [eax] // src[0-3]
|
||||
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
|
||||
vpmovzxbd xmm4, [eax + 4] // src[4-7]
|
||||
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
|
||||
vpmovzxbd xmm2, [eax + 8] // src[8-11]
|
||||
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
|
||||
vpmovzxbd xmm1, [eax + 12] // src[12-15]
|
||||
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
|
||||
lea eax, [eax + 16]
|
||||
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
|
||||
vpaddd xmm3, xmm3, xmm4 // add 16 results
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm1, xmm1, xmm3
|
||||
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
vpaddd xmm1, xmm1,xmm2
|
||||
vpshufd xmm2, xmm1, 0x01
|
||||
vpaddd xmm1, xmm1, xmm2
|
||||
vpaddd xmm0, xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
vmovd eax, xmm0 // return hash
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // _MSC_VER >= 1700
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
1389
libs/libyuv/source/convert.cc
Normal file
1389
libs/libyuv/source/convert.cc
Normal file
File diff suppressed because it is too large
Load Diff
1455
libs/libyuv/source/convert_argb.cc
Normal file
1455
libs/libyuv/source/convert_argb.cc
Normal file
File diff suppressed because it is too large
Load Diff
1167
libs/libyuv/source/convert_from.cc
Normal file
1167
libs/libyuv/source/convert_from.cc
Normal file
File diff suppressed because it is too large
Load Diff
1286
libs/libyuv/source/convert_from_argb.cc
Normal file
1286
libs/libyuv/source/convert_from_argb.cc
Normal file
File diff suppressed because it is too large
Load Diff
392
libs/libyuv/source/convert_jpeg.cc
Normal file
392
libs/libyuv/source/convert_jpeg.cc
Normal file
@@ -0,0 +1,392 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert.h"
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
struct I420Buffers {
|
||||
uint8* y;
|
||||
int y_stride;
|
||||
uint8* u;
|
||||
int u_stride;
|
||||
uint8* v;
|
||||
int v_stride;
|
||||
int w;
|
||||
int h;
|
||||
};
|
||||
|
||||
static void JpegCopyI420(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I420Copy(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->y, dest->y_stride,
|
||||
dest->u, dest->u_stride,
|
||||
dest->v, dest->v_stride,
|
||||
dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToI420(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I422ToI420(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->y, dest->y_stride,
|
||||
dest->u, dest->u_stride,
|
||||
dest->v, dest->v_stride,
|
||||
dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToI420(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I444ToI420(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->y, dest->y_stride,
|
||||
dest->u, dest->u_stride,
|
||||
dest->v, dest->v_stride,
|
||||
dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI411ToI420(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I411ToI420(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->y, dest->y_stride,
|
||||
dest->u, dest->u_stride,
|
||||
dest->v, dest->v_stride,
|
||||
dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToI420(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
I420Buffers* dest = (I420Buffers*)(opaque);
|
||||
I400ToI420(data[0], strides[0],
|
||||
dest->y, dest->y_stride,
|
||||
dest->u, dest->u_stride,
|
||||
dest->v, dest->v_stride,
|
||||
dest->w, rows);
|
||||
dest->y += rows * dest->y_stride;
|
||||
dest->u += ((rows + 1) >> 1) * dest->u_stride;
|
||||
dest->v += ((rows + 1) >> 1) * dest->v_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// Query size of MJPG in pixels.
|
||||
LIBYUV_API
|
||||
int MJPGSize(const uint8* sample, size_t sample_size,
|
||||
int* width, int* height) {
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
|
||||
if (ret) {
|
||||
*width = mjpeg_decoder.GetWidth();
|
||||
*height = mjpeg_decoder.GetHeight();
|
||||
}
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return ret ? 0 : -1; // -1 for runtime failure.
|
||||
}
|
||||
|
||||
// MJPG (Motion JPeg) to I420
|
||||
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
|
||||
LIBYUV_API
|
||||
int MJPGToI420(const uint8* sample,
|
||||
size_t sample_size,
|
||||
uint8* y, int y_stride,
|
||||
uint8* u, int u_stride,
|
||||
uint8* v, int v_stride,
|
||||
int w, int h,
|
||||
int dw, int dh) {
|
||||
if (sample_size == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != w ||
|
||||
mjpeg_decoder.GetHeight() != h)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
|
||||
// YUV411
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
|
||||
} else {
|
||||
// TODO(fbarchard): Implement conversion for any other colorspace/sample
|
||||
// factors that occur in practice. 411 is supported by libjpeg
|
||||
// ERROR: Unable to convert MJPEG frame because format is not supported
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
struct ARGBBuffers {
|
||||
uint8* argb;
|
||||
int argb_stride;
|
||||
int w;
|
||||
int h;
|
||||
};
|
||||
|
||||
static void JpegI420ToARGB(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I420ToARGB(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->argb, dest->argb_stride,
|
||||
dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI422ToARGB(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I422ToARGB(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->argb, dest->argb_stride,
|
||||
dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI444ToARGB(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I444ToARGB(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->argb, dest->argb_stride,
|
||||
dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI411ToARGB(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I411ToARGB(data[0], strides[0],
|
||||
data[1], strides[1],
|
||||
data[2], strides[2],
|
||||
dest->argb, dest->argb_stride,
|
||||
dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
static void JpegI400ToARGB(void* opaque,
|
||||
const uint8* const* data,
|
||||
const int* strides,
|
||||
int rows) {
|
||||
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
|
||||
I400ToARGB(data[0], strides[0],
|
||||
dest->argb, dest->argb_stride,
|
||||
dest->w, rows);
|
||||
dest->argb += rows * dest->argb_stride;
|
||||
dest->h -= rows;
|
||||
}
|
||||
|
||||
// MJPG (Motion JPeg) to ARGB
|
||||
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
|
||||
LIBYUV_API
|
||||
int MJPGToARGB(const uint8* sample,
|
||||
size_t sample_size,
|
||||
uint8* argb, int argb_stride,
|
||||
int w, int h,
|
||||
int dw, int dh) {
|
||||
if (sample_size == kUnknownDataSize) {
|
||||
// ERROR: MJPEG frame size unknown
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Port MJpeg to C.
|
||||
MJpegDecoder mjpeg_decoder;
|
||||
LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
|
||||
if (ret && (mjpeg_decoder.GetWidth() != w ||
|
||||
mjpeg_decoder.GetHeight() != h)) {
|
||||
// ERROR: MJPEG frame has unexpected dimensions
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1; // runtime failure
|
||||
}
|
||||
if (ret) {
|
||||
ARGBBuffers bufs = { argb, argb_stride, dw, dh };
|
||||
// YUV420
|
||||
if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
|
||||
// YUV422
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
|
||||
// YUV444
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
|
||||
// YUV411
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceYCbCr &&
|
||||
mjpeg_decoder.GetNumComponents() == 3 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
|
||||
mjpeg_decoder.GetVertSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
|
||||
// YUV400
|
||||
} else if (mjpeg_decoder.GetColorSpace() ==
|
||||
MJpegDecoder::kColorSpaceGrayscale &&
|
||||
mjpeg_decoder.GetNumComponents() == 1 &&
|
||||
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
|
||||
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
|
||||
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
|
||||
} else {
|
||||
// TODO(fbarchard): Implement conversion for any other colorspace/sample
|
||||
// factors that occur in practice. 411 is supported by libjpeg
|
||||
// ERROR: Unable to convert MJPEG frame because format is not supported
|
||||
mjpeg_decoder.UnloadFrame();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret ? 0 : 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
306
libs/libyuv/source/convert_to_argb.cc
Normal file
306
libs/libyuv/source/convert_to_argb.cc
Normal file
@@ -0,0 +1,306 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert_argb.h"
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#ifdef HAVE_JPEG
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
#endif
|
||||
#include "libyuv/rotate_argb.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Convert camera sample to I420 with cropping, rotation and vertical flip.
|
||||
// src_width is used for source stride computation
|
||||
// src_height is used to compute location of planes, and indicate inversion
|
||||
// sample_size is measured in bytes and is the size of the frame.
|
||||
// With MJPEG it is the compressed size of the frame.
|
||||
LIBYUV_API
|
||||
int ConvertToARGB(const uint8* sample, size_t sample_size,
|
||||
uint8* crop_argb, int argb_stride,
|
||||
int crop_x, int crop_y,
|
||||
int src_width, int src_height,
|
||||
int crop_width, int crop_height,
|
||||
enum RotationMode rotation,
|
||||
uint32 fourcc) {
|
||||
uint32 format = CanonicalFourCC(fourcc);
|
||||
int aligned_src_width = (src_width + 1) & ~1;
|
||||
const uint8* src;
|
||||
const uint8* src_uv;
|
||||
int abs_src_height = (src_height < 0) ? -src_height : src_height;
|
||||
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
|
||||
int r = 0;
|
||||
|
||||
// One pass rotation is available for some formats. For the rest, convert
|
||||
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
|
||||
// and then rotate the I420 to the final destination buffer.
|
||||
// For in-place conversion, if destination crop_argb is same as source sample,
|
||||
// also enable temporary buffer.
|
||||
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
|
||||
crop_argb == sample;
|
||||
uint8* tmp_argb = crop_argb;
|
||||
int tmp_argb_stride = argb_stride;
|
||||
uint8* rotate_buffer = NULL;
|
||||
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
|
||||
|
||||
if (crop_argb == NULL || sample == NULL ||
|
||||
src_width <= 0 || crop_width <= 0 ||
|
||||
src_height == 0 || crop_height == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (src_height < 0) {
|
||||
inv_crop_height = -inv_crop_height;
|
||||
}
|
||||
|
||||
if (need_buf) {
|
||||
int argb_size = crop_width * abs_crop_height * 4;
|
||||
rotate_buffer = (uint8*)malloc(argb_size);
|
||||
if (!rotate_buffer) {
|
||||
return 1; // Out of memory runtime error.
|
||||
}
|
||||
crop_argb = rotate_buffer;
|
||||
argb_stride = crop_width;
|
||||
}
|
||||
|
||||
switch (format) {
|
||||
// Single plane formats
|
||||
case FOURCC_YUY2:
|
||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||
r = YUY2ToARGB(src, aligned_src_width * 2,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_UYVY:
|
||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||
r = UYVYToARGB(src, aligned_src_width * 2,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_24BG:
|
||||
src = sample + (src_width * crop_y + crop_x) * 3;
|
||||
r = RGB24ToARGB(src, src_width * 3,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RAW:
|
||||
src = sample + (src_width * crop_y + crop_x) * 3;
|
||||
r = RAWToARGB(src, src_width * 3,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_ARGB:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = ARGBToARGB(src, src_width * 4,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_BGRA:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = BGRAToARGB(src, src_width * 4,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_ABGR:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = ABGRToARGB(src, src_width * 4,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBA:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = RGBAToARGB(src, src_width * 4,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBP:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = RGB565ToARGB(src, src_width * 2,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBO:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = ARGB1555ToARGB(src, src_width * 2,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_R444:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = ARGB4444ToARGB(src, src_width * 2,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_I400:
|
||||
src = sample + src_width * crop_y + crop_x;
|
||||
r = I400ToARGB(src, src_width,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
|
||||
// Biplanar formats
|
||||
case FOURCC_NV12:
|
||||
src = sample + (src_width * crop_y + crop_x);
|
||||
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
|
||||
r = NV12ToARGB(src, src_width,
|
||||
src_uv, aligned_src_width,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_NV21:
|
||||
src = sample + (src_width * crop_y + crop_x);
|
||||
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
|
||||
// Call NV12 but with u and v parameters swapped.
|
||||
r = NV21ToARGB(src, src_width,
|
||||
src_uv, aligned_src_width,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_M420:
|
||||
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
|
||||
r = M420ToARGB(src, src_width,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
// Triplanar formats
|
||||
case FOURCC_I420:
|
||||
case FOURCC_YU12:
|
||||
case FOURCC_YV12: {
|
||||
const uint8* src_y = sample + (src_width * crop_y + crop_x);
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
int halfwidth = (src_width + 1) / 2;
|
||||
int halfheight = (abs_src_height + 1) / 2;
|
||||
if (format == FOURCC_YV12) {
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
(halfwidth * crop_y + crop_x) / 2;
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
|
||||
} else {
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
(halfwidth * crop_y + crop_x) / 2;
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
|
||||
}
|
||||
r = I420ToARGB(src_y, src_width,
|
||||
src_u, halfwidth,
|
||||
src_v, halfwidth,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
|
||||
case FOURCC_J420: {
|
||||
const uint8* src_y = sample + (src_width * crop_y + crop_x);
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
int halfwidth = (src_width + 1) / 2;
|
||||
int halfheight = (abs_src_height + 1) / 2;
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
(halfwidth * crop_y + crop_x) / 2;
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
|
||||
r = J420ToARGB(src_y, src_width,
|
||||
src_u, halfwidth,
|
||||
src_v, halfwidth,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
|
||||
case FOURCC_I422:
|
||||
case FOURCC_YV16: {
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
int halfwidth = (src_width + 1) / 2;
|
||||
if (format == FOURCC_YV16) {
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * crop_y + crop_x / 2;
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
|
||||
} else {
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * crop_y + crop_x / 2;
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
|
||||
}
|
||||
r = I422ToARGB(src_y, src_width,
|
||||
src_u, halfwidth,
|
||||
src_v, halfwidth,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I444:
|
||||
case FOURCC_YV24: {
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
if (format == FOURCC_YV24) {
|
||||
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
|
||||
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
|
||||
} else {
|
||||
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
|
||||
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
|
||||
}
|
||||
r = I444ToARGB(src_y, src_width,
|
||||
src_u, src_width,
|
||||
src_v, src_width,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I411: {
|
||||
int quarterwidth = (src_width + 3) / 4;
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u = sample + src_width * abs_src_height +
|
||||
quarterwidth * crop_y + crop_x / 4;
|
||||
const uint8* src_v = sample + src_width * abs_src_height +
|
||||
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
|
||||
r = I411ToARGB(src_y, src_width,
|
||||
src_u, quarterwidth,
|
||||
src_v, quarterwidth,
|
||||
crop_argb, argb_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
#ifdef HAVE_JPEG
|
||||
case FOURCC_MJPG:
|
||||
r = MJPGToARGB(sample, sample_size,
|
||||
crop_argb, argb_stride,
|
||||
src_width, abs_src_height, crop_width, inv_crop_height);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
r = -1; // unknown fourcc - return failure code.
|
||||
}
|
||||
|
||||
if (need_buf) {
|
||||
if (!r) {
|
||||
r = ARGBRotate(crop_argb, argb_stride,
|
||||
tmp_argb, tmp_argb_stride,
|
||||
crop_width, abs_crop_height, rotation);
|
||||
}
|
||||
free(rotate_buffer);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
339
libs/libyuv/source/convert_to_i420.cc
Normal file
339
libs/libyuv/source/convert_to_i420.cc
Normal file
@@ -0,0 +1,339 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "libyuv/convert.h"
|
||||
|
||||
#include "libyuv/video_common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Convert camera sample to I420 with cropping, rotation and vertical flip.
|
||||
// src_width is used for source stride computation
|
||||
// src_height is used to compute location of planes, and indicate inversion
|
||||
// sample_size is measured in bytes and is the size of the frame.
|
||||
// With MJPEG it is the compressed size of the frame.
|
||||
LIBYUV_API
|
||||
int ConvertToI420(const uint8* sample,
|
||||
size_t sample_size,
|
||||
uint8* y, int y_stride,
|
||||
uint8* u, int u_stride,
|
||||
uint8* v, int v_stride,
|
||||
int crop_x, int crop_y,
|
||||
int src_width, int src_height,
|
||||
int crop_width, int crop_height,
|
||||
enum RotationMode rotation,
|
||||
uint32 fourcc) {
|
||||
uint32 format = CanonicalFourCC(fourcc);
|
||||
int aligned_src_width = (src_width + 1) & ~1;
|
||||
const uint8* src;
|
||||
const uint8* src_uv;
|
||||
int abs_src_height = (src_height < 0) ? -src_height : src_height;
|
||||
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
|
||||
int r = 0;
|
||||
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
|
||||
format != FOURCC_NV12 && format != FOURCC_NV21 &&
|
||||
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
|
||||
uint8* tmp_y = y;
|
||||
uint8* tmp_u = u;
|
||||
uint8* tmp_v = v;
|
||||
int tmp_y_stride = y_stride;
|
||||
int tmp_u_stride = u_stride;
|
||||
int tmp_v_stride = v_stride;
|
||||
uint8* rotate_buffer = NULL;
|
||||
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
|
||||
|
||||
if (!y || !u || !v || !sample ||
|
||||
src_width <= 0 || crop_width <= 0 ||
|
||||
src_height == 0 || crop_height == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (src_height < 0) {
|
||||
inv_crop_height = -inv_crop_height;
|
||||
}
|
||||
|
||||
// One pass rotation is available for some formats. For the rest, convert
|
||||
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
|
||||
// and then rotate the I420 to the final destination buffer.
|
||||
// For in-place conversion, if destination y is same as source sample,
|
||||
// also enable temporary buffer.
|
||||
if (need_buf) {
|
||||
int y_size = crop_width * abs_crop_height;
|
||||
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
|
||||
rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
|
||||
if (!rotate_buffer) {
|
||||
return 1; // Out of memory runtime error.
|
||||
}
|
||||
y = rotate_buffer;
|
||||
u = y + y_size;
|
||||
v = u + uv_size;
|
||||
y_stride = crop_width;
|
||||
u_stride = v_stride = ((crop_width + 1) / 2);
|
||||
}
|
||||
|
||||
switch (format) {
|
||||
// Single plane formats
|
||||
case FOURCC_YUY2:
|
||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||
r = YUY2ToI420(src, aligned_src_width * 2,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_UYVY:
|
||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||
r = UYVYToI420(src, aligned_src_width * 2,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBP:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = RGB565ToI420(src, src_width * 2,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBO:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = ARGB1555ToI420(src, src_width * 2,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_R444:
|
||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||
r = ARGB4444ToI420(src, src_width * 2,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_24BG:
|
||||
src = sample + (src_width * crop_y + crop_x) * 3;
|
||||
r = RGB24ToI420(src, src_width * 3,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RAW:
|
||||
src = sample + (src_width * crop_y + crop_x) * 3;
|
||||
r = RAWToI420(src, src_width * 3,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_ARGB:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = ARGBToI420(src, src_width * 4,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_BGRA:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = BGRAToI420(src, src_width * 4,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_ABGR:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = ABGRToI420(src, src_width * 4,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_RGBA:
|
||||
src = sample + (src_width * crop_y + crop_x) * 4;
|
||||
r = RGBAToI420(src, src_width * 4,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
case FOURCC_I400:
|
||||
src = sample + src_width * crop_y + crop_x;
|
||||
r = I400ToI420(src, src_width,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
// Biplanar formats
|
||||
case FOURCC_NV12:
|
||||
src = sample + (src_width * crop_y + crop_x);
|
||||
src_uv = sample + (src_width * src_height) +
|
||||
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
|
||||
r = NV12ToI420Rotate(src, src_width,
|
||||
src_uv, aligned_src_width,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height, rotation);
|
||||
break;
|
||||
case FOURCC_NV21:
|
||||
src = sample + (src_width * crop_y + crop_x);
|
||||
src_uv = sample + (src_width * src_height) +
|
||||
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
|
||||
// Call NV12 but with u and v parameters swapped.
|
||||
r = NV12ToI420Rotate(src, src_width,
|
||||
src_uv, aligned_src_width,
|
||||
y, y_stride,
|
||||
v, v_stride,
|
||||
u, u_stride,
|
||||
crop_width, inv_crop_height, rotation);
|
||||
break;
|
||||
case FOURCC_M420:
|
||||
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
|
||||
r = M420ToI420(src, src_width,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
// Triplanar formats
|
||||
case FOURCC_I420:
|
||||
case FOURCC_YU12:
|
||||
case FOURCC_YV12: {
|
||||
const uint8* src_y = sample + (src_width * crop_y + crop_x);
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
int halfwidth = (src_width + 1) / 2;
|
||||
int halfheight = (abs_src_height + 1) / 2;
|
||||
if (format == FOURCC_YV12) {
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
(halfwidth * crop_y + crop_x) / 2;
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
|
||||
} else {
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
(halfwidth * crop_y + crop_x) / 2;
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
|
||||
}
|
||||
r = I420Rotate(src_y, src_width,
|
||||
src_u, halfwidth,
|
||||
src_v, halfwidth,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height, rotation);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I422:
|
||||
case FOURCC_YV16: {
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
int halfwidth = (src_width + 1) / 2;
|
||||
if (format == FOURCC_YV16) {
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * crop_y + crop_x / 2;
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
|
||||
} else {
|
||||
src_u = sample + src_width * abs_src_height +
|
||||
halfwidth * crop_y + crop_x / 2;
|
||||
src_v = sample + src_width * abs_src_height +
|
||||
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
|
||||
}
|
||||
r = I422ToI420(src_y, src_width,
|
||||
src_u, halfwidth,
|
||||
src_v, halfwidth,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I444:
|
||||
case FOURCC_YV24: {
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u;
|
||||
const uint8* src_v;
|
||||
if (format == FOURCC_YV24) {
|
||||
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
|
||||
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
|
||||
} else {
|
||||
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
|
||||
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
|
||||
}
|
||||
r = I444ToI420(src_y, src_width,
|
||||
src_u, src_width,
|
||||
src_v, src_width,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
case FOURCC_I411: {
|
||||
int quarterwidth = (src_width + 3) / 4;
|
||||
const uint8* src_y = sample + src_width * crop_y + crop_x;
|
||||
const uint8* src_u = sample + src_width * abs_src_height +
|
||||
quarterwidth * crop_y + crop_x / 4;
|
||||
const uint8* src_v = sample + src_width * abs_src_height +
|
||||
quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
|
||||
r = I411ToI420(src_y, src_width,
|
||||
src_u, quarterwidth,
|
||||
src_v, quarterwidth,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
crop_width, inv_crop_height);
|
||||
break;
|
||||
}
|
||||
#ifdef HAVE_JPEG
|
||||
case FOURCC_MJPG:
|
||||
r = MJPGToI420(sample, sample_size,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
src_width, abs_src_height, crop_width, inv_crop_height);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
r = -1; // unknown fourcc - return failure code.
|
||||
}
|
||||
|
||||
if (need_buf) {
|
||||
if (!r) {
|
||||
r = I420Rotate(y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
tmp_y, tmp_y_stride,
|
||||
tmp_u, tmp_u_stride,
|
||||
tmp_v, tmp_v_stride,
|
||||
crop_width, abs_crop_height, rotation);
|
||||
}
|
||||
free(rotate_buffer);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
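As a usage sketch (not part of the import), the ConvertToI420 entry point above turns a packed camera frame into caller-allocated I420 planes. The YUY2 format choice and the 640x480 dimensions below are illustrative assumptions; the argument order follows the signature shown above.

// Illustrative sketch only: convert a 640x480 YUY2 frame to I420 with no
// crop and no rotation. Plane strides follow the 4:2:0 layout used above.
#include "libyuv/convert.h"
#include "libyuv/video_common.h"

int Yuy2FrameToI420Sketch(const uint8* yuy2, size_t yuy2_size,
                          uint8* y, uint8* u, uint8* v) {
  const int kWidth = 640;
  const int kHeight = 480;
  return libyuv::ConvertToI420(yuy2, yuy2_size,
                               y, kWidth,            // Y plane and stride.
                               u, (kWidth + 1) / 2,  // U plane and stride.
                               v, (kWidth + 1) / 2,  // V plane and stride.
                               0, 0,                 // crop_x, crop_y.
                               kWidth, kHeight,      // source dimensions.
                               kWidth, kHeight,      // crop dimensions.
                               libyuv::kRotate0,
                               libyuv::FOURCC_YUY2);
}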
299
libs/libyuv/source/cpu_id.cc
Normal file
@@ -0,0 +1,299 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#include <intrin.h> // For __cpuidex()
|
||||
#endif
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
|
||||
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
|
||||
#include <immintrin.h> // For _xgetbv()
|
||||
#endif
|
||||
|
||||
#if !defined(__native_client__)
|
||||
#include <stdlib.h> // For getenv()
|
||||
#endif
|
||||
|
||||
// For ArmCpuCaps() but unittested on all platforms
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libyuv/basic_types.h" // For CPU_X86
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// For functions that use the stack and have runtime checks for overflow,
|
||||
// use SAFEBUFFERS to avoid additional check.
|
||||
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
|
||||
#define SAFEBUFFERS __declspec(safebuffers)
|
||||
#else
|
||||
#define SAFEBUFFERS
|
||||
#endif
|
||||
|
||||
// Low level cpuid for X86.
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || \
|
||||
defined(__i386__) || defined(__x86_64__)) && \
|
||||
!defined(__pnacl__) && !defined(__CLR_VER)
|
||||
LIBYUV_API
|
||||
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
// Visual C version uses intrinsic or inline x86 assembly.
|
||||
#if (_MSC_FULL_VER >= 160040219)
|
||||
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
|
||||
#elif defined(_M_IX86)
|
||||
__asm {
|
||||
mov eax, info_eax
|
||||
mov ecx, info_ecx
|
||||
mov edi, cpu_info
|
||||
cpuid
|
||||
mov [edi], eax
|
||||
mov [edi + 4], ebx
|
||||
mov [edi + 8], ecx
|
||||
mov [edi + 12], edx
|
||||
}
|
||||
#else // Visual C but not x86
|
||||
if (info_ecx == 0) {
|
||||
__cpuid((int*)(cpu_info), info_eax);
|
||||
} else {
|
||||
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
|
||||
}
|
||||
#endif
|
||||
// GCC version uses inline x86 assembly.
|
||||
#else // defined(_MSC_VER) && !defined(__clang__)
|
||||
uint32 info_ebx, info_edx;
|
||||
asm volatile (
|
||||
#if defined( __i386__) && defined(__PIC__)
|
||||
// Preserve ebx for fpic 32 bit.
|
||||
"mov %%ebx, %%edi \n"
|
||||
"cpuid \n"
|
||||
"xchg %%edi, %%ebx \n"
|
||||
: "=D" (info_ebx),
|
||||
#else
|
||||
"cpuid \n"
|
||||
: "=b" (info_ebx),
|
||||
#endif // defined( __i386__) && defined(__PIC__)
|
||||
"+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
|
||||
cpu_info[0] = info_eax;
|
||||
cpu_info[1] = info_ebx;
|
||||
cpu_info[2] = info_ecx;
|
||||
cpu_info[3] = info_edx;
|
||||
#endif // defined(_MSC_VER) && !defined(__clang__)
|
||||
}
|
||||
#else // (defined(_M_IX86) || defined(_M_X64) ...
|
||||
LIBYUV_API
|
||||
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
|
||||
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// For VS2010 and earlier emit can be used:
|
||||
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
|
||||
// __asm {
|
||||
// xor ecx, ecx // xcr 0
|
||||
// xgetbv
|
||||
// mov xcr0, eax
|
||||
// }
|
||||
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
|
||||
// https://code.google.com/p/libyuv/issues/detail?id=529
|
||||
#if defined(_M_IX86) && (_MSC_VER < 1900)
|
||||
#pragma optimize("g", off)
|
||||
#endif
|
||||
#if (defined(_M_IX86) || defined(_M_X64) || \
|
||||
defined(__i386__) || defined(__x86_64__)) && \
|
||||
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
|
||||
#define HAS_XGETBV
|
||||
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
|
||||
int GetXCR0() {
|
||||
uint32 xcr0 = 0u;
|
||||
#if (_MSC_FULL_VER >= 160040219)
|
||||
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
|
||||
#elif defined(__i386__) || defined(__x86_64__)
|
||||
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
|
||||
#endif // defined(__i386__) || defined(__x86_64__)
|
||||
return xcr0;
|
||||
}
|
||||
#endif // defined(_M_IX86) || defined(_M_X64) ..
|
||||
// Return optimization to previous setting.
|
||||
#if defined(_M_IX86) && (_MSC_VER < 1900)
|
||||
#pragma optimize("g", on)
|
||||
#endif
|
||||
|
||||
// based on libvpx arm_cpudetect.c
|
||||
// For Arm, but public to allow testing on any CPU
|
||||
LIBYUV_API SAFEBUFFERS
|
||||
int ArmCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
FILE* f = fopen(cpuinfo_name, "r");
|
||||
if (!f) {
|
||||
// Assume Neon if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
return kCpuHasNEON;
|
||||
}
|
||||
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
|
||||
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
|
||||
char* p = strstr(cpuinfo_line, " neon");
|
||||
if (p && (p[5] == ' ' || p[5] == '\n')) {
|
||||
fclose(f);
|
||||
return kCpuHasNEON;
|
||||
}
|
||||
// aarch64 uses asimd for Neon.
|
||||
p = strstr(cpuinfo_line, " asimd");
|
||||
if (p && (p[6] == ' ' || p[6] == '\n')) {
|
||||
fclose(f);
|
||||
return kCpuHasNEON;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// CPU detect function for SIMD instruction sets.
|
||||
LIBYUV_API
|
||||
int cpu_info_ = 0; // cpu_info is not initialized yet.
|
||||
|
||||
// Test environment variable for disabling CPU features. Any non-zero value
|
||||
// to disable. Zero ignored to make it easy to set the variable on/off.
|
||||
#if !defined(__native_client__) && !defined(_M_ARM)
|
||||
|
||||
static LIBYUV_BOOL TestEnv(const char* name) {
|
||||
const char* var = getenv(name);
|
||||
if (var) {
|
||||
if (var[0] != '0') {
|
||||
return LIBYUV_TRUE;
|
||||
}
|
||||
}
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#else // nacl does not support getenv().
|
||||
static LIBYUV_BOOL TestEnv(const char*) {
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#endif
|
||||
|
||||
LIBYUV_API SAFEBUFFERS
|
||||
int InitCpuFlags(void) {
|
||||
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
|
||||
int cpu_info = 0;
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
|
||||
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
|
||||
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
|
||||
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
|
||||
CpuId(0, 0, cpu_info0);
|
||||
CpuId(1, 0, cpu_info1);
|
||||
if (cpu_info0[0] >= 7) {
|
||||
CpuId(7, 0, cpu_info7);
|
||||
}
|
||||
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
|
||||
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
|
||||
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
|
||||
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
|
||||
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
|
||||
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
|
||||
kCpuHasX86;
|
||||
|
||||
#ifdef HAS_XGETBV
|
||||
// AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
|
||||
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
|
||||
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
|
||||
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
|
||||
|
||||
// Detect AVX512bw
|
||||
if ((GetXCR0() & 0xe0) == 0xe0) {
|
||||
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Environment variable overrides for testing.
|
||||
if (TestEnv("LIBYUV_DISABLE_X86")) {
|
||||
cpu_info &= ~kCpuHasX86;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
|
||||
cpu_info &= ~kCpuHasSSE2;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
|
||||
cpu_info &= ~kCpuHasSSSE3;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
|
||||
cpu_info &= ~kCpuHasSSE41;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
|
||||
cpu_info &= ~kCpuHasSSE42;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_AVX")) {
|
||||
cpu_info &= ~kCpuHasAVX;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
|
||||
cpu_info &= ~kCpuHasAVX2;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
|
||||
cpu_info &= ~kCpuHasERMS;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
|
||||
cpu_info &= ~kCpuHasFMA3;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
|
||||
cpu_info &= ~kCpuHasAVX3;
|
||||
}
|
||||
#endif
|
||||
#if defined(__mips__) && defined(__linux__)
|
||||
#if defined(__mips_dspr2)
|
||||
cpu_info |= kCpuHasDSPR2;
|
||||
#endif
|
||||
cpu_info |= kCpuHasMIPS;
|
||||
if (getenv("LIBYUV_DISABLE_DSPR2")) {
|
||||
cpu_info &= ~kCpuHasDSPR2;
|
||||
}
|
||||
#endif
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
// gcc -mfpu=neon defines __ARM_NEON__
|
||||
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
|
||||
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
|
||||
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
|
||||
cpu_info = kCpuHasNEON;
|
||||
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
|
||||
// flag in it.
|
||||
// So for aarch64, neon enabling is hard coded here.
|
||||
#endif
|
||||
#if defined(__aarch64__)
|
||||
cpu_info = kCpuHasNEON;
|
||||
#else
|
||||
// Linux arm parse text file for neon detect.
|
||||
cpu_info = ArmCpuCaps("/proc/cpuinfo");
|
||||
#endif
|
||||
cpu_info |= kCpuHasARM;
|
||||
if (TestEnv("LIBYUV_DISABLE_NEON")) {
|
||||
cpu_info &= ~kCpuHasNEON;
|
||||
}
|
||||
#endif // __arm__
|
||||
if (TestEnv("LIBYUV_DISABLE_ASM")) {
|
||||
cpu_info = 0;
|
||||
}
|
||||
cpu_info |= kCpuInitialized;
|
||||
cpu_info_ = cpu_info;
|
||||
return cpu_info;
|
||||
}
|
||||
|
||||
// Note that use of this function is not thread safe.
|
||||
LIBYUV_API
|
||||
void MaskCpuFlags(int enable_flags) {
|
||||
cpu_info_ = InitCpuFlags() & enable_flags;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
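A short sketch (not part of the import) of how the flags produced by InitCpuFlags are typically consulted and, for testing, masked. TestCpuFlag is the accessor used throughout the rotation and conversion code in this commit; its lazy initialization on first use is an assumption based on cpu_info_ starting at 0 above.

// Illustrative sketch only: query and restrict CPU feature detection.
#include "libyuv/cpu_id.h"

void CpuFlagSketch() {
  // Returns non-zero when SSSE3 paths may be used.
  bool has_ssse3 = libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
  (void)has_ssse3;

  // For benchmarking, limit libyuv to SSE2-and-below code paths. Note the
  // comment above: MaskCpuFlags is not thread safe.
  libyuv::MaskCpuFlags(libyuv::kCpuInitialized | libyuv::kCpuHasX86 |
                       libyuv::kCpuHasSSE2);
}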
570
libs/libyuv/source/mjpeg_decoder.cc
Normal file
@@ -0,0 +1,570 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/mjpeg_decoder.h"
|
||||
|
||||
#ifdef HAVE_JPEG
|
||||
#include <assert.h>
|
||||
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
|
||||
// Must be included before jpeglib.
|
||||
#include <setjmp.h>
|
||||
#define HAVE_SETJMP
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable warning 4324: structure was padded due to __declspec(align())
|
||||
#pragma warning(disable:4324)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
struct FILE; // For jpeglib.h.
|
||||
|
||||
// C++ build requires extern C for jpeg internals.
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <jpeglib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#include "libyuv/planar_functions.h" // For CopyPlane().
|
||||
|
||||
namespace libyuv {
|
||||
|
||||
#ifdef HAVE_SETJMP
|
||||
struct SetJmpErrorMgr {
|
||||
jpeg_error_mgr base; // Must be at the top
|
||||
jmp_buf setjmp_buffer;
|
||||
};
|
||||
#endif
|
||||
|
||||
const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
|
||||
const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
|
||||
const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
|
||||
const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
|
||||
const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
|
||||
const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
|
||||
|
||||
// Methods that are passed to jpeglib.
|
||||
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
|
||||
void init_source(jpeg_decompress_struct* cinfo);
|
||||
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
|
||||
void term_source(jpeg_decompress_struct* cinfo);
|
||||
void ErrorHandler(jpeg_common_struct* cinfo);
|
||||
|
||||
MJpegDecoder::MJpegDecoder()
|
||||
: has_scanline_padding_(LIBYUV_FALSE),
|
||||
num_outbufs_(0),
|
||||
scanlines_(NULL),
|
||||
scanlines_sizes_(NULL),
|
||||
databuf_(NULL),
|
||||
databuf_strides_(NULL) {
|
||||
decompress_struct_ = new jpeg_decompress_struct;
|
||||
source_mgr_ = new jpeg_source_mgr;
|
||||
#ifdef HAVE_SETJMP
|
||||
error_mgr_ = new SetJmpErrorMgr;
|
||||
decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
|
||||
// Override standard exit()-based error handler.
|
||||
error_mgr_->base.error_exit = &ErrorHandler;
|
||||
#endif
|
||||
decompress_struct_->client_data = NULL;
|
||||
source_mgr_->init_source = &init_source;
|
||||
source_mgr_->fill_input_buffer = &fill_input_buffer;
|
||||
source_mgr_->skip_input_data = &skip_input_data;
|
||||
source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
|
||||
source_mgr_->term_source = &term_source;
|
||||
jpeg_create_decompress(decompress_struct_);
|
||||
decompress_struct_->src = source_mgr_;
|
||||
buf_vec_.buffers = &buf_;
|
||||
buf_vec_.len = 1;
|
||||
}
|
||||
|
||||
MJpegDecoder::~MJpegDecoder() {
|
||||
jpeg_destroy_decompress(decompress_struct_);
|
||||
delete decompress_struct_;
|
||||
delete source_mgr_;
|
||||
#ifdef HAVE_SETJMP
|
||||
delete error_mgr_;
|
||||
#endif
|
||||
DestroyOutputBuffers();
|
||||
}
|
||||
|
||||
LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
|
||||
if (!ValidateJpeg(src, src_len)) {
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
|
||||
buf_.data = src;
|
||||
buf_.len = static_cast<int>(src_len);
|
||||
buf_vec_.pos = 0;
|
||||
decompress_struct_->client_data = &buf_vec_;
|
||||
#ifdef HAVE_SETJMP
|
||||
if (setjmp(error_mgr_->setjmp_buffer)) {
|
||||
// We called jpeg_read_header, it experienced an error, and we called
|
||||
// longjmp() and rewound the stack to here. Return error.
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#endif
|
||||
if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
|
||||
// ERROR: Bad MJPEG header
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
AllocOutputBuffers(GetNumComponents());
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
int scanlines_size = GetComponentScanlinesPerImcuRow(i);
|
||||
if (scanlines_sizes_[i] != scanlines_size) {
|
||||
if (scanlines_[i]) {
|
||||
delete scanlines_[i];
|
||||
}
|
||||
scanlines_[i] = new uint8* [scanlines_size];
|
||||
scanlines_sizes_[i] = scanlines_size;
|
||||
}
|
||||
|
||||
// We allocate padding for the final scanline to pad it up to DCTSIZE bytes
|
||||
// to avoid memory errors, since jpeglib only reads full MCUs blocks. For
|
||||
// the preceding scanlines, the padding is not needed/wanted because the
|
||||
// following addresses will already be valid (they are the initial bytes of
|
||||
// the next scanline) and will be overwritten when jpeglib writes out that
|
||||
// next scanline.
|
||||
int databuf_stride = GetComponentStride(i);
|
||||
int databuf_size = scanlines_size * databuf_stride;
|
||||
if (databuf_strides_[i] != databuf_stride) {
|
||||
if (databuf_[i]) {
|
||||
delete databuf_[i];
|
||||
}
|
||||
databuf_[i] = new uint8[databuf_size];
|
||||
databuf_strides_[i] = databuf_stride;
|
||||
}
|
||||
|
||||
if (GetComponentStride(i) != GetComponentWidth(i)) {
|
||||
has_scanline_padding_ = LIBYUV_TRUE;
|
||||
}
|
||||
}
|
||||
return LIBYUV_TRUE;
|
||||
}
|
||||
|
||||
static int DivideAndRoundUp(int numerator, int denominator) {
|
||||
return (numerator + denominator - 1) / denominator;
|
||||
}
|
||||
|
||||
static int DivideAndRoundDown(int numerator, int denominator) {
|
||||
return numerator / denominator;
|
||||
}
|
||||
|
||||
// Returns width of the last loaded frame.
|
||||
int MJpegDecoder::GetWidth() {
|
||||
return decompress_struct_->image_width;
|
||||
}
|
||||
|
||||
// Returns height of the last loaded frame.
|
||||
int MJpegDecoder::GetHeight() {
|
||||
return decompress_struct_->image_height;
|
||||
}
|
||||
|
||||
// Returns format of the last loaded frame. The return value is one of the
|
||||
// kColorSpace* constants.
|
||||
int MJpegDecoder::GetColorSpace() {
|
||||
return decompress_struct_->jpeg_color_space;
|
||||
}
|
||||
|
||||
// Number of color components in the color space.
|
||||
int MJpegDecoder::GetNumComponents() {
|
||||
return decompress_struct_->num_components;
|
||||
}
|
||||
|
||||
// Sample factors of the n-th component.
|
||||
int MJpegDecoder::GetHorizSampFactor(int component) {
|
||||
return decompress_struct_->comp_info[component].h_samp_factor;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetVertSampFactor(int component) {
|
||||
return decompress_struct_->comp_info[component].v_samp_factor;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetHorizSubSampFactor(int component) {
|
||||
return decompress_struct_->max_h_samp_factor /
|
||||
GetHorizSampFactor(component);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetVertSubSampFactor(int component) {
|
||||
return decompress_struct_->max_v_samp_factor /
|
||||
GetVertSampFactor(component);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetImageScanlinesPerImcuRow() {
|
||||
return decompress_struct_->max_v_samp_factor * DCTSIZE;
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
|
||||
int vs = GetVertSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentWidth(int component) {
|
||||
int hs = GetHorizSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetWidth(), hs);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentHeight(int component) {
|
||||
int vs = GetVertSubSampFactor(component);
|
||||
return DivideAndRoundUp(GetHeight(), vs);
|
||||
}
|
||||
|
||||
// Get width in bytes padded out to a multiple of DCTSIZE
|
||||
int MJpegDecoder::GetComponentStride(int component) {
|
||||
return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
|
||||
}
|
||||
|
||||
int MJpegDecoder::GetComponentSize(int component) {
|
||||
return GetComponentWidth(component) * GetComponentHeight(component);
|
||||
}
|
||||
|
||||
LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
|
||||
#ifdef HAVE_SETJMP
|
||||
if (setjmp(error_mgr_->setjmp_buffer)) {
|
||||
// We called jpeg_abort_decompress, it experienced an error, and we called
|
||||
// longjmp() and rewound the stack to here. Return error.
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#endif
|
||||
jpeg_abort_decompress(decompress_struct_);
|
||||
return LIBYUV_TRUE;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
|
||||
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
|
||||
uint8** planes, int dst_width, int dst_height) {
|
||||
if (dst_width != GetWidth() ||
|
||||
dst_height > GetHeight()) {
|
||||
// ERROR: Bad dimensions
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#ifdef HAVE_SETJMP
|
||||
if (setjmp(error_mgr_->setjmp_buffer)) {
|
||||
// We called into jpeglib, it experienced an error sometime during this
|
||||
// function call, and we called longjmp() and rewound the stack to here.
|
||||
// Return error.
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#endif
|
||||
if (!StartDecode()) {
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
SetScanlinePointers(databuf_);
|
||||
int lines_left = dst_height;
|
||||
// Compute amount of lines to skip to implement vertical crop.
|
||||
// TODO(fbarchard): Ensure skip is a multiple of maximum component
|
||||
// subsample. ie 2
|
||||
int skip = (GetHeight() - dst_height) / 2;
|
||||
if (skip > 0) {
|
||||
// There is no API to skip lines in the output data, so we read them
|
||||
// into the temp buffer.
|
||||
while (skip >= GetImageScanlinesPerImcuRow()) {
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
skip -= GetImageScanlinesPerImcuRow();
|
||||
}
|
||||
if (skip > 0) {
|
||||
// Have a partial iMCU row left over to skip. Must read it and then
|
||||
// copy the parts we want into the destination.
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
// TODO(fbarchard): Compute skip to avoid this
|
||||
assert(skip % GetVertSubSampFactor(i) == 0);
|
||||
int rows_to_skip =
|
||||
DivideAndRoundDown(skip, GetVertSubSampFactor(i));
|
||||
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
|
||||
rows_to_skip;
|
||||
int data_to_skip = rows_to_skip * GetComponentStride(i);
|
||||
CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
|
||||
planes[i], GetComponentWidth(i),
|
||||
GetComponentWidth(i), scanlines_to_copy);
|
||||
planes[i] += scanlines_to_copy * GetComponentWidth(i);
|
||||
}
|
||||
lines_left -= (GetImageScanlinesPerImcuRow() - skip);
|
||||
}
|
||||
}
|
||||
|
||||
// Read full MCUs but cropped horizontally
|
||||
for (; lines_left > GetImageScanlinesPerImcuRow();
|
||||
lines_left -= GetImageScanlinesPerImcuRow()) {
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
|
||||
CopyPlane(databuf_[i], GetComponentStride(i),
|
||||
planes[i], GetComponentWidth(i),
|
||||
GetComponentWidth(i), scanlines_to_copy);
|
||||
planes[i] += scanlines_to_copy * GetComponentWidth(i);
|
||||
}
|
||||
}
|
||||
|
||||
if (lines_left > 0) {
|
||||
// Have a partial iMCU row left over to decode.
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
int scanlines_to_copy =
|
||||
DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
|
||||
CopyPlane(databuf_[i], GetComponentStride(i),
|
||||
planes[i], GetComponentWidth(i),
|
||||
GetComponentWidth(i), scanlines_to_copy);
|
||||
planes[i] += scanlines_to_copy * GetComponentWidth(i);
|
||||
}
|
||||
}
|
||||
return FinishDecode();
|
||||
}
|
||||
|
||||
LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
|
||||
int dst_width, int dst_height) {
|
||||
if (dst_width != GetWidth() ||
|
||||
dst_height > GetHeight()) {
|
||||
// ERROR: Bad dimensions
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#ifdef HAVE_SETJMP
|
||||
if (setjmp(error_mgr_->setjmp_buffer)) {
|
||||
// We called into jpeglib, it experienced an error sometime during this
|
||||
// function call, and we called longjmp() and rewound the stack to here.
|
||||
// Return error.
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
#endif
|
||||
if (!StartDecode()) {
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
SetScanlinePointers(databuf_);
|
||||
int lines_left = dst_height;
|
||||
// TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
|
||||
int skip = (GetHeight() - dst_height) / 2;
|
||||
if (skip > 0) {
|
||||
while (skip >= GetImageScanlinesPerImcuRow()) {
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
skip -= GetImageScanlinesPerImcuRow();
|
||||
}
|
||||
if (skip > 0) {
|
||||
// Have a partial iMCU row left over to skip.
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
// TODO(fbarchard): Compute skip to avoid this
|
||||
assert(skip % GetVertSubSampFactor(i) == 0);
|
||||
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
|
||||
int data_to_skip = rows_to_skip * GetComponentStride(i);
|
||||
// Change our own data buffer pointers so we can pass them to the
|
||||
// callback.
|
||||
databuf_[i] += data_to_skip;
|
||||
}
|
||||
int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
|
||||
(*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
|
||||
// Now change them back.
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
|
||||
int data_to_skip = rows_to_skip * GetComponentStride(i);
|
||||
databuf_[i] -= data_to_skip;
|
||||
}
|
||||
lines_left -= scanlines_to_copy;
|
||||
}
|
||||
}
|
||||
// Read full MCUs until we get to the crop point.
|
||||
for (; lines_left >= GetImageScanlinesPerImcuRow();
|
||||
lines_left -= GetImageScanlinesPerImcuRow()) {
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
(*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
|
||||
}
|
||||
if (lines_left > 0) {
|
||||
// Have a partial iMCU row left over to decode.
|
||||
if (!DecodeImcuRow()) {
|
||||
FinishDecode();
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
(*fn)(opaque, databuf_, databuf_strides_, lines_left);
|
||||
}
|
||||
return FinishDecode();
|
||||
}
|
||||
|
||||
void init_source(j_decompress_ptr cinfo) {
|
||||
fill_input_buffer(cinfo);
|
||||
}
|
||||
|
||||
boolean fill_input_buffer(j_decompress_ptr cinfo) {
|
||||
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
|
||||
if (buf_vec->pos >= buf_vec->len) {
|
||||
assert(0 && "No more data");
|
||||
// ERROR: No more data
|
||||
return FALSE;
|
||||
}
|
||||
cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
|
||||
cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
|
||||
++buf_vec->pos;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
|
||||
cinfo->src->next_input_byte += num_bytes;
|
||||
}
|
||||
|
||||
void term_source(j_decompress_ptr cinfo) {
|
||||
// Nothing to do.
|
||||
}
|
||||
|
||||
#ifdef HAVE_SETJMP
|
||||
void ErrorHandler(j_common_ptr cinfo) {
|
||||
// This is called when a jpeglib command experiences an error. Unfortunately
|
||||
// jpeglib's error handling model is not very flexible, because it expects the
|
||||
// error handler to not return--i.e., it wants the program to terminate. To
|
||||
// recover from errors we use setjmp() as shown in their example. setjmp() is
|
||||
// C's implementation for the "call with current continuation" functionality
|
||||
// seen in some functional programming languages.
|
||||
// A formatted message can be output, but is unsafe for release.
|
||||
#ifdef DEBUG
|
||||
char buf[JMSG_LENGTH_MAX];
|
||||
(*cinfo->err->format_message)(cinfo, buf);
|
||||
// ERROR: Error in jpeglib: buf
|
||||
#endif
|
||||
|
||||
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
|
||||
// This rewinds the call stack to the point of the corresponding setjmp()
|
||||
// and causes it to return (for a second time) with value 1.
|
||||
longjmp(mgr->setjmp_buffer, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
|
||||
if (num_outbufs != num_outbufs_) {
|
||||
// We could perhaps optimize this case to resize the output buffers without
|
||||
// necessarily having to delete and recreate each one, but it's not worth
|
||||
// it.
|
||||
DestroyOutputBuffers();
|
||||
|
||||
scanlines_ = new uint8** [num_outbufs];
|
||||
scanlines_sizes_ = new int[num_outbufs];
|
||||
databuf_ = new uint8* [num_outbufs];
|
||||
databuf_strides_ = new int[num_outbufs];
|
||||
|
||||
for (int i = 0; i < num_outbufs; ++i) {
|
||||
scanlines_[i] = NULL;
|
||||
scanlines_sizes_[i] = 0;
|
||||
databuf_[i] = NULL;
|
||||
databuf_strides_[i] = 0;
|
||||
}
|
||||
|
||||
num_outbufs_ = num_outbufs;
|
||||
}
|
||||
}
|
||||
|
||||
void MJpegDecoder::DestroyOutputBuffers() {
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
delete [] scanlines_[i];
|
||||
delete [] databuf_[i];
|
||||
}
|
||||
delete [] scanlines_;
|
||||
delete [] databuf_;
|
||||
delete [] scanlines_sizes_;
|
||||
delete [] databuf_strides_;
|
||||
scanlines_ = NULL;
|
||||
databuf_ = NULL;
|
||||
scanlines_sizes_ = NULL;
|
||||
databuf_strides_ = NULL;
|
||||
num_outbufs_ = 0;
|
||||
}
|
||||
|
||||
// JDCT_IFAST and do_block_smoothing improve performance substantially.
|
||||
LIBYUV_BOOL MJpegDecoder::StartDecode() {
|
||||
decompress_struct_->raw_data_out = TRUE;
|
||||
decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
|
||||
decompress_struct_->dither_mode = JDITHER_NONE;
|
||||
// Not applicable to 'raw':
|
||||
decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
|
||||
// Only for buffered mode:
|
||||
decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
|
||||
// Blocky but fast:
|
||||
decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
|
||||
|
||||
if (!jpeg_start_decompress(decompress_struct_)) {
|
||||
// ERROR: Couldn't start JPEG decompressor";
|
||||
return LIBYUV_FALSE;
|
||||
}
|
||||
return LIBYUV_TRUE;
|
||||
}
|
||||
|
||||
LIBYUV_BOOL MJpegDecoder::FinishDecode() {
|
||||
// jpeglib considers it an error if we finish without decoding the whole
|
||||
// image, so we call "abort" rather than "finish".
|
||||
jpeg_abort_decompress(decompress_struct_);
|
||||
return LIBYUV_TRUE;
|
||||
}
|
||||
|
||||
void MJpegDecoder::SetScanlinePointers(uint8** data) {
|
||||
for (int i = 0; i < num_outbufs_; ++i) {
|
||||
uint8* data_i = data[i];
|
||||
for (int j = 0; j < scanlines_sizes_[i]; ++j) {
|
||||
scanlines_[i][j] = data_i;
|
||||
data_i += GetComponentStride(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
|
||||
return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
|
||||
jpeg_read_raw_data(decompress_struct_,
|
||||
scanlines_,
|
||||
GetImageScanlinesPerImcuRow());
|
||||
}
|
||||
|
||||
// The helper function which recognizes the jpeg sub-sampling type.
|
||||
JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
|
||||
int* subsample_x, int* subsample_y, int number_of_components) {
|
||||
if (number_of_components == 3) { // Color images.
|
||||
if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
|
||||
subsample_x[1] == 2 && subsample_y[1] == 2 &&
|
||||
subsample_x[2] == 2 && subsample_y[2] == 2) {
|
||||
return kJpegYuv420;
|
||||
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
|
||||
subsample_x[1] == 2 && subsample_y[1] == 1 &&
|
||||
subsample_x[2] == 2 && subsample_y[2] == 1) {
|
||||
return kJpegYuv422;
|
||||
} else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
|
||||
subsample_x[1] == 1 && subsample_y[1] == 1 &&
|
||||
subsample_x[2] == 1 && subsample_y[2] == 1) {
|
||||
return kJpegYuv444;
|
||||
}
|
||||
} else if (number_of_components == 1) { // Grey-scale images.
|
||||
if (subsample_x[0] == 1 && subsample_y[0] == 1) {
|
||||
return kJpegYuv400;
|
||||
}
|
||||
}
|
||||
return kJpegUnknown;
|
||||
}
|
||||
|
||||
} // namespace libyuv
|
||||
#endif // HAVE_JPEG
|
||||
|
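A condensed sketch (not part of the import) of the MJpegDecoder calling sequence used by the MJPG paths above: load a compressed frame, size the output from the decoder, then decode into caller-managed planes. Plane allocation is simplified here; a real caller would size each plane from GetComponentWidth() and GetComponentHeight(), and the code only builds under HAVE_JPEG.

// Illustrative sketch only: decode one MJPEG frame into three planes that
// the caller has already sized from the decoder's component geometry.
#include "libyuv/mjpeg_decoder.h"

bool DecodeMjpegSketch(const uint8* jpeg, size_t jpeg_size,
                       uint8* planes[3]) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(jpeg, jpeg_size)) {
    return false;  // Header was not parseable.
  }
  int width = decoder.GetWidth();
  int height = decoder.GetHeight();
  // DecodeToBuffers expects the full frame width and at most the frame height.
  return decoder.DecodeToBuffers(planes, width, height) != 0;
}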
71
libs/libyuv/source/mjpeg_validate.cc
Normal file
@@ -0,0 +1,71 @@
/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/mjpeg_decoder.h"

#include <string.h>  // For memchr.

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Helper function to scan for EOI marker (0xff 0xd9).
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
  if (sample_size >= 2) {
    const uint8* end = sample + sample_size - 1;
    const uint8* it = sample;
    while (it < end) {
      // TODO(fbarchard): scan for 0xd9 instead.
      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
      if (it == NULL) {
        break;
      }
      if (it[1] == 0xd9) {
        return LIBYUV_TRUE;  // Success: Valid jpeg.
      }
      ++it;  // Skip over current 0xff.
    }
  }
  // ERROR: Invalid jpeg end code not found. Size sample_size
  return LIBYUV_FALSE;
}

// Helper function to validate the jpeg appears intact.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
  // Maximum size that ValidateJpeg will consider valid.
  const size_t kMaxJpegSize = 0x7fffffffull;
  const size_t kBackSearchSize = 1024;
  if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
    // ERROR: Invalid jpeg size: sample_size
    return LIBYUV_FALSE;
  }
  if (sample[0] != 0xff || sample[1] != 0xd8) {  // SOI marker
    // ERROR: Invalid jpeg initial start code
    return LIBYUV_FALSE;
  }

  // Look for the End Of Image (EOI) marker near the end of the buffer.
  if (sample_size > kBackSearchSize) {
    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
      return LIBYUV_TRUE;  // Success: Valid jpeg.
    }
    // Reduce search size for forward search.
    sample_size = sample_size - kBackSearchSize + 1;
  }
  // Step over SOI marker and scan for EOI.
  return ScanEOI(sample + 2, sample_size - 2);
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
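ValidateJpeg is the cheap pre-flight check used before a full decode. A tiny sketch (not part of the import) of guarding an incoming MJPG sample before handing it to the heavier conversion paths; the wrapper name is illustrative, and ValidateJpeg is assumed to be declared in mjpeg_decoder.h as the .cc above implies.

// Illustrative sketch only: reject obviously truncated MJPG samples before
// attempting a full decode or conversion.
#include "libyuv/mjpeg_decoder.h"  // ValidateJpeg is assumed declared here.

bool LooksLikeWholeJpeg(const uint8* sample, size_t sample_size) {
  // Requires an SOI marker up front and an EOI marker found by the back
  // search or the forward scan described above.
  return libyuv::ValidateJpeg(sample, sample_size) != 0;
}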
2629
libs/libyuv/source/planar_functions.cc
Normal file
File diff suppressed because it is too large
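The suppressed planar_functions.cc diff carries, among other things, CopyPlane, which mjpeg_decoder.cc above uses for its row copies. A sketch (not part of the import) of the call shape, inferred only from those call sites (src, src_stride, dst, dst_stride, width, height); the wrapper name is illustrative.

// Illustrative sketch only: copy a tightly packed 8-bit plane using the same
// argument order as the CopyPlane calls in mjpeg_decoder.cc above.
#include "libyuv/planar_functions.h"

void CopyPackedPlaneSketch(const uint8* src, uint8* dst,
                           int width, int height) {
  libyuv::CopyPlane(src, width, dst, width, width, height);
}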
491
libs/libyuv/source/rotate.cc
Normal file
@@ -0,0 +1,491 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate.h"
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/convert.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
LIBYUV_API
|
||||
void TransposePlane(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
int i = height;
|
||||
void (*TransposeWx8)(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
|
||||
#if defined(HAS_TRANSPOSEWX8_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
TransposeWx8 = TransposeWx8_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
TransposeWx8 = TransposeWx8_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
TransposeWx8 = TransposeWx8_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
TransposeWx8 = TransposeWx8_Fast_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEWX8_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2)) {
|
||||
if (IS_ALIGNED(width, 4) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
TransposeWx8 = TransposeWx8_Fast_DSPR2;
|
||||
} else {
|
||||
TransposeWx8 = TransposeWx8_DSPR2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Work across the source in 8x8 tiles
|
||||
while (i >= 8) {
|
||||
TransposeWx8(src, src_stride, dst, dst_stride, width);
|
||||
src += 8 * src_stride; // Go down 8 rows.
|
||||
dst += 8; // Move over 8 columns.
|
||||
i -= 8;
|
||||
}
|
||||
|
||||
if (i > 0) {
|
||||
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void RotatePlane90(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
// Rotate by 90 is a transpose with the source read
|
||||
// from bottom to top. So set the source pointer to the end
|
||||
// of the buffer and flip the sign of the source stride.
|
||||
src += src_stride * (height - 1);
|
||||
src_stride = -src_stride;
|
||||
TransposePlane(src, src_stride, dst, dst_stride, width, height);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void RotatePlane270(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
// Rotate by 270 is a transpose with the destination written
|
||||
// from bottom to top. So set the destination pointer to the end
|
||||
// of the buffer and flip the sign of the destination stride.
|
||||
dst += dst_stride * (width - 1);
|
||||
dst_stride = -dst_stride;
|
||||
TransposePlane(src, src_stride, dst, dst_stride, width, height);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void RotatePlane180(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height) {
|
||||
// Swap first and last row and mirror the content. Uses a temporary row.
|
||||
align_buffer_64(row, width);
|
||||
const uint8* src_bot = src + src_stride * (height - 1);
|
||||
uint8* dst_bot = dst + dst_stride * (height - 1);
|
||||
int half_height = (height + 1) >> 1;
|
||||
int y;
|
||||
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
|
||||
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
|
||||
#if defined(HAS_MIRRORROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MirrorRow = MirrorRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
MirrorRow = MirrorRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
MirrorRow = MirrorRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
MirrorRow = MirrorRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
MirrorRow = MirrorRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MirrorRow = MirrorRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO(fbarchard): Mirror on mips handle unaligned memory.
|
||||
#if defined(HAS_MIRRORROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
|
||||
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
|
||||
MirrorRow = MirrorRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
if (TestCpuFlag(kCpuHasERMS)) {
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
if (TestCpuFlag(kCpuHasMIPS)) {
|
||||
CopyRow = CopyRow_MIPS;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Odd height will harmlessly mirror the middle row twice.
|
||||
for (y = 0; y < half_height; ++y) {
|
||||
MirrorRow(src, row, width); // Mirror first row into a buffer
|
||||
src += src_stride;
|
||||
MirrorRow(src_bot, dst, width); // Mirror last row into first row
|
||||
dst += dst_stride;
|
||||
CopyRow(row, dst_bot, width); // Copy first mirrored row into last
|
||||
src_bot -= src_stride;
|
||||
dst_bot -= dst_stride;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void TransposeUV(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
int i = height;
|
||||
void (*TransposeUVWx8)(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) = TransposeUVWx8_C;
|
||||
#if defined(HAS_TRANSPOSEUVWX8_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_DSPR2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Work through the source in 8x8 tiles.
|
||||
while (i >= 8) {
|
||||
TransposeUVWx8(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width);
|
||||
src += 8 * src_stride; // Go down 8 rows.
|
||||
dst_a += 8; // Move over 8 columns.
|
||||
dst_b += 8; // Move over 8 columns.
|
||||
i -= 8;
|
||||
}
|
||||
|
||||
if (i > 0) {
|
||||
TransposeUVWxH_C(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width, i);
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void RotateUV90(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
src += src_stride * (height - 1);
|
||||
src_stride = -src_stride;
|
||||
|
||||
TransposeUV(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width, height);
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
void RotateUV270(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
dst_a += dst_stride_a * (width - 1);
|
||||
dst_b += dst_stride_b * (width - 1);
|
||||
dst_stride_a = -dst_stride_a;
|
||||
dst_stride_b = -dst_stride_b;
|
||||
|
||||
TransposeUV(src, src_stride,
|
||||
dst_a, dst_stride_a,
|
||||
dst_b, dst_stride_b,
|
||||
width, height);
|
||||
}
|
||||
|
||||
// Rotate 180 is a horizontal and vertical flip.
|
||||
LIBYUV_API
|
||||
void RotateUV180(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width, int height) {
|
||||
int i;
|
||||
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
|
||||
MirrorUVRow_C;
|
||||
#if defined(HAS_MIRRORUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
|
||||
MirrorUVRow = MirrorUVRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
|
||||
MirrorUVRow = MirrorUVRow_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORUVROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) &&
|
||||
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
MirrorUVRow = MirrorUVRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
|
||||
dst_a += dst_stride_a * (height - 1);
|
||||
dst_b += dst_stride_b * (height - 1);
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
MirrorUVRow(src, dst_a, dst_b, width);
|
||||
src += src_stride;
|
||||
dst_a -= dst_stride_a;
|
||||
dst_b -= dst_stride_b;
|
||||
}
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
int RotatePlane(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width, int height,
|
||||
enum RotationMode mode) {
|
||||
if (!src || width <= 0 || height == 0 || !dst) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src = src + (height - 1) * src_stride;
|
||||
src_stride = -src_stride;
|
||||
}
|
||||
|
||||
switch (mode) {
|
||||
case kRotate0:
|
||||
// copy frame
|
||||
CopyPlane(src, src_stride,
|
||||
dst, dst_stride,
|
||||
width, height);
|
||||
return 0;
|
||||
case kRotate90:
|
||||
RotatePlane90(src, src_stride,
|
||||
dst, dst_stride,
|
||||
width, height);
|
||||
return 0;
|
||||
case kRotate270:
|
||||
RotatePlane270(src, src_stride,
|
||||
dst, dst_stride,
|
||||
width, height);
|
||||
return 0;
|
||||
case kRotate180:
|
||||
RotatePlane180(src, src_stride,
|
||||
dst, dst_stride,
|
||||
width, height);
|
||||
return 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

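// Usage sketch (illustrative only, not part of the library source): rotating a
// whole I420 frame. The contiguous Y/U/V buffer layout below is an assumption
// for the example; the API itself only takes per-plane pointers and strides.
// The chroma planes are (width+1)/2 x (height+1)/2, and after kRotate90 every
// destination stride follows the rotated geometry.
#include <cstdint>
#include <vector>
#include "libyuv/rotate.h"

static int RotateI420By90(const uint8_t* src, int width, int height,
                          std::vector<uint8_t>* dst) {
  const int hw = (width + 1) / 2;   // chroma width
  const int hh = (height + 1) / 2;  // chroma height
  const uint8_t* src_y = src;
  const uint8_t* src_u = src_y + width * height;
  const uint8_t* src_v = src_u + hw * hh;

  dst->resize(static_cast<size_t>(width) * height + 2 * hw * hh);
  uint8_t* dst_y = dst->data();
  uint8_t* dst_u = dst_y + width * height;
  uint8_t* dst_v = dst_u + hw * hh;

  // Rotated luma is height x width (stride = height); rotated chroma is
  // hh x hw (stride = hh).
  return libyuv::I420Rotate(src_y, width, src_u, hw, src_v, hw,
                            dst_y, height, dst_u, hh, dst_v, hh,
                            width, height, libyuv::kRotate90);
}
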
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

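// Usage sketch (illustrative only, not part of the library source): converting
// an NV12 capture buffer to I420 while rotating it to portrait in one pass.
// The packed NV12 layout (full-size Y plane followed by an interleaved UV
// plane) and the helper name are assumptions for the example. The RotateUV*
// helpers de-interleave the UV pairs while rotating, which is why the output
// is already planar I420.
#include <cstdint>
#include <vector>
#include "libyuv/rotate.h"

static int Nv12ToI420Portrait(const uint8_t* nv12, int width, int height,
                              std::vector<uint8_t>* i420) {
  const int hw = (width + 1) / 2;
  const int hh = (height + 1) / 2;
  const uint8_t* src_y = nv12;
  const uint8_t* src_uv = nv12 + width * height;  // UVUV... pairs, hw per row

  i420->resize(static_cast<size_t>(width) * height + 2 * hw * hh);
  uint8_t* dst_y = i420->data();
  uint8_t* dst_u = dst_y + width * height;
  uint8_t* dst_v = dst_u + hw * hh;

  return libyuv::NV12ToI420Rotate(src_y, width,    // Y plane
                                  src_uv, hw * 2,  // interleaved UV plane
                                  dst_y, height, dst_u, hh, dst_v, hh,
                                  width, height, libyuv::kRotate90);
}
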
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
80
libs/libyuv/source/rotate_any.cc
Normal file
@@ -0,0 +1,80 @@
/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
    void NAMEANY(const uint8* src, int src_stride,                            \
                 uint8* dst, int dst_stride, int width) {                     \
      int r = width & MASK;                                                   \
      int n = width - r;                                                      \
      if (n > 0) {                                                            \
        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                       \
      }                                                                       \
      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
    }

#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#endif
#undef TANY

#define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                      \
    void NAMEANY(const uint8* src, int src_stride,                            \
                 uint8* dst_a, int dst_stride_a,                              \
                 uint8* dst_b, int dst_stride_b, int width) {                 \
      int r = width & MASK;                                                   \
      int n = width - r;                                                      \
      if (n > 0) {                                                            \
        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,  \
                  n);                                                         \
      }                                                                       \
      TransposeUVWx8_C(src + n * 2, src_stride,                               \
                       dst_a + n * dst_stride_a, dst_stride_a,                \
                       dst_b + n * dst_stride_b, dst_stride_b, r);            \
    }

#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
#undef TUVANY

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
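// The TANY/TUVANY wrappers implement libyuv's usual "any width" pattern: run
// the SIMD kernel on the largest width it can handle (width rounded down to
// its block size), then let the C kernel finish the leftover columns. For
// reference, this is what TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
// expands to (hand-expanded for illustration; the macro generates it, so this
// is not an extra definition in the file):
//
//   void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
//                              uint8* dst, int dst_stride, int width) {
//     int r = width & 7;   // columns left after the 8-wide NEON blocks
//     int n = width - r;   // largest multiple of 8 the NEON kernel handles
//     if (n > 0) {
//       TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
//     }
//     // Each transposed source column becomes a destination row, so the
//     // remaining r columns start at dst + n * dst_stride.
//     TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
//   }
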
205
libs/libyuv/source/rotate_argb.cc
Normal file
@@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/rotate.h"
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/convert.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// ARGBScale has a function to copy pixels to a row, striding each source
|
||||
// pixel by a constant.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || \
|
||||
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
#endif
|
||||
|
||||
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
|
||||
int src_stepx, uint8* dst_ptr, int dst_width);
|
||||
|
||||
static void ARGBTranspose(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
int i;
|
||||
int src_pixel_step = src_stride >> 2;
|
||||
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
|
||||
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i = 0; i < width; ++i) { // column of source to row of dest.
|
||||
ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
|
||||
dst += dst_stride;
|
||||
src += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBRotate90(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Rotate by 90 is an ARGBTranspose with the source read
|
||||
// from bottom to top. So set the source pointer to the end
|
||||
// of the buffer and flip the sign of the source stride.
|
||||
src += src_stride * (height - 1);
|
||||
src_stride = -src_stride;
|
||||
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
|
||||
}
|
||||
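// The ARGBRotate90 trick above is the general identity
// rotate90_clockwise(M) = transpose(flip_vertical(M)): point the source at its
// last row, negate the stride, and transpose. A standalone scalar sketch of
// the same index math on a 1-byte-per-pixel plane (illustration only, not used
// by the library):
static void Rotate90Scalar(const unsigned char* src, int src_stride,
                           unsigned char* dst, int dst_stride,
                           int width, int height) {
  // Walk source rows bottom-up while transposing, exactly as ARGBRotate90
  // does before handing off to ARGBTranspose.
  const unsigned char* bottom = src + (height - 1) * src_stride;
  for (int x = 0; x < width; ++x) {     // each source column...
    for (int y = 0; y < height; ++y) {  // ...becomes one destination row
      dst[x * dst_stride + y] = bottom[x - y * src_stride];
    }
  }
}
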
|
||||
void ARGBRotate270(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Rotate by 270 is an ARGBTranspose with the destination written
|
||||
// from bottom to top. So set the destination pointer to the end
|
||||
// of the buffer and flip the sign of the destination stride.
|
||||
dst += dst_stride * (width - 1);
|
||||
dst_stride = -dst_stride;
|
||||
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
|
||||
}
|
||||
|
||||
void ARGBRotate180(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width, int height) {
|
||||
// Swap first and last row and mirror the content. Uses a temporary row.
|
||||
align_buffer_64(row, width * 4);
|
||||
const uint8* src_bot = src + src_stride * (height - 1);
|
||||
uint8* dst_bot = dst + dst_stride * (height - 1);
|
||||
int half_height = (height + 1) >> 1;
|
||||
int y;
|
||||
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
|
||||
ARGBMirrorRow_C;
|
||||
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
|
||||
#if defined(HAS_ARGBMIRRORROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBMIRRORROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBMirrorRow = ARGBMirrorRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
if (TestCpuFlag(kCpuHasERMS)) {
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
if (TestCpuFlag(kCpuHasMIPS)) {
|
||||
CopyRow = CopyRow_MIPS;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Odd height will harmlessly mirror the middle row twice.
|
||||
for (y = 0; y < half_height; ++y) {
|
||||
ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
|
||||
ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
|
||||
CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
src_bot -= src_stride;
|
||||
dst_bot -= dst_stride;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
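// The ARGBRotate180 structure above, written out for a plain byte plane
// (standalone sketch, illustration only; it assumes src and dst are distinct
// buffers and uses malloc where the library uses an aligned scratch buffer).
// One temporary row holds the mirrored top row so the mirrored bottom row can
// be written first; with an odd height the middle row is processed twice,
// which is harmless.
#include <stdlib.h>
#include <string.h>

static void Rotate180Plane(const unsigned char* src, int src_stride,
                           unsigned char* dst, int dst_stride,
                           int width, int height) {
  unsigned char* row = (unsigned char*)malloc(width);
  const unsigned char* src_bot = src + (height - 1) * src_stride;
  unsigned char* dst_bot = dst + (height - 1) * dst_stride;
  int half_height = (height + 1) / 2;
  for (int y = 0; y < half_height; ++y) {
    for (int x = 0; x < width; ++x) {
      row[x] = src[width - 1 - x];      // mirror top row into the scratch row
      dst[x] = src_bot[width - 1 - x];  // mirrored bottom row -> top of dst
    }
    memcpy(dst_bot, row, width);        // mirrored top row -> bottom of dst
    src += src_stride;
    dst += dst_stride;
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
  free(row);
}
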
|
||||
LIBYUV_API
|
||||
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_argb, int dst_stride_argb, int width, int height,
|
||||
enum RotationMode mode) {
|
||||
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_argb = src_argb + (height - 1) * src_stride_argb;
|
||||
src_stride_argb = -src_stride_argb;
|
||||
}
|
||||
|
||||
switch (mode) {
|
||||
case kRotate0:
|
||||
// copy frame
|
||||
return ARGBCopy(src_argb, src_stride_argb,
|
||||
dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
case kRotate90:
|
||||
ARGBRotate90(src_argb, src_stride_argb,
|
||||
dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
return 0;
|
||||
case kRotate270:
|
||||
ARGBRotate270(src_argb, src_stride_argb,
|
||||
dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
return 0;
|
||||
case kRotate180:
|
||||
ARGBRotate180(src_argb, src_stride_argb,
|
||||
dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
return 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
92
libs/libyuv/source/rotate_common.cc
Normal file
@@ -0,0 +1,92 @@
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

void TransposeWx8_C(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

void TransposeUVWx8_C(const uint8* src, int src_stride,
                      uint8* dst_a, int dst_stride_a,
                      uint8* dst_b, int dst_stride_b, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

void TransposeWxH_C(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i;
  for (i = 0; i < width; ++i) {
    int j;
    for (j = 0; j < height; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}

void TransposeUVWxH_C(const uint8* src, int src_stride,
                      uint8* dst_a, int dst_stride_a,
                      uint8* dst_b, int dst_stride_b,
                      int width, int height) {
  int i;
  for (i = 0; i < width * 2; i += 2) {
    int j;
    for (j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
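// These C routines double as a readable specification for the SIMD kernels:
// TransposeWx8_C consumes an 8-row strip of the source and emits it as 8-byte
// columns of the destination, so a whole-plane transpose is just a loop over
// such strips. A simplified sketch of that driver follows (the real
// TransposePlane in rotate.cc has the same shape but also plugs in the SIMD
// variants; the function below is illustrative only):
static void TransposePlaneSketch(const uint8* src, int src_stride,
                                 uint8* dst, int dst_stride,
                                 int width, int height) {
  int i = height;
  while (i >= 8) {
    // 8 source rows become destination columns 0..7 of this block.
    TransposeWx8_C(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // next strip of 8 source rows
    dst += 8;               // next block of 8 destination columns
    i -= 8;
  }
  if (i > 0) {
    // Fewer than 8 rows remain; the general WxH transposer finishes them.
    TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
  }
}
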
368
libs/libyuv/source/rotate_gcc.cc
Normal file
@@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC x86 and x64.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
|
||||
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
|
||||
#if defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movq (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"movq (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movq (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"movq (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movq (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"lea 0x8(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
|
||||
|
||||
// Transpose 16x8. 64 bit
|
||||
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm9 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9 \n"
|
||||
"movdqu (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm10 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm10 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movdqa %%xmm10,%%xmm11 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"movdqu (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm12 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm12 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movdqa %%xmm12,%%xmm13 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movdqu (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm14 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"punpckhbw %%xmm7,%%xmm14 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"movdqa %%xmm14,%%xmm15 \n"
|
||||
"lea 0x10(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"punpcklwd %%xmm10,%%xmm8 \n"
|
||||
"punpcklwd %%xmm11,%%xmm9 \n"
|
||||
"movdqa %%xmm8,%%xmm10 \n"
|
||||
"movdqa %%xmm9,%%xmm11 \n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"punpcklwd %%xmm14,%%xmm12 \n"
|
||||
"punpcklwd %%xmm15,%%xmm13 \n"
|
||||
"movdqa %%xmm12,%%xmm14 \n"
|
||||
"movdqa %%xmm13,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm12,%%xmm8 \n"
|
||||
"movq %%xmm8,(%1) \n"
|
||||
"movdqa %%xmm8,%%xmm12 \n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12 \n"
|
||||
"movq %%xmm12,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm14,%%xmm10 \n"
|
||||
"movdqa %%xmm10,%%xmm14 \n"
|
||||
"movq %%xmm10,(%1) \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"punpckldq %%xmm13,%%xmm9 \n"
|
||||
"movq %%xmm14,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm9,%%xmm13 \n"
|
||||
"movq %%xmm9,(%1) \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movq %%xmm13,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm15,%%xmm11 \n"
|
||||
"movq %%xmm11,(%1) \n"
|
||||
"movdqa %%xmm11,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movq %%xmm15,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
|
||||
);
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
|
||||
|
||||
// Transpose UV 8x8. 64 bit.
|
||||
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b, int width) {
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu (%0,%4),%%xmm1 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm1 \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"movdqu (%0,%4),%%xmm3 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm4 \n"
|
||||
"movdqu (%0,%4),%%xmm5 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm5 \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu (%0,%4),%%xmm7 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm8 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %4 \n"
|
||||
"lea 0x10(%0,%4,8),%0 \n"
|
||||
"punpckhbw %%xmm7,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm7 \n"
|
||||
"neg %4 \n"
|
||||
// Second round of bit swap.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"movdqa %%xmm1,%%xmm9 \n"
|
||||
"punpckhwd %%xmm2,%%xmm8 \n"
|
||||
"punpckhwd %%xmm3,%%xmm9 \n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm2 \n"
|
||||
"movdqa %%xmm9,%%xmm3 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"movdqa %%xmm5,%%xmm9 \n"
|
||||
"punpckhwd %%xmm6,%%xmm8 \n"
|
||||
"punpckhwd %%xmm7,%%xmm9 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm8,%%xmm6 \n"
|
||||
"movdqa %%xmm9,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2) \n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movlpd %%xmm2,(%1) \n"
|
||||
"movhpd %%xmm2,(%2) \n"
|
||||
"punpckhdq %%xmm6,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm1,%%xmm8 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movlpd %%xmm1,(%1) \n"
|
||||
"movhpd %%xmm1,(%2) \n"
|
||||
"punpckhdq %%xmm5,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm3,%%xmm8 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movlpd %%xmm3,(%1) \n"
|
||||
"movhpd %%xmm3,(%2) \n"
|
||||
"punpckhdq %%xmm7,%%xmm8 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"((intptr_t)(src_stride)), // %4
|
||||
"r"((intptr_t)(dst_stride_a)), // %5
|
||||
"r"((intptr_t)(dst_stride_b)) // %6
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
|
||||
"xmm8", "xmm9"
|
||||
);
|
||||
}
|
||||
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
484
libs/libyuv/source/rotate_mips.cc
Normal file
@@ -0,0 +1,484 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && \
|
||||
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32)
|
||||
|
||||
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
|
||||
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
|
||||
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
|
||||
"addu $t3, $t2, %[src_stride] \n"
|
||||
"addu $t5, $t4, %[src_stride] \n"
|
||||
"addu $t6, $t2, $t4 \n"
|
||||
"andi $t0, %[dst], 0x3 \n"
|
||||
"andi $t1, %[dst_stride], 0x3 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"bnez $t0, 11f \n"
|
||||
" subu $t7, $t9, %[src_stride] \n"
|
||||
//dst + dst_stride word aligned
|
||||
"1: \n"
|
||||
"lbu $t0, 0(%[src]) \n"
|
||||
"lbux $t1, %[src_stride](%[src]) \n"
|
||||
"lbux $t8, $t2(%[src]) \n"
|
||||
"lbux $t9, $t3(%[src]) \n"
|
||||
"sll $t1, $t1, 16 \n"
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"or $t8, $t8, $t9 \n"
|
||||
"precr.qb.ph $s0, $t8, $t0 \n"
|
||||
"lbux $t0, $t4(%[src]) \n"
|
||||
"lbux $t1, $t5(%[src]) \n"
|
||||
"lbux $t8, $t6(%[src]) \n"
|
||||
"lbux $t9, $t7(%[src]) \n"
|
||||
"sll $t1, $t1, 16 \n"
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"or $t8, $t8, $t9 \n"
|
||||
"precr.qb.ph $s1, $t8, $t0 \n"
|
||||
"sw $s0, 0(%[dst]) \n"
|
||||
"addiu %[width], -1 \n"
|
||||
"addiu %[src], 1 \n"
|
||||
"sw $s1, 4(%[dst]) \n"
|
||||
"bnez %[width], 1b \n"
|
||||
" addu %[dst], %[dst], %[dst_stride] \n"
|
||||
"b 2f \n"
|
||||
//dst + dst_stride unaligned
|
||||
"11: \n"
|
||||
"lbu $t0, 0(%[src]) \n"
|
||||
"lbux $t1, %[src_stride](%[src]) \n"
|
||||
"lbux $t8, $t2(%[src]) \n"
|
||||
"lbux $t9, $t3(%[src]) \n"
|
||||
"sll $t1, $t1, 16 \n"
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"or $t8, $t8, $t9 \n"
|
||||
"precr.qb.ph $s0, $t8, $t0 \n"
|
||||
"lbux $t0, $t4(%[src]) \n"
|
||||
"lbux $t1, $t5(%[src]) \n"
|
||||
"lbux $t8, $t6(%[src]) \n"
|
||||
"lbux $t9, $t7(%[src]) \n"
|
||||
"sll $t1, $t1, 16 \n"
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"or $t8, $t8, $t9 \n"
|
||||
"precr.qb.ph $s1, $t8, $t0 \n"
|
||||
"swr $s0, 0(%[dst]) \n"
|
||||
"swl $s0, 3(%[dst]) \n"
|
||||
"addiu %[width], -1 \n"
|
||||
"addiu %[src], 1 \n"
|
||||
"swr $s1, 4(%[dst]) \n"
|
||||
"swl $s1, 7(%[dst]) \n"
|
||||
"bnez %[width], 11b \n"
|
||||
"addu %[dst], %[dst], %[dst_stride] \n"
|
||||
"2: \n"
|
||||
".set pop \n"
|
||||
:[src] "+r" (src),
|
||||
[dst] "+r" (dst),
|
||||
[width] "+r" (width)
|
||||
:[src_stride] "r" (src_stride),
|
||||
[dst_stride] "r" (dst_stride)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9",
|
||||
"s0", "s1"
|
||||
);
|
||||
}
|
||||
|
||||
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set noat \n"
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"beqz %[width], 2f \n"
|
||||
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
|
||||
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
|
||||
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
|
||||
"addu $t3, $t2, %[src_stride] \n"
|
||||
"addu $t5, $t4, %[src_stride] \n"
|
||||
"addu $t6, $t2, $t4 \n"
|
||||
|
||||
"srl $AT, %[width], 0x2 \n"
|
||||
"andi $t0, %[dst], 0x3 \n"
|
||||
"andi $t1, %[dst_stride], 0x3 \n"
|
||||
"or $t0, $t0, $t1 \n"
|
||||
"bnez $t0, 11f \n"
|
||||
" subu $t7, $t9, %[src_stride] \n"
|
||||
//dst + dst_stride word aligned
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src]) \n"
|
||||
"lwx $t1, %[src_stride](%[src]) \n"
|
||||
"lwx $t8, $t2(%[src]) \n"
|
||||
"lwx $t9, $t3(%[src]) \n"
|
||||
|
||||
// t0 = | 30 | 20 | 10 | 00 |
|
||||
// t1 = | 31 | 21 | 11 | 01 |
|
||||
// t8 = | 32 | 22 | 12 | 02 |
|
||||
// t9 = | 33 | 23 | 13 | 03 |
|
||||
|
||||
"precr.qb.ph $s0, $t1, $t0 \n"
|
||||
"precr.qb.ph $s1, $t9, $t8 \n"
|
||||
"precrq.qb.ph $s2, $t1, $t0 \n"
|
||||
"precrq.qb.ph $s3, $t9, $t8 \n"
|
||||
|
||||
// s0 = | 21 | 01 | 20 | 00 |
|
||||
// s1 = | 23 | 03 | 22 | 02 |
|
||||
// s2 = | 31 | 11 | 30 | 10 |
|
||||
// s3 = | 33 | 13 | 32 | 12 |
|
||||
|
||||
"precr.qb.ph $s4, $s1, $s0 \n"
|
||||
"precrq.qb.ph $s5, $s1, $s0 \n"
|
||||
"precr.qb.ph $s6, $s3, $s2 \n"
|
||||
"precrq.qb.ph $s7, $s3, $s2 \n"
|
||||
|
||||
// s4 = | 03 | 02 | 01 | 00 |
|
||||
// s5 = | 23 | 22 | 21 | 20 |
|
||||
// s6 = | 13 | 12 | 11 | 10 |
|
||||
// s7 = | 33 | 32 | 31 | 30 |
|
||||
|
||||
"lwx $t0, $t4(%[src]) \n"
|
||||
"lwx $t1, $t5(%[src]) \n"
|
||||
"lwx $t8, $t6(%[src]) \n"
|
||||
"lwx $t9, $t7(%[src]) \n"
|
||||
|
||||
// t0 = | 34 | 24 | 14 | 04 |
|
||||
// t1 = | 35 | 25 | 15 | 05 |
|
||||
// t8 = | 36 | 26 | 16 | 06 |
|
||||
// t9 = | 37 | 27 | 17 | 07 |
|
||||
|
||||
"precr.qb.ph $s0, $t1, $t0 \n"
|
||||
"precr.qb.ph $s1, $t9, $t8 \n"
|
||||
"precrq.qb.ph $s2, $t1, $t0 \n"
|
||||
"precrq.qb.ph $s3, $t9, $t8 \n"
|
||||
|
||||
// s0 = | 25 | 05 | 24 | 04 |
|
||||
// s1 = | 27 | 07 | 26 | 06 |
|
||||
// s2 = | 35 | 15 | 34 | 14 |
|
||||
// s3 = | 37 | 17 | 36 | 16 |
|
||||
|
||||
"precr.qb.ph $t0, $s1, $s0 \n"
|
||||
"precrq.qb.ph $t1, $s1, $s0 \n"
|
||||
"precr.qb.ph $t8, $s3, $s2 \n"
|
||||
"precrq.qb.ph $t9, $s3, $s2 \n"
|
||||
|
||||
// t0 = | 07 | 06 | 05 | 04 |
|
||||
// t1 = | 27 | 26 | 25 | 24 |
|
||||
// t8 = | 17 | 16 | 15 | 14 |
|
||||
// t9 = | 37 | 36 | 35 | 34 |
|
||||
|
||||
"addu $s0, %[dst], %[dst_stride] \n"
|
||||
"addu $s1, $s0, %[dst_stride] \n"
|
||||
"addu $s2, $s1, %[dst_stride] \n"
|
||||
|
||||
"sw $s4, 0(%[dst]) \n"
|
||||
"sw $t0, 4(%[dst]) \n"
|
||||
"sw $s6, 0($s0) \n"
|
||||
"sw $t8, 4($s0) \n"
|
||||
"sw $s5, 0($s1) \n"
|
||||
"sw $t1, 4($s1) \n"
|
||||
"sw $s7, 0($s2) \n"
|
||||
"sw $t9, 4($s2) \n"
|
||||
|
||||
"addiu $AT, -1 \n"
|
||||
"addiu %[src], 4 \n"
|
||||
|
||||
"bnez $AT, 1b \n"
|
||||
" addu %[dst], $s2, %[dst_stride] \n"
|
||||
"b 2f \n"
|
||||
//dst + dst_stride unaligned
|
||||
"11: \n"
|
||||
"lw $t0, 0(%[src]) \n"
|
||||
"lwx $t1, %[src_stride](%[src]) \n"
|
||||
"lwx $t8, $t2(%[src]) \n"
|
||||
"lwx $t9, $t3(%[src]) \n"
|
||||
|
||||
// t0 = | 30 | 20 | 10 | 00 |
|
||||
// t1 = | 31 | 21 | 11 | 01 |
|
||||
// t8 = | 32 | 22 | 12 | 02 |
|
||||
// t9 = | 33 | 23 | 13 | 03 |
|
||||
|
||||
"precr.qb.ph $s0, $t1, $t0 \n"
|
||||
"precr.qb.ph $s1, $t9, $t8 \n"
|
||||
"precrq.qb.ph $s2, $t1, $t0 \n"
|
||||
"precrq.qb.ph $s3, $t9, $t8 \n"
|
||||
|
||||
// s0 = | 21 | 01 | 20 | 00 |
|
||||
// s1 = | 23 | 03 | 22 | 02 |
|
||||
// s2 = | 31 | 11 | 30 | 10 |
|
||||
// s3 = | 33 | 13 | 32 | 12 |
|
||||
|
||||
"precr.qb.ph $s4, $s1, $s0 \n"
|
||||
"precrq.qb.ph $s5, $s1, $s0 \n"
|
||||
"precr.qb.ph $s6, $s3, $s2 \n"
|
||||
"precrq.qb.ph $s7, $s3, $s2 \n"
|
||||
|
||||
// s4 = | 03 | 02 | 01 | 00 |
|
||||
// s5 = | 23 | 22 | 21 | 20 |
|
||||
// s6 = | 13 | 12 | 11 | 10 |
|
||||
// s7 = | 33 | 32 | 31 | 30 |
|
||||
|
||||
"lwx $t0, $t4(%[src]) \n"
|
||||
"lwx $t1, $t5(%[src]) \n"
|
||||
"lwx $t8, $t6(%[src]) \n"
|
||||
"lwx $t9, $t7(%[src]) \n"
|
||||
|
||||
// t0 = | 34 | 24 | 14 | 04 |
|
||||
// t1 = | 35 | 25 | 15 | 05 |
|
||||
// t8 = | 36 | 26 | 16 | 06 |
|
||||
// t9 = | 37 | 27 | 17 | 07 |
|
||||
|
||||
"precr.qb.ph $s0, $t1, $t0 \n"
|
||||
"precr.qb.ph $s1, $t9, $t8 \n"
|
||||
"precrq.qb.ph $s2, $t1, $t0 \n"
|
||||
"precrq.qb.ph $s3, $t9, $t8 \n"
|
||||
|
||||
// s0 = | 25 | 05 | 24 | 04 |
|
||||
// s1 = | 27 | 07 | 26 | 06 |
|
||||
// s2 = | 35 | 15 | 34 | 14 |
|
||||
// s3 = | 37 | 17 | 36 | 16 |
|
||||
|
||||
"precr.qb.ph $t0, $s1, $s0 \n"
|
||||
"precrq.qb.ph $t1, $s1, $s0 \n"
|
||||
"precr.qb.ph $t8, $s3, $s2 \n"
|
||||
"precrq.qb.ph $t9, $s3, $s2 \n"
|
||||
|
||||
// t0 = | 07 | 06 | 05 | 04 |
|
||||
// t1 = | 27 | 26 | 25 | 24 |
|
||||
// t8 = | 17 | 16 | 15 | 14 |
|
||||
// t9 = | 37 | 36 | 35 | 34 |
|
||||
|
||||
"addu $s0, %[dst], %[dst_stride] \n"
|
||||
"addu $s1, $s0, %[dst_stride] \n"
|
||||
"addu $s2, $s1, %[dst_stride] \n"
|
||||
|
||||
"swr $s4, 0(%[dst]) \n"
|
||||
"swl $s4, 3(%[dst]) \n"
|
||||
"swr $t0, 4(%[dst]) \n"
|
||||
"swl $t0, 7(%[dst]) \n"
|
||||
"swr $s6, 0($s0) \n"
|
||||
"swl $s6, 3($s0) \n"
|
||||
"swr $t8, 4($s0) \n"
|
||||
"swl $t8, 7($s0) \n"
|
||||
"swr $s5, 0($s1) \n"
|
||||
"swl $s5, 3($s1) \n"
|
||||
"swr $t1, 4($s1) \n"
|
||||
"swl $t1, 7($s1) \n"
|
||||
"swr $s7, 0($s2) \n"
|
||||
"swl $s7, 3($s2) \n"
|
||||
"swr $t9, 4($s2) \n"
|
||||
"swl $t9, 7($s2) \n"
|
||||
|
||||
"addiu $AT, -1 \n"
|
||||
"addiu %[src], 4 \n"
|
||||
|
||||
"bnez $AT, 11b \n"
|
||||
" addu %[dst], $s2, %[dst_stride] \n"
|
||||
"2: \n"
|
||||
".set pop \n"
|
||||
".set at \n"
|
||||
:[src] "+r" (src),
|
||||
[dst] "+r" (dst),
|
||||
[width] "+r" (width)
|
||||
:[src_stride] "r" (src_stride),
|
||||
[dst_stride] "r" (dst_stride)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
|
||||
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
|
||||
);
|
||||
}
|
||||
|
||||
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"beqz %[width], 2f \n"
|
||||
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
|
||||
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
|
||||
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
|
||||
"addu $t3, $t2, %[src_stride] \n"
|
||||
"addu $t5, $t4, %[src_stride] \n"
|
||||
"addu $t6, $t2, $t4 \n"
|
||||
"subu $t7, $t9, %[src_stride] \n"
|
||||
"srl $t1, %[width], 1 \n"
|
||||
|
||||
// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
|
||||
"andi $t0, %[dst_a], 0x3 \n"
|
||||
"andi $t8, %[dst_b], 0x3 \n"
|
||||
"or $t0, $t0, $t8 \n"
|
||||
"andi $t8, %[dst_stride_a], 0x3 \n"
|
||||
"andi $s5, %[dst_stride_b], 0x3 \n"
|
||||
"or $t8, $t8, $s5 \n"
|
||||
"or $t0, $t0, $t8 \n"
|
||||
"bnez $t0, 11f \n"
|
||||
" nop \n"
|
||||
// dst + dst_stride word aligned (both, a & b dst addresses)
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
|
||||
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
|
||||
"addu $s5, %[dst_a], %[dst_stride_a] \n"
|
||||
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
|
||||
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
|
||||
"addu $s6, %[dst_b], %[dst_stride_b] \n"
|
||||
|
||||
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
|
||||
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
|
||||
|
||||
"sll $t0, $t0, 16 \n"
|
||||
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
|
||||
|
||||
"sw $s3, 0($s5) \n"
|
||||
"sw $s4, 0($s6) \n"
|
||||
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
|
||||
|
||||
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
|
||||
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
|
||||
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
|
||||
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
|
||||
"sw $s3, 0(%[dst_a]) \n"
|
||||
"sw $s4, 0(%[dst_b]) \n"
|
||||
|
||||
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
|
||||
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
|
||||
|
||||
"sll $t0, $t0, 16 \n"
|
||||
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
|
||||
"sw $s3, 4($s5) \n"
|
||||
"sw $s4, 4($s6) \n"
|
||||
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
|
||||
|
||||
"addiu %[src], 4 \n"
|
||||
"addiu $t1, -1 \n"
|
||||
"sll $t0, %[dst_stride_a], 1 \n"
|
||||
"sll $t8, %[dst_stride_b], 1 \n"
|
||||
"sw $s3, 4(%[dst_a]) \n"
|
||||
"sw $s4, 4(%[dst_b]) \n"
|
||||
"addu %[dst_a], %[dst_a], $t0 \n"
|
||||
"bnez $t1, 1b \n"
|
||||
" addu %[dst_b], %[dst_b], $t8 \n"
|
||||
"b 2f \n"
|
||||
" nop \n"
|
||||
|
||||
// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
|
||||
"11: \n"
|
||||
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
|
||||
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
|
||||
"addu $s5, %[dst_a], %[dst_stride_a] \n"
|
||||
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
|
||||
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
|
||||
"addu $s6, %[dst_b], %[dst_stride_b] \n"
|
||||
|
||||
"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
|
||||
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
|
||||
|
||||
"sll $t0, $t0, 16 \n"
|
||||
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
|
||||
|
||||
"swr $s3, 0($s5) \n"
|
||||
"swl $s3, 3($s5) \n"
|
||||
"swr $s4, 0($s6) \n"
|
||||
"swl $s4, 3($s6) \n"
|
||||
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
|
||||
|
||||
"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
|
||||
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
|
||||
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
|
||||
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
|
||||
"swr $s3, 0(%[dst_a]) \n"
|
||||
"swl $s3, 3(%[dst_a]) \n"
|
||||
"swr $s4, 0(%[dst_b]) \n"
|
||||
"swl $s4, 3(%[dst_b]) \n"
|
||||
|
||||
"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
|
||||
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
|
||||
|
||||
"sll $t0, $t0, 16 \n"
|
||||
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
|
||||
|
||||
"swr $s3, 4($s5) \n"
|
||||
"swl $s3, 7($s5) \n"
|
||||
"swr $s4, 4($s6) \n"
|
||||
"swl $s4, 7($s6) \n"
|
||||
|
||||
"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
|
||||
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
|
||||
|
||||
"addiu %[src], 4 \n"
|
||||
"addiu $t1, -1 \n"
|
||||
"sll $t0, %[dst_stride_a], 1 \n"
|
||||
"sll $t8, %[dst_stride_b], 1 \n"
|
||||
"swr $s3, 4(%[dst_a]) \n"
|
||||
"swl $s3, 7(%[dst_a]) \n"
|
||||
"swr $s4, 4(%[dst_b]) \n"
|
||||
"swl $s4, 7(%[dst_b]) \n"
|
||||
"addu %[dst_a], %[dst_a], $t0 \n"
|
||||
"bnez $t1, 11b \n"
|
||||
" addu %[dst_b], %[dst_b], $t8 \n"
|
||||
|
||||
"2: \n"
|
||||
".set pop \n"
|
||||
: [src] "+r" (src),
|
||||
[dst_a] "+r" (dst_a),
|
||||
[dst_b] "+r" (dst_b),
|
||||
[width] "+r" (width),
|
||||
[src_stride] "+r" (src_stride)
|
||||
: [dst_stride_a] "r" (dst_stride_a),
|
||||
[dst_stride_b] "r" (dst_stride_b)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9",
|
||||
"s0", "s1", "s2", "s3",
|
||||
"s4", "s5", "s6"
|
||||
);
|
||||
}
|
||||
|
||||
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
533
libs/libyuv/source/rotate_neon.cc
Normal file
@@ -0,0 +1,533 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
static uvec8 kVTbl4x4Transpose =
|
||||
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
|
||||
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
const uint8* src_temp = NULL;
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %5, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d0}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d1}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d2}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d3}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d4}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d5}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d6}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.8 {d7}, [%0] \n"
|
||||
|
||||
"vtrn.8 d1, d0 \n"
|
||||
"vtrn.8 d3, d2 \n"
|
||||
"vtrn.8 d5, d4 \n"
|
||||
"vtrn.8 d7, d6 \n"
|
||||
|
||||
"vtrn.16 d1, d3 \n"
|
||||
"vtrn.16 d0, d2 \n"
|
||||
"vtrn.16 d5, d7 \n"
|
||||
"vtrn.16 d4, d6 \n"
|
||||
|
||||
"vtrn.32 d1, d5 \n"
|
||||
"vtrn.32 d0, d4 \n"
|
||||
"vtrn.32 d3, d7 \n"
|
||||
"vtrn.32 d2, d6 \n"
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d1}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d0}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d3}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d2}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d5}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d4}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d7}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d6}, [%0] \n"
|
||||
|
||||
"add %1, #8 \n" // src += 8
|
||||
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
|
||||
"subs %5, #8 \n" // w -= 8
|
||||
"bge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %5, #8 \n"
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %5, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
"cmp %5, #4 \n"
|
||||
"blt 2f \n"
|
||||
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d0[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d0[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d1[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d1[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d2[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d2[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d3[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.32 {d3[1]}, [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(6)
|
||||
"vld1.8 {q3}, [%6] \n"
|
||||
|
||||
"vtbl.8 d4, {d0, d1}, d6 \n"
|
||||
"vtbl.8 d5, {d0, d1}, d7 \n"
|
||||
"vtbl.8 d0, {d2, d3}, d6 \n"
|
||||
"vtbl.8 d1, {d2, d3}, d7 \n"
|
||||
|
||||
// TODO(frkoenig): Rework shuffle above to
|
||||
// write out with 4 instead of 8 writes.
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d4[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d4[1]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d5[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d5[1]}, [%0] \n"
|
||||
|
||||
"add %0, %3, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d0[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d0[1]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d1[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d1[1]}, [%0] \n"
|
||||
|
||||
"add %1, #4 \n" // src += 4
|
||||
"add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
|
||||
"subs %5, #4 \n" // w -= 4
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %5, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d0[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d1[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d0[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d1[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d0[2]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d1[2]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d0[3]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.16 {d1[3]}, [%0] \n"
|
||||
|
||||
"vtrn.8 d0, d1 \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d0}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d1}, [%0] \n"
|
||||
|
||||
"add %1, #2 \n" // src += 2
|
||||
"add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
|
||||
"subs %5, #2 \n" // w -= 2
|
||||
"beq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[0]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[1]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[2]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[3]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[4]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[5]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[6]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {d0[7]}, [%1] \n"
|
||||
|
||||
MEMACCESS(3)
|
||||
"vst1.64 {d0}, [%3] \n"
|
||||
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(src_stride), // %2
|
||||
"+r"(dst), // %3
|
||||
"+r"(dst_stride), // %4
|
||||
"+r"(width) // %5
|
||||
: "r"(&kVTbl4x4Transpose) // %6
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3"
|
||||
);
|
||||
}
|
||||
|
||||
static uvec8 kVTbl4x4TransposeDi =
|
||||
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
|
||||
|
||||
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) {
|
||||
const uint8* src_temp = NULL;
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %7, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d0, d1}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d2, d3}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d4, d5}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d6, d7}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d16, d17}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d18, d19}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d20, d21}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.8 {d22, d23}, [%0] \n"
|
||||
|
||||
"vtrn.8 q1, q0 \n"
|
||||
"vtrn.8 q3, q2 \n"
|
||||
"vtrn.8 q9, q8 \n"
|
||||
"vtrn.8 q11, q10 \n"
|
||||
|
||||
"vtrn.16 q1, q3 \n"
|
||||
"vtrn.16 q0, q2 \n"
|
||||
"vtrn.16 q9, q11 \n"
|
||||
"vtrn.16 q8, q10 \n"
|
||||
|
||||
"vtrn.32 q1, q9 \n"
|
||||
"vtrn.32 q0, q8 \n"
|
||||
"vtrn.32 q3, q11 \n"
|
||||
"vtrn.32 q2, q10 \n"
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q8, q8 \n"
|
||||
"vrev16.8 q9, q9 \n"
|
||||
"vrev16.8 q10, q10 \n"
|
||||
"vrev16.8 q11, q11 \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d2}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d0}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d6}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d4}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d18}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d16}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d22}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d20}, [%0] \n"
|
||||
|
||||
"mov %0, %5 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d3}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d1}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d7}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d5}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d19}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d17}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d23}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {d21}, [%0] \n"
|
||||
|
||||
"add %1, #8*2 \n" // src += 8*2
|
||||
"add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
|
||||
"add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
|
||||
"subs %7, #8 \n" // w -= 8
|
||||
"bge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %7, #8 \n"
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %7, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
"cmp %7, #4 \n"
|
||||
"blt 2f \n"
|
||||
|
||||
// TODO(frkoenig): Clean this up
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d0}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d1}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d2}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d3}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d4}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d5}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d6}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld1.64 {d7}, [%0] \n"
|
||||
|
||||
MEMACCESS(8)
|
||||
"vld1.8 {q15}, [%8] \n"
|
||||
|
||||
"vtrn.8 q0, q1 \n"
|
||||
"vtrn.8 q2, q3 \n"
|
||||
|
||||
"vtbl.8 d16, {d0, d1}, d30 \n"
|
||||
"vtbl.8 d17, {d0, d1}, d31 \n"
|
||||
"vtbl.8 d18, {d2, d3}, d30 \n"
|
||||
"vtbl.8 d19, {d2, d3}, d31 \n"
|
||||
"vtbl.8 d20, {d4, d5}, d30 \n"
|
||||
"vtbl.8 d21, {d4, d5}, d31 \n"
|
||||
"vtbl.8 d22, {d6, d7}, d30 \n"
|
||||
"vtbl.8 d23, {d6, d7}, d31 \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d16[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d16[1]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d17[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d17[1]}, [%0], %4 \n"
|
||||
|
||||
"add %0, %3, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d20[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d20[1]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d21[0]}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d21[1]}, [%0] \n"
|
||||
|
||||
"mov %0, %5 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d18[0]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d18[1]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d19[0]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d19[1]}, [%0], %6 \n"
|
||||
|
||||
"add %0, %5, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d22[0]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d22[1]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d23[0]}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {d23[1]}, [%0] \n"
|
||||
|
||||
"add %1, #4*2 \n" // src += 4 * 2
|
||||
"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
|
||||
"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
|
||||
"subs %7, #4 \n" // w -= 4
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %7, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
|
||||
MEMACCESS(0)
|
||||
"vld2.16 {d1[3], d3[3]}, [%0] \n"
|
||||
|
||||
"vtrn.8 d0, d1 \n"
|
||||
"vtrn.8 d2, d3 \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d0}, [%0], %4 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d2}, [%0] \n"
|
||||
|
||||
"mov %0, %5 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d1}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.64 {d3}, [%0] \n"
|
||||
|
||||
"add %1, #2*2 \n" // src += 2 * 2
|
||||
"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
|
||||
"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
|
||||
"subs %7, #2 \n" // w -= 2
|
||||
"beq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
|
||||
MEMACCESS(1)
|
||||
"vld2.8 {d0[7], d1[7]}, [%1] \n"
|
||||
|
||||
MEMACCESS(3)
|
||||
"vst1.64 {d0}, [%3] \n"
|
||||
MEMACCESS(5)
|
||||
"vst1.64 {d1}, [%5] \n"
|
||||
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(src_stride), // %2
|
||||
"+r"(dst_a), // %3
|
||||
"+r"(dst_stride_a), // %4
|
||||
"+r"(dst_b), // %5
|
||||
"+r"(dst_stride_b), // %6
|
||||
"+r"(width) // %7
|
||||
: "r"(&kVTbl4x4TransposeDi) // %8
|
||||
: "memory", "cc",
|
||||
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
|
||||
);
|
||||
}
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
543
libs/libyuv/source/rotate_neon64.cc
Normal file
543
libs/libyuv/source/rotate_neon64.cc
Normal file
@@ -0,0 +1,543 @@
/*
 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

static uvec8 kVTbl4x4Transpose =
|
||||
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
|
||||
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
const uint8* src_temp = NULL;
|
||||
int64 width64 = (int64) width; // Work around clang 3.4 warning.
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
|
||||
"sub %3, %3, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v2.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v3.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v4.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v5.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v6.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v7.8b}, [%0] \n"
|
||||
|
||||
"trn2 v16.8b, v0.8b, v1.8b \n"
|
||||
"trn1 v17.8b, v0.8b, v1.8b \n"
|
||||
"trn2 v18.8b, v2.8b, v3.8b \n"
|
||||
"trn1 v19.8b, v2.8b, v3.8b \n"
|
||||
"trn2 v20.8b, v4.8b, v5.8b \n"
|
||||
"trn1 v21.8b, v4.8b, v5.8b \n"
|
||||
"trn2 v22.8b, v6.8b, v7.8b \n"
|
||||
"trn1 v23.8b, v6.8b, v7.8b \n"
|
||||
|
||||
"trn2 v3.4h, v17.4h, v19.4h \n"
|
||||
"trn1 v1.4h, v17.4h, v19.4h \n"
|
||||
"trn2 v2.4h, v16.4h, v18.4h \n"
|
||||
"trn1 v0.4h, v16.4h, v18.4h \n"
|
||||
"trn2 v7.4h, v21.4h, v23.4h \n"
|
||||
"trn1 v5.4h, v21.4h, v23.4h \n"
|
||||
"trn2 v6.4h, v20.4h, v22.4h \n"
|
||||
"trn1 v4.4h, v20.4h, v22.4h \n"
|
||||
|
||||
"trn2 v21.2s, v1.2s, v5.2s \n"
|
||||
"trn1 v17.2s, v1.2s, v5.2s \n"
|
||||
"trn2 v20.2s, v0.2s, v4.2s \n"
|
||||
"trn1 v16.2s, v0.2s, v4.2s \n"
|
||||
"trn2 v23.2s, v3.2s, v7.2s \n"
|
||||
"trn1 v19.2s, v3.2s, v7.2s \n"
|
||||
"trn2 v22.2s, v2.2s, v6.2s \n"
|
||||
"trn1 v18.2s, v2.2s, v6.2s \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v21.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v20.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v23.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v22.8b}, [%0] \n"
|
||||
|
||||
"add %1, %1, #8 \n" // src += 8
|
||||
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
|
||||
"subs %3, %3, #8 \n" // w -= 8
|
||||
"b.ge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %3, %3, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
"cmp %3, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.s}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.s}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.s}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.s}[3], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.s}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.s}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.s}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.s}[3], [%0] \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(4)
|
||||
"ld1 {v2.16b}, [%4] \n"
|
||||
|
||||
"tbl v3.16b, {v0.16b}, v2.16b \n"
|
||||
"tbl v0.16b, {v1.16b}, v2.16b \n"
|
||||
|
||||
// TODO(frkoenig): Rework shuffle above to
|
||||
// write out with 4 instead of 8 writes.
|
||||
MEMACCESS(0)
|
||||
"st1 {v3.s}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v3.s}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v3.s}[2], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v3.s}[3], [%0] \n"
|
||||
|
||||
"add %0, %2, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.s}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.s}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.s}[2], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.s}[3], [%0] \n"
|
||||
|
||||
"add %1, %1, #4 \n" // src += 4
|
||||
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
|
||||
"subs %3, %3, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %3, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.h}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.h}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.h}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.h}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.h}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.h}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.h}[3], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.h}[3], [%0] \n"
|
||||
|
||||
"trn2 v2.8b, v0.8b, v1.8b \n"
|
||||
"trn1 v3.8b, v0.8b, v1.8b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v3.8b}, [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v2.8b}, [%0] \n"
|
||||
|
||||
"add %1, %1, #2 \n" // src += 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
|
||||
"subs %3, %3, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[0], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[1], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[2], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[3], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[4], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[5], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[6], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.b}[7], [%1] \n"
|
||||
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2] \n"
|
||||
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(width64) // %3
|
||||
: "r"(&kVTbl4x4Transpose), // %4
|
||||
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
);
|
||||
}
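
// A minimal scalar reference (illustrative sketch; the name
// TransposeWx8_Reference is hypothetical, not part of libyuv) of what the
// NEON kernel above computes: each column i of a width x 8 source block
// becomes row i of the destination, 8 bytes wide.
void TransposeWx8_Reference(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}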
|
||||
|
||||
static uint8 kVTbl4x4TransposeDi[32] =
|
||||
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
|
||||
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
|
||||
|
||||
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) {
|
||||
const uint8* src_temp = NULL;
|
||||
int64 width64 = (int64) width; // Work around clang 3.4 warning.
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
|
||||
"sub %4, %4, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v2.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v3.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v4.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v5.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v6.16b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v7.16b}, [%0] \n"
|
||||
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.d}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.d}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.d}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.d}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.d}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.d}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.d}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.d}[1], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v20.d}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v22.d}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v21.d}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v23.d}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v20.d}[1], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v22.d}[1], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v21.d}[1], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v23.d}[1], [%0] \n"
|
||||
|
||||
"add %1, %1, #16 \n" // src += 8*2
|
||||
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
|
||||
"subs %4, %4, #8 \n" // w -= 8
|
||||
"b.ge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %4, %4, #8 \n"
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
"cmp %4, #4 \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// TODO(frkoenig): Clean this up
|
||||
// 4x8 block
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v2.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v3.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v4.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v5.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v6.8b}, [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v7.8b}, [%0] \n"
|
||||
|
||||
MEMACCESS(8)
|
||||
"ld1 {v30.16b}, [%8], #16 \n"
|
||||
"ld1 {v31.16b}, [%8] \n"
|
||||
|
||||
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
|
||||
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
|
||||
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
|
||||
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.s}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.s}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.s}[2], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v16.s}[3], [%0], %6 \n"
|
||||
|
||||
"add %0, %2, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.s}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.s}[1], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.s}[2], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v18.s}[3], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.s}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.s}[1], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.s}[2], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v17.s}[3], [%0], %7 \n"
|
||||
|
||||
"add %0, %3, #4 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.s}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.s}[1], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.s}[2], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v19.s}[3], [%0] \n"
|
||||
|
||||
"add %1, %1, #8 \n" // src += 4 * 2
|
||||
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
|
||||
"subs %4, %4, #4 \n" // w -= 4
|
||||
"b.eq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %4, #2 \n"
|
||||
"b.lt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2: \n"
|
||||
"mov %0, %1 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v2.h, v3.h}[3], [%0] \n"
|
||||
|
||||
"trn1 v4.8b, v0.8b, v2.8b \n"
|
||||
"trn2 v5.8b, v0.8b, v2.8b \n"
|
||||
"trn1 v6.8b, v1.8b, v3.8b \n"
|
||||
"trn2 v7.8b, v1.8b, v3.8b \n"
|
||||
|
||||
"mov %0, %2 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v4.d}[0], [%0], %6 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v6.d}[0], [%0] \n"
|
||||
|
||||
"mov %0, %3 \n"
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v5.d}[0], [%0], %7 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v7.d}[0], [%0] \n"
|
||||
|
||||
"add %1, %1, #4 \n" // src += 2 * 2
|
||||
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
|
||||
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
|
||||
"subs %4, %4, #2 \n" // w -= 2
|
||||
"b.eq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3: \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
|
||||
MEMACCESS(1)
|
||||
"ld2 {v0.b, v1.b}[7], [%1] \n"
|
||||
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.d}[0], [%2] \n"
|
||||
MEMACCESS(3)
|
||||
"st1 {v1.d}[0], [%3] \n"
|
||||
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src_temp), // %0
|
||||
"+r"(src), // %1
|
||||
"+r"(dst_a), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width64) // %4
|
||||
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
|
||||
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
|
||||
"r"(&kVTbl4x4TransposeDi) // %8
|
||||
: "memory", "cc",
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
|
||||
"v30", "v31"
|
||||
);
|
||||
}
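
// A minimal scalar sketch (illustrative only; the name is hypothetical) of
// what TransposeUVWx8_NEON above computes: the source holds interleaved UV
// pairs, and each column of a width x 8 block is de-interleaved into one
// 8-byte row of dst_a (U) and one 8-byte row of dst_b (V).
void TransposeUVWx8_Reference(const uint8* src, int src_stride,
                              uint8* dst_a, int dst_stride_a,
                              uint8* dst_b, int dst_stride_b, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}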
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
247
libs/libyuv/source/rotate_win.cc
Normal file
247
libs/libyuv/source/rotate_win.cc
Normal file
@@ -0,0 +1,247 @@
|
||||
/*
|
||||
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/rotate_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for 32 bit Visual C x86 and clangcl
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
__declspec(naked)
|
||||
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
__asm {
|
||||
push edi
|
||||
push esi
|
||||
push ebp
|
||||
mov eax, [esp + 12 + 4] // src
|
||||
mov edi, [esp + 12 + 8] // src_stride
|
||||
mov edx, [esp + 12 + 12] // dst
|
||||
mov esi, [esp + 12 + 16] // dst_stride
|
||||
mov ecx, [esp + 12 + 20] // width
|
||||
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
align 4
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea ebp, [eax + 8]
|
||||
movq xmm1, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm0, xmm1
|
||||
movq xmm2, qword ptr [eax]
|
||||
movdqa xmm1, xmm0
|
||||
palignr xmm1, xmm1, 8
|
||||
movq xmm3, qword ptr [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
punpcklbw xmm2, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movq xmm4, qword ptr [eax]
|
||||
palignr xmm3, xmm3, 8
|
||||
movq xmm5, qword ptr [eax + edi]
|
||||
punpcklbw xmm4, xmm5
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm5, xmm4
|
||||
movq xmm6, qword ptr [eax]
|
||||
palignr xmm5, xmm5, 8
|
||||
movq xmm7, qword ptr [eax + edi]
|
||||
punpcklbw xmm6, xmm7
|
||||
mov eax, ebp
|
||||
movdqa xmm7, xmm6
|
||||
palignr xmm7, xmm7, 8
|
||||
// Second round of bit swap.
|
||||
punpcklwd xmm0, xmm2
|
||||
punpcklwd xmm1, xmm3
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
palignr xmm2, xmm2, 8
|
||||
palignr xmm3, xmm3, 8
|
||||
punpcklwd xmm4, xmm6
|
||||
punpcklwd xmm5, xmm7
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm5
|
||||
palignr xmm6, xmm6, 8
|
||||
palignr xmm7, xmm7, 8
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
punpckldq xmm0, xmm4
|
||||
movq qword ptr [edx], xmm0
|
||||
movdqa xmm4, xmm0
|
||||
palignr xmm4, xmm4, 8
|
||||
movq qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
punpckldq xmm2, xmm6
|
||||
movdqa xmm6, xmm2
|
||||
palignr xmm6, xmm6, 8
|
||||
movq qword ptr [edx], xmm2
|
||||
punpckldq xmm1, xmm5
|
||||
movq qword ptr [edx + esi], xmm6
|
||||
lea edx, [edx + 2 * esi]
|
||||
movdqa xmm5, xmm1
|
||||
movq qword ptr [edx], xmm1
|
||||
palignr xmm5, xmm5, 8
|
||||
punpckldq xmm3, xmm7
|
||||
movq qword ptr [edx + esi], xmm5
|
||||
lea edx, [edx + 2 * esi]
|
||||
movq qword ptr [edx], xmm3
|
||||
movdqa xmm7, xmm3
|
||||
palignr xmm7, xmm7, 8
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx + esi], xmm7
|
||||
lea edx, [edx + 2 * esi]
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
pop esi
|
||||
pop edi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked)
|
||||
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w) {
|
||||
__asm {
|
||||
push ebx
|
||||
push esi
|
||||
push edi
|
||||
push ebp
|
||||
mov eax, [esp + 16 + 4] // src
|
||||
mov edi, [esp + 16 + 8] // src_stride
|
||||
mov edx, [esp + 16 + 12] // dst_a
|
||||
mov esi, [esp + 16 + 16] // dst_stride_a
|
||||
mov ebx, [esp + 16 + 20] // dst_b
|
||||
mov ebp, [esp + 16 + 24] // dst_stride_b
|
||||
mov ecx, esp
|
||||
sub esp, 4 + 16
|
||||
and esp, ~15
|
||||
mov [esp + 16], ecx
|
||||
mov ecx, [ecx + 16 + 28] // w
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm0 // use xmm7 as temp register.
|
||||
punpcklbw xmm0, xmm1
|
||||
punpckhbw xmm7, xmm1
|
||||
movdqa xmm1, xmm7
|
||||
movdqu xmm2, [eax]
|
||||
movdqu xmm3, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm2
|
||||
punpcklbw xmm2, xmm3
|
||||
punpckhbw xmm7, xmm3
|
||||
movdqa xmm3, xmm7
|
||||
movdqu xmm4, [eax]
|
||||
movdqu xmm5, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqa xmm7, xmm4
|
||||
punpcklbw xmm4, xmm5
|
||||
punpckhbw xmm7, xmm5
|
||||
movdqa xmm5, xmm7
|
||||
movdqu xmm6, [eax]
|
||||
movdqu xmm7, [eax + edi]
|
||||
lea eax, [eax + 2 * edi]
|
||||
movdqu [esp], xmm5 // backup xmm5
|
||||
neg edi
|
||||
movdqa xmm5, xmm6 // use xmm5 as temp register.
|
||||
punpcklbw xmm6, xmm7
|
||||
punpckhbw xmm5, xmm7
|
||||
movdqa xmm7, xmm5
|
||||
lea eax, [eax + 8 * edi + 16]
|
||||
neg edi
|
||||
// Second round of bit swap.
|
||||
movdqa xmm5, xmm0
|
||||
punpcklwd xmm0, xmm2
|
||||
punpckhwd xmm5, xmm2
|
||||
movdqa xmm2, xmm5
|
||||
movdqa xmm5, xmm1
|
||||
punpcklwd xmm1, xmm3
|
||||
punpckhwd xmm5, xmm3
|
||||
movdqa xmm3, xmm5
|
||||
movdqa xmm5, xmm4
|
||||
punpcklwd xmm4, xmm6
|
||||
punpckhwd xmm5, xmm6
|
||||
movdqa xmm6, xmm5
|
||||
movdqu xmm5, [esp] // restore xmm5
|
||||
movdqu [esp], xmm6 // backup xmm6
|
||||
movdqa xmm6, xmm5 // use xmm6 as temp register.
|
||||
punpcklwd xmm5, xmm7
|
||||
punpckhwd xmm6, xmm7
|
||||
movdqa xmm7, xmm6
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
movdqa xmm6, xmm0
|
||||
punpckldq xmm0, xmm4
|
||||
punpckhdq xmm6, xmm4
|
||||
movdqa xmm4, xmm6
|
||||
movdqu xmm6, [esp] // restore xmm6
|
||||
movlpd qword ptr [edx], xmm0
|
||||
movhpd qword ptr [ebx], xmm0
|
||||
movlpd qword ptr [edx + esi], xmm4
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm4
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm2 // use xmm0 as the temp register.
|
||||
punpckldq xmm2, xmm6
|
||||
movlpd qword ptr [edx], xmm2
|
||||
movhpd qword ptr [ebx], xmm2
|
||||
punpckhdq xmm0, xmm6
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm1 // use xmm0 as the temp register.
|
||||
punpckldq xmm1, xmm5
|
||||
movlpd qword ptr [edx], xmm1
|
||||
movhpd qword ptr [ebx], xmm1
|
||||
punpckhdq xmm0, xmm5
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
movdqa xmm0, xmm3 // use xmm0 as the temp register.
|
||||
punpckldq xmm3, xmm7
|
||||
movlpd qword ptr [edx], xmm3
|
||||
movhpd qword ptr [ebx], xmm3
|
||||
punpckhdq xmm0, xmm7
|
||||
sub ecx, 8
|
||||
movlpd qword ptr [edx + esi], xmm0
|
||||
lea edx, [edx + 2 * esi]
|
||||
movhpd qword ptr [ebx + ebp], xmm0
|
||||
lea ebx, [ebx + 2 * ebp]
|
||||
jg convertloop
|
||||
|
||||
mov esp, [esp + 16]
|
||||
pop ebp
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
818
libs/libyuv/source/row_any.cc
Normal file
818
libs/libyuv/source/row_any.cc
Normal file
@@ -0,0 +1,818 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#include <string.h> // For memset.
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Subsampled source needs to be increased by 1 if not even.
// e.g. SS(15, 1) is 8: odd widths round up when halved.
|
||||
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
|
||||
|
||||
// Any 4 planes to 1 with yuvconstants
|
||||
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
|
||||
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
|
||||
const uint8* a_buf, uint8* dst_ptr, \
|
||||
const struct YuvConstants* yuvconstants, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 5]); \
|
||||
memset(temp, 0, 64 * 4); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n, r); \
|
||||
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 192, a_buf + n, r); \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
|
||||
yuvconstants, MASK + 1); \
|
||||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
|
||||
SS(r, DUVSHIFT) * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
|
||||
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422ALPHATOARGBROW_AVX2
|
||||
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422ALPHATOARGBROW_NEON
|
||||
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
|
||||
#endif
|
||||
#undef ANY41C
|
||||
|
||||
// Any 3 planes to 1.
|
||||
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
|
||||
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
|
||||
uint8* dst_ptr, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 4]); \
|
||||
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n, r); \
|
||||
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
|
||||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
|
||||
SS(r, DUVSHIFT) * BPP); \
|
||||
}
|
||||
#ifdef HAS_I422TOYUY2ROW_SSE2
|
||||
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
|
||||
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TOYUY2ROW_NEON
|
||||
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TOUYVYROW_NEON
|
||||
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_BLENDPLANEROW_AVX2
|
||||
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_BLENDPLANEROW_SSSE3
|
||||
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
|
||||
#endif
|
||||
#undef ANY31
|
||||
|
||||
// Note that odd width replication includes 444 due to the ARM implementation,
// which subsamples 444 to 422 internally.
|
||||
// Any 3 planes to 1 with yuvconstants
|
||||
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
|
||||
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
|
||||
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
|
||||
int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 4]); \
|
||||
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n, r); \
|
||||
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
if (width & 1) { \
|
||||
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
|
||||
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
|
||||
} \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
|
||||
yuvconstants, MASK + 1); \
|
||||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
|
||||
SS(r, DUVSHIFT) * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I411TOARGBROW_SSSE3
|
||||
ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I444TOARGBROW_SSSE3
|
||||
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
|
||||
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
|
||||
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
|
||||
#endif // HAS_I444TOARGBROW_SSSE3
|
||||
#ifdef HAS_I422TORGB24ROW_AVX2
|
||||
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGBROW_AVX2
|
||||
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TORGBAROW_AVX2
|
||||
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I444TOARGBROW_AVX2
|
||||
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I411TOARGBROW_AVX2
|
||||
ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGB4444ROW_AVX2
|
||||
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGB1555ROW_AVX2
|
||||
ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422TORGB565ROW_AVX2
|
||||
ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGBROW_NEON
|
||||
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
|
||||
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
|
||||
ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
|
||||
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
|
||||
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
|
||||
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
|
||||
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
|
||||
#endif
|
||||
#undef ANY31C
|
||||
|
||||
// Any 2 planes to 1.
|
||||
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
|
||||
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
|
||||
uint8* dst_ptr, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 3]); \
|
||||
memset(temp, 0, 64 * 2); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
|
||||
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
|
||||
SS(r, UVSHIFT) * SBPP2); \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
|
||||
// Merge functions.
|
||||
#ifdef HAS_MERGEUVROW_SSE2
|
||||
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
|
||||
#endif
|
||||
#ifdef HAS_MERGEUVROW_AVX2
|
||||
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
|
||||
#endif
|
||||
#ifdef HAS_MERGEUVROW_NEON
|
||||
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
|
||||
#endif
|
||||
|
||||
// Math functions.
|
||||
#ifdef HAS_ARGBMULTIPLYROW_SSE2
|
||||
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBADDROW_SSE2
|
||||
ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSUBTRACTROW_SSE2
|
||||
ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBMULTIPLYROW_AVX2
|
||||
ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBADDROW_AVX2
|
||||
ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSUBTRACTROW_AVX2
|
||||
ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBMULTIPLYROW_NEON
|
||||
ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBADDROW_NEON
|
||||
ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSUBTRACTROW_NEON
|
||||
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SOBELROW_SSE2
|
||||
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_SOBELROW_NEON
|
||||
ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SOBELTOPLANEROW_SSE2
|
||||
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SOBELTOPLANEROW_NEON
|
||||
ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SOBELXYROW_SSE2
|
||||
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_SOBELXYROW_NEON
|
||||
ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
|
||||
#endif
|
||||
#undef ANY21
|
||||
|
||||
// Any 2 planes to 1 with yuvconstants
|
||||
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
|
||||
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
|
||||
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
|
||||
int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 3]); \
|
||||
memset(temp, 0, 64 * 2); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
|
||||
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
|
||||
SS(r, UVSHIFT) * SBPP2); \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
|
||||
// Biplanar to RGB.
|
||||
#ifdef HAS_NV12TOARGBROW_SSSE3
|
||||
ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_NV12TOARGBROW_AVX2
|
||||
ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_NV12TOARGBROW_NEON
|
||||
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_NV21TOARGBROW_SSSE3
|
||||
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_NV21TOARGBROW_AVX2
|
||||
ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_NV21TOARGBROW_NEON
|
||||
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_NV12TORGB565ROW_SSSE3
|
||||
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_NV12TORGB565ROW_AVX2
|
||||
ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
|
||||
#endif
|
||||
#ifdef HAS_NV12TORGB565ROW_NEON
|
||||
ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
|
||||
#endif
|
||||
#undef ANY21C
|
||||
|
||||
// Any 1 to 1.
|
||||
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[128 * 2]); \
|
||||
memset(temp, 0, 128); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
|
||||
ANY_SIMD(temp, temp + 128, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_COPYROW_AVX
|
||||
ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_COPYROW_SSE2
|
||||
ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_COPYROW_NEON
|
||||
ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
|
||||
ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
|
||||
ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
|
||||
ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
|
||||
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
|
||||
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565ROW_AVX2)
|
||||
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
|
||||
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
|
||||
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_SSE2)
|
||||
ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_AVX2)
|
||||
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_I400TOARGBROW_SSE2)
|
||||
ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
|
||||
#endif
|
||||
#if defined(HAS_I400TOARGBROW_AVX2)
|
||||
ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_RGB24TOARGBROW_SSSE3)
|
||||
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
|
||||
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
|
||||
ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
|
||||
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
|
||||
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
|
||||
#endif
|
||||
#if defined(HAS_RAWTORGB24ROW_SSSE3)
|
||||
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
|
||||
#endif
|
||||
#if defined(HAS_RGB565TOARGBROW_AVX2)
|
||||
ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
|
||||
ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
|
||||
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB24ROW_NEON)
|
||||
ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
|
||||
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
|
||||
ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
|
||||
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
|
||||
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
|
||||
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
|
||||
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
|
||||
#endif
|
||||
#if defined(HAS_RAWTORGB24ROW_NEON)
|
||||
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYJROW_AVX2
|
||||
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_UYVYTOYROW_AVX2
|
||||
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOYROW_AVX2
|
||||
ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_BGRATOYROW_SSSE3
|
||||
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
|
||||
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
|
||||
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
|
||||
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
|
||||
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYJROW_SSSE3
|
||||
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_NEON
|
||||
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYJROW_NEON
|
||||
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_BGRATOYROW_NEON
|
||||
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_ABGRTOYROW_NEON
|
||||
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_RGBATOYROW_NEON
|
||||
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOYROW_NEON
|
||||
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_RAWTOYROW_NEON
|
||||
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_RGB565TOYROW_NEON
|
||||
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGB1555TOYROW_NEON
|
||||
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGB4444TOYROW_NEON
|
||||
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOYROW_NEON
|
||||
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_UYVYTOYROW_NEON
|
||||
ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOARGBROW_NEON
|
||||
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_RAWTOARGBROW_NEON
|
||||
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_RGB565TOARGBROW_NEON
|
||||
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGB1555TOARGBROW_NEON
|
||||
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGB4444TOARGBROW_NEON
|
||||
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBATTENUATEROW_SSSE3
|
||||
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
|
||||
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBATTENUATEROW_AVX2
|
||||
ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
|
||||
ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBATTENUATEROW_NEON
|
||||
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
|
||||
#endif
|
||||
#undef ANY11
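
// All of the ANY* wrappers follow the same shape: run the SIMD row function
// on the largest multiple of (MASK + 1) pixels, copy the remainder into a
// zero-padded aligned temp buffer, run the SIMD function once more on exactly
// MASK + 1 pixels, and copy back only the valid bytes. As a rough sketch of
// what ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) above expands to
// (the _Sketch name is hypothetical, for illustration only):
#ifdef HAS_COPYROW_AVX
void CopyRow_Any_AVX_Sketch(const uint8* src_ptr, uint8* dst_ptr, int width) {
  SIMD_ALIGNED(uint8 temp[128 * 2]);
  memset(temp, 0, 128);               /* for YUY2 and msan */
  int r = width & 63;                 /* leftover pixels */
  int n = width & ~63;                /* multiple-of-64 portion */
  if (n > 0) {
    CopyRow_AVX(src_ptr, dst_ptr, n); /* fast path on the bulk of the row */
  }
  memcpy(temp, src_ptr + n, r);       /* remainder into the padded buffer */
  CopyRow_AVX(temp, temp + 128, 64);  /* one full 64-pixel block */
  memcpy(dst_ptr + n, temp + 128, r); /* copy back only the valid bytes */
}
#endif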
|
||||
|
||||
// Any 1 to 1 with yuvconstants
|
||||
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
|
||||
const struct YuvConstants* yuvconstants, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[128 * 2]); \
|
||||
memset(temp, 0, 128); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
|
||||
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
#if defined(HAS_YUY2TOARGBROW_SSSE3)
|
||||
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
|
||||
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
|
||||
#endif
|
||||
#if defined(HAS_YUY2TOARGBROW_AVX2)
|
||||
ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
|
||||
ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
|
||||
#endif
|
||||
#if defined(HAS_YUY2TOARGBROW_NEON)
|
||||
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
|
||||
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
|
||||
#endif
|
||||
#undef ANY11C
|
||||
|
||||
// Any 1 to 1 blended.
|
||||
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[128 * 2]); \
|
||||
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
|
||||
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
|
||||
ANY_SIMD(temp, temp + 128, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
|
||||
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
|
||||
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
|
||||
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
|
||||
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
|
||||
#endif
|
||||
#undef ANY11B
|
||||
|
||||
// Any 1 to 1 with parameter.
|
||||
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
|
||||
T shuffler, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 2]); \
|
||||
memset(temp, 0, 64); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
|
||||
ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
|
||||
}
|
||||
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
|
||||
ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
|
||||
const uint32, 4, 2, 3)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
|
||||
ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
|
||||
const uint32, 4, 2, 7)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
|
||||
ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
|
||||
const uint32, 4, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSHUFFLEROW_SSE2
|
||||
ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
|
||||
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSHUFFLEROW_AVX2
|
||||
ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSHUFFLEROW_NEON
|
||||
ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
|
||||
#endif
|
||||
#undef ANY11P
|
||||
|
||||
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
|
||||
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
|
||||
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
|
||||
ptrdiff_t src_stride_ptr, int width, \
|
||||
int source_y_fraction) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 3]); \
|
||||
memset(temp, 0, 64 * 2); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
|
||||
memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
|
||||
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_INTERPOLATEROW_AVX2
|
||||
ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_INTERPOLATEROW_SSSE3
|
||||
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_INTERPOLATEROW_NEON
|
||||
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_INTERPOLATEROW_DSPR2
|
||||
ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
|
||||
#endif
|
||||
#undef ANY11T
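
// The interpolating wrapper copies both source rows into the temp buffer and
// passes a stride of 64 so that temp + 64 serves as the second row. A rough
// sketch of ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
// above (the _Sketch name is hypothetical, for illustration only):
#ifdef HAS_INTERPOLATEROW_SSSE3
void InterpolateRow_Any_SSSE3_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride_ptr, int width,
                                     int source_y_fraction) {
  SIMD_ALIGNED(uint8 temp[64 * 3]);
  memset(temp, 0, 64 * 2);            /* for msan */
  int r = width & 15;
  int n = width & ~15;
  if (n > 0) {
    InterpolateRow_SSSE3(dst_ptr, src_ptr, src_stride_ptr, n,
                         source_y_fraction);
  }
  memcpy(temp, src_ptr + n, r);                        /* row 0 remainder */
  memcpy(temp + 64, src_ptr + src_stride_ptr + n, r);  /* row 1 remainder */
  /* stride 64 makes temp + 64 act as the second source row */
  InterpolateRow_SSSE3(temp + 128, temp, 64, 16, source_y_fraction);
  memcpy(dst_ptr + n, temp + 128, r);
}
#endif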
|
||||
|
||||
// Any 1 to 1 mirror.
|
||||
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64 * 2]); \
|
||||
memset(temp, 0, 64); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr, r * BPP); \
|
||||
ANY_SIMD(temp, temp + 64, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_MIRRORROW_AVX2
|
||||
ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_MIRRORROW_NEON
|
||||
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBMIRRORROW_AVX2
|
||||
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_ARGBMIRRORROW_SSE2
|
||||
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_ARGBMIRRORROW_NEON
|
||||
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
|
||||
#endif
|
||||
#undef ANY11M
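
// The mirror wrapper is the one variant whose indexing differs: the SIMD call
// handles the last n source pixels (whose mirror lands at the start of the
// destination), and the leftover head of the source is mirrored through the
// temp buffer into the tail of the destination. A rough sketch of
// ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) above (the _Sketch
// name is hypothetical, for illustration only):
#ifdef HAS_MIRRORROW_SSSE3
void MirrorRow_Any_SSSE3_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int width) {
  SIMD_ALIGNED(uint8 temp[64 * 2]);
  memset(temp, 0, 64);                         /* for msan */
  int r = width & 15;
  int n = width & ~15;
  if (n > 0) {
    MirrorRow_SSSE3(src_ptr + r, dst_ptr, n);  /* mirror of src[r..width) */
  }
  memcpy(temp, src_ptr, r);                    /* leftover head of the row */
  MirrorRow_SSSE3(temp, temp + 64, 16);        /* mirror one full block */
  memcpy(dst_ptr + n, temp + 64 + (16 - r), r);  /* valid mirrored tail */
}
#endif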
|
||||
|
||||
// Any 1 plane. (memset)
|
||||
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
|
||||
void NAMEANY(uint8* dst_ptr, T v32, int width) { \
|
||||
SIMD_ALIGNED(uint8 temp[64]); \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(dst_ptr, v32, n); \
|
||||
} \
|
||||
ANY_SIMD(temp, v32, MASK + 1); \
|
||||
memcpy(dst_ptr + n * BPP, temp, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SETROW_X86
|
||||
ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
|
||||
#endif
|
||||
#ifdef HAS_SETROW_NEON
|
||||
ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBSETROW_NEON
|
||||
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
|
||||
#endif
|
||||
#undef ANY1
|
||||
|
||||
// Any 1 to 2. Outputs UV planes.
|
||||
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
|
||||
SIMD_ALIGNED(uint8 temp[128 * 3]); \
|
||||
memset(temp, 0, 128); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
|
||||
/* repeat last 4 bytes for 422 subsampler */ \
|
||||
if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
|
||||
memcpy(temp + SS(r, UVSHIFT) * BPP, \
|
||||
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
|
||||
} \
|
||||
/* repeat last 4 - 12 bytes for 411 subsampler */ \
|
||||
if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
|
||||
memcpy(temp + SS(r, UVSHIFT) * BPP, \
|
||||
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
|
||||
memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
|
||||
temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
|
||||
} \
|
||||
if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
|
||||
memcpy(temp + SS(r, UVSHIFT) * BPP, \
|
||||
temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
|
||||
} \
|
||||
if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
|
||||
memcpy(temp + SS(r, UVSHIFT) * BPP, \
|
||||
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
|
||||
} \
|
||||
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
|
||||
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
|
||||
memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SPLITUVROW_SSE2
|
||||
ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITUVROW_AVX2
|
||||
ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
|
||||
#endif
|
||||
#ifdef HAS_SPLITUVROW_NEON
|
||||
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITUVROW_DSPR2
|
||||
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444ROW_SSSE3
|
||||
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUV422ROW_AVX2
|
||||
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
|
||||
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUV422ROW_SSE2
|
||||
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
|
||||
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUV422ROW_NEON
|
||||
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
|
||||
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
|
||||
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
|
||||
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
|
||||
#endif
|
||||
#undef ANY12
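
// The two-output wrappers work the same way, just with a second temp region
// and destination. A rough sketch of what
// ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) above expands to
// (the 422/411 replication branches drop out because BPP is 2 and DUVSHIFT
// is 0; the _Sketch name is hypothetical, for illustration only):
#ifdef HAS_SPLITUVROW_SSE2
void SplitUVRow_Any_SSE2_Sketch(const uint8* src_ptr, uint8* dst_u,
                                uint8* dst_v, int width) {
  SIMD_ALIGNED(uint8 temp[128 * 3]);
  memset(temp, 0, 128);                    /* for msan */
  int r = width & 15;
  int n = width & ~15;
  if (n > 0) {
    SplitUVRow_SSE2(src_ptr, dst_u, dst_v, n);
  }
  memcpy(temp, src_ptr + n * 2, r * 2);    /* remaining interleaved UV pairs */
  SplitUVRow_SSE2(temp, temp + 128, temp + 256, 16);
  memcpy(dst_u + n, temp + 128, r);
  memcpy(dst_v + n, temp + 256, r);
}
#endif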

// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)  \
  void NAMEANY(const uint8* src_ptr, int src_stride_ptr,  \
               uint8* dst_u, uint8* dst_v, int width) {  \
    SIMD_ALIGNED(uint8 temp[128 * 4]);  \
    memset(temp, 0, 128 * 2);  /* for msan */  \
    int r = width & MASK;  \
    int n = width & ~MASK;  \
    if (n > 0) {  \
      ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);  \
    }  \
    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);  \
    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,  \
           SS(r, UVSHIFT) * BPP);  \
    if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */ \
      memcpy(temp + SS(r, UVSHIFT) * BPP,  \
             temp + SS(r, UVSHIFT) * BPP - BPP, BPP);  \
      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,  \
             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);  \
    }  \
    ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);  \
    memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));  \
    memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));  \
  }

#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
#endif
#ifdef HAS_YUY2TOUVROW_AVX2
ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
#endif
#ifdef HAS_YUY2TOUVROW_SSE2
ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
#undef ANY12S
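
// The ANY12S kernels instantiated above (ARGBToUVRow_* and friends) read two
// source rows, average each 2x2 block, and emit one U and one V byte per
// block.  The function below is an editor-added, plain-C sketch of that
// arithmetic for ARGB input; it is illustrative only, its BT.601-style
// coefficients and rounding may differ slightly from the tuned kernels and
// from ARGBToUVRow_C, and odd widths are omitted for brevity.
static void ARGBToUVSketch_C(const uint8* src_argb, int src_stride_argb,
                             uint8* dst_u, uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    const uint8* p0 = src_argb + x * 4;                    // top two pixels
    const uint8* p1 = src_argb + src_stride_argb + x * 4;  // bottom two pixels
    // ARGB is stored little-endian as B, G, R, A; average the 2x2 block.
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    dst_u[x >> 1] = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    dst_v[x >> 1] = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
  }
}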

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
2614
libs/libyuv/source/row_common.cc
Normal file
File diff suppressed because it is too large
5507
libs/libyuv/source/row_gcc.cc
Normal file
File diff suppressed because it is too large
782
libs/libyuv/source/row_mips.cc
Normal file
@@ -0,0 +1,782 @@
|
||||
/*
|
||||
* Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The following are available on Mips platforms:
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32)
|
||||
|
||||
#ifdef HAS_COPYROW_MIPS
|
||||
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
|
||||
__asm__ __volatile__ (
|
||||
".set noreorder \n"
|
||||
".set noat \n"
|
||||
"slti $at, %[count], 8 \n"
|
||||
"bne $at ,$zero, $last8 \n"
|
||||
"xor $t8, %[src], %[dst] \n"
|
||||
"andi $t8, $t8, 0x3 \n"
|
||||
|
||||
"bne $t8, $zero, unaligned \n"
|
||||
"negu $a3, %[dst] \n"
|
||||
// make dst/src aligned
|
||||
"andi $a3, $a3, 0x3 \n"
|
||||
"beq $a3, $zero, $chk16w \n"
|
||||
// now word-aligned; count is the remaining byte count
|
||||
"subu %[count], %[count], $a3 \n"
|
||||
|
||||
"lwr $t8, 0(%[src]) \n"
|
||||
"addu %[src], %[src], $a3 \n"
|
||||
"swr $t8, 0(%[dst]) \n"
|
||||
"addu %[dst], %[dst], $a3 \n"
|
||||
|
||||
// Now the dst/src are mutually word-aligned with word-aligned addresses
|
||||
"$chk16w: \n"
|
||||
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
|
||||
// t8 is the byte count after 64-byte chunks
|
||||
"beq %[count], $t8, chk8w \n"
|
||||
// There will be at most 1 32-byte chunk after it
|
||||
"subu $a3, %[count], $t8 \n" // the reminder
|
||||
// Here a3 counts bytes in 16w chunks
|
||||
"addu $a3, %[dst], $a3 \n"
|
||||
// Now a3 is the final dst after 64-byte chunks
|
||||
"addu $t0, %[dst], %[count] \n"
|
||||
// t0 is the "past the end" address
|
||||
|
||||
// When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
|
||||
// the "t0-32" address
|
||||
// This means: for x=128 the last "safe" a1 address is "t0-160"
|
||||
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
|
||||
// we will use "pref 30,128(a1)", so "t0-160" is the limit
|
||||
"subu $t9, $t0, 160 \n"
|
||||
// t9 is the "last safe pref 30,128(a1)" address
|
||||
"pref 0, 0(%[src]) \n" // first line of src
|
||||
"pref 0, 32(%[src]) \n" // second line of src
|
||||
"pref 0, 64(%[src]) \n"
|
||||
"pref 30, 32(%[dst]) \n"
|
||||
// In case the a1 > t9 don't use "pref 30" at all
|
||||
"sgtu $v1, %[dst], $t9 \n"
|
||||
"bgtz $v1, $loop16w \n"
|
||||
"nop \n"
|
||||
// otherwise, start with using pref30
|
||||
"pref 30, 64(%[dst]) \n"
|
||||
"$loop16w: \n"
|
||||
"pref 0, 96(%[src]) \n"
|
||||
"lw $t0, 0(%[src]) \n"
|
||||
"bgtz $v1, $skip_pref30_96 \n" // skip
|
||||
"lw $t1, 4(%[src]) \n"
|
||||
"pref 30, 96(%[dst]) \n" // continue
|
||||
"$skip_pref30_96: \n"
|
||||
"lw $t2, 8(%[src]) \n"
|
||||
"lw $t3, 12(%[src]) \n"
|
||||
"lw $t4, 16(%[src]) \n"
|
||||
"lw $t5, 20(%[src]) \n"
|
||||
"lw $t6, 24(%[src]) \n"
|
||||
"lw $t7, 28(%[src]) \n"
|
||||
"pref 0, 128(%[src]) \n"
|
||||
// bring the next lines of src, addr 128
|
||||
"sw $t0, 0(%[dst]) \n"
|
||||
"sw $t1, 4(%[dst]) \n"
|
||||
"sw $t2, 8(%[dst]) \n"
|
||||
"sw $t3, 12(%[dst]) \n"
|
||||
"sw $t4, 16(%[dst]) \n"
|
||||
"sw $t5, 20(%[dst]) \n"
|
||||
"sw $t6, 24(%[dst]) \n"
|
||||
"sw $t7, 28(%[dst]) \n"
|
||||
"lw $t0, 32(%[src]) \n"
|
||||
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
|
||||
"lw $t1, 36(%[src]) \n"
|
||||
"pref 30, 128(%[dst]) \n" // set dest, addr 128
|
||||
"$skip_pref30_128: \n"
|
||||
"lw $t2, 40(%[src]) \n"
|
||||
"lw $t3, 44(%[src]) \n"
|
||||
"lw $t4, 48(%[src]) \n"
|
||||
"lw $t5, 52(%[src]) \n"
|
||||
"lw $t6, 56(%[src]) \n"
|
||||
"lw $t7, 60(%[src]) \n"
|
||||
"pref 0, 160(%[src]) \n"
|
||||
// bring the next lines of src, addr 160
|
||||
"sw $t0, 32(%[dst]) \n"
|
||||
"sw $t1, 36(%[dst]) \n"
|
||||
"sw $t2, 40(%[dst]) \n"
|
||||
"sw $t3, 44(%[dst]) \n"
|
||||
"sw $t4, 48(%[dst]) \n"
|
||||
"sw $t5, 52(%[dst]) \n"
|
||||
"sw $t6, 56(%[dst]) \n"
|
||||
"sw $t7, 60(%[dst]) \n"
|
||||
|
||||
"addiu %[dst], %[dst], 64 \n" // adding 64 to dest
|
||||
"sgtu $v1, %[dst], $t9 \n"
|
||||
"bne %[dst], $a3, $loop16w \n"
|
||||
" addiu %[src], %[src], 64 \n" // adding 64 to src
|
||||
"move %[count], $t8 \n"
|
||||
|
||||
// Here we have src and dest word-aligned but less than 64-bytes to go
|
||||
|
||||
"chk8w: \n"
|
||||
"pref 0, 0x0(%[src]) \n"
|
||||
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
|
||||
// t8 is the remainder count past 32 bytes
|
||||
"beq %[count], $t8, chk1w \n"
|
||||
// count=t8,no 32-byte chunk
|
||||
" nop \n"
|
||||
|
||||
"lw $t0, 0(%[src]) \n"
|
||||
"lw $t1, 4(%[src]) \n"
|
||||
"lw $t2, 8(%[src]) \n"
|
||||
"lw $t3, 12(%[src]) \n"
|
||||
"lw $t4, 16(%[src]) \n"
|
||||
"lw $t5, 20(%[src]) \n"
|
||||
"lw $t6, 24(%[src]) \n"
|
||||
"lw $t7, 28(%[src]) \n"
|
||||
"addiu %[src], %[src], 32 \n"
|
||||
|
||||
"sw $t0, 0(%[dst]) \n"
|
||||
"sw $t1, 4(%[dst]) \n"
|
||||
"sw $t2, 8(%[dst]) \n"
|
||||
"sw $t3, 12(%[dst]) \n"
|
||||
"sw $t4, 16(%[dst]) \n"
|
||||
"sw $t5, 20(%[dst]) \n"
|
||||
"sw $t6, 24(%[dst]) \n"
|
||||
"sw $t7, 28(%[dst]) \n"
|
||||
"addiu %[dst], %[dst], 32 \n"
|
||||
|
||||
"chk1w: \n"
|
||||
"andi %[count], $t8, 0x3 \n"
|
||||
// now count is the remainder past 1w chunks
|
||||
"beq %[count], $t8, $last8 \n"
|
||||
" subu $a3, $t8, %[count] \n"
|
||||
// a3 is count of bytes in 1w chunks
|
||||
"addu $a3, %[dst], $a3 \n"
|
||||
// now a3 is the dst address past the 1w chunks
|
||||
// copying in words (4-byte chunks)
|
||||
"$wordCopy_loop: \n"
|
||||
"lw $t3, 0(%[src]) \n"
|
||||
// the first t3 may be equal t0 ... optimize?
|
||||
"addiu %[src], %[src],4 \n"
|
||||
"addiu %[dst], %[dst],4 \n"
|
||||
"bne %[dst], $a3,$wordCopy_loop \n"
|
||||
" sw $t3, -4(%[dst]) \n"
|
||||
|
||||
// For the last (<8) bytes
|
||||
"$last8: \n"
|
||||
"blez %[count], leave \n"
|
||||
" addu $a3, %[dst], %[count] \n" // a3 -last dst address
|
||||
"$last8loop: \n"
|
||||
"lb $v1, 0(%[src]) \n"
|
||||
"addiu %[src], %[src], 1 \n"
|
||||
"addiu %[dst], %[dst], 1 \n"
|
||||
"bne %[dst], $a3, $last8loop \n"
|
||||
" sb $v1, -1(%[dst]) \n"
|
||||
|
||||
"leave: \n"
|
||||
" j $ra \n"
|
||||
" nop \n"
|
||||
|
||||
//
|
||||
// UNALIGNED case
|
||||
//
|
||||
|
||||
"unaligned: \n"
|
||||
// got here with a3="negu a1"
|
||||
"andi $a3, $a3, 0x3 \n" // a1 is word aligned?
|
||||
"beqz $a3, $ua_chk16w \n"
|
||||
" subu %[count], %[count], $a3 \n"
|
||||
// bytes left after initial a3 bytes
|
||||
"lwr $v1, 0(%[src]) \n"
|
||||
"lwl $v1, 3(%[src]) \n"
|
||||
"addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
|
||||
"swr $v1, 0(%[dst]) \n"
|
||||
"addu %[dst], %[dst], $a3 \n"
|
||||
// below the dst will be word aligned (NOTE1)
|
||||
"$ua_chk16w: \n"
|
||||
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
|
||||
// t8 is the byte count after 64-byte chunks
|
||||
"beq %[count], $t8, ua_chk8w \n"
|
||||
// if a2==t8, no 64-byte chunks
|
||||
// There will be at most 1 32-byte chunk after it
|
||||
"subu $a3, %[count], $t8 \n" // the reminder
|
||||
// Here a3 counts bytes in 16w chunks
|
||||
"addu $a3, %[dst], $a3 \n"
|
||||
// Now a3 is the final dst after 64-byte chunks
|
||||
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
|
||||
"subu $t9, $t0, 160 \n"
|
||||
// t9 is the "last safe pref 30,128(a1)" address
|
||||
"pref 0, 0(%[src]) \n" // first line of src
|
||||
"pref 0, 32(%[src]) \n" // second line addr 32
|
||||
"pref 0, 64(%[src]) \n"
|
||||
"pref 30, 32(%[dst]) \n"
|
||||
// safe, as we have at least 64 bytes ahead
|
||||
// In case the a1 > t9 don't use "pref 30" at all
|
||||
"sgtu $v1, %[dst], $t9 \n"
|
||||
"bgtz $v1, $ua_loop16w \n"
|
||||
// skip "pref 30,64(a1)" for too short arrays
|
||||
" nop \n"
|
||||
// otherwise, start with using pref30
|
||||
"pref 30, 64(%[dst]) \n"
|
||||
"$ua_loop16w: \n"
|
||||
"pref 0, 96(%[src]) \n"
|
||||
"lwr $t0, 0(%[src]) \n"
|
||||
"lwl $t0, 3(%[src]) \n"
|
||||
"lwr $t1, 4(%[src]) \n"
|
||||
"bgtz $v1, $ua_skip_pref30_96 \n"
|
||||
" lwl $t1, 7(%[src]) \n"
|
||||
"pref 30, 96(%[dst]) \n"
|
||||
// continue setting up the dest, addr 96
|
||||
"$ua_skip_pref30_96: \n"
|
||||
"lwr $t2, 8(%[src]) \n"
|
||||
"lwl $t2, 11(%[src]) \n"
|
||||
"lwr $t3, 12(%[src]) \n"
|
||||
"lwl $t3, 15(%[src]) \n"
|
||||
"lwr $t4, 16(%[src]) \n"
|
||||
"lwl $t4, 19(%[src]) \n"
|
||||
"lwr $t5, 20(%[src]) \n"
|
||||
"lwl $t5, 23(%[src]) \n"
|
||||
"lwr $t6, 24(%[src]) \n"
|
||||
"lwl $t6, 27(%[src]) \n"
|
||||
"lwr $t7, 28(%[src]) \n"
|
||||
"lwl $t7, 31(%[src]) \n"
|
||||
"pref 0, 128(%[src]) \n"
|
||||
// bring the next lines of src, addr 128
|
||||
"sw $t0, 0(%[dst]) \n"
|
||||
"sw $t1, 4(%[dst]) \n"
|
||||
"sw $t2, 8(%[dst]) \n"
|
||||
"sw $t3, 12(%[dst]) \n"
|
||||
"sw $t4, 16(%[dst]) \n"
|
||||
"sw $t5, 20(%[dst]) \n"
|
||||
"sw $t6, 24(%[dst]) \n"
|
||||
"sw $t7, 28(%[dst]) \n"
|
||||
"lwr $t0, 32(%[src]) \n"
|
||||
"lwl $t0, 35(%[src]) \n"
|
||||
"lwr $t1, 36(%[src]) \n"
|
||||
"bgtz $v1, ua_skip_pref30_128 \n"
|
||||
" lwl $t1, 39(%[src]) \n"
|
||||
"pref 30, 128(%[dst]) \n"
|
||||
// continue setting up the dest, addr 128
|
||||
"ua_skip_pref30_128: \n"
|
||||
|
||||
"lwr $t2, 40(%[src]) \n"
|
||||
"lwl $t2, 43(%[src]) \n"
|
||||
"lwr $t3, 44(%[src]) \n"
|
||||
"lwl $t3, 47(%[src]) \n"
|
||||
"lwr $t4, 48(%[src]) \n"
|
||||
"lwl $t4, 51(%[src]) \n"
|
||||
"lwr $t5, 52(%[src]) \n"
|
||||
"lwl $t5, 55(%[src]) \n"
|
||||
"lwr $t6, 56(%[src]) \n"
|
||||
"lwl $t6, 59(%[src]) \n"
|
||||
"lwr $t7, 60(%[src]) \n"
|
||||
"lwl $t7, 63(%[src]) \n"
|
||||
"pref 0, 160(%[src]) \n"
|
||||
// bring the next lines of src, addr 160
|
||||
"sw $t0, 32(%[dst]) \n"
|
||||
"sw $t1, 36(%[dst]) \n"
|
||||
"sw $t2, 40(%[dst]) \n"
|
||||
"sw $t3, 44(%[dst]) \n"
|
||||
"sw $t4, 48(%[dst]) \n"
|
||||
"sw $t5, 52(%[dst]) \n"
|
||||
"sw $t6, 56(%[dst]) \n"
|
||||
"sw $t7, 60(%[dst]) \n"
|
||||
|
||||
"addiu %[dst],%[dst],64 \n" // adding 64 to dest
|
||||
"sgtu $v1,%[dst],$t9 \n"
|
||||
"bne %[dst],$a3,$ua_loop16w \n"
|
||||
" addiu %[src],%[src],64 \n" // adding 64 to src
|
||||
"move %[count],$t8 \n"
|
||||
|
||||
// Here we have src and dest word-aligned but less than 64-bytes to go
|
||||
|
||||
"ua_chk8w: \n"
|
||||
"pref 0, 0x0(%[src]) \n"
|
||||
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
|
||||
// t8 is the remainder count
|
||||
"beq %[count], $t8, $ua_chk1w \n"
|
||||
// when count==t8, no 32-byte chunk
|
||||
|
||||
"lwr $t0, 0(%[src]) \n"
|
||||
"lwl $t0, 3(%[src]) \n"
|
||||
"lwr $t1, 4(%[src]) \n"
|
||||
"lwl $t1, 7(%[src]) \n"
|
||||
"lwr $t2, 8(%[src]) \n"
|
||||
"lwl $t2, 11(%[src]) \n"
|
||||
"lwr $t3, 12(%[src]) \n"
|
||||
"lwl $t3, 15(%[src]) \n"
|
||||
"lwr $t4, 16(%[src]) \n"
|
||||
"lwl $t4, 19(%[src]) \n"
|
||||
"lwr $t5, 20(%[src]) \n"
|
||||
"lwl $t5, 23(%[src]) \n"
|
||||
"lwr $t6, 24(%[src]) \n"
|
||||
"lwl $t6, 27(%[src]) \n"
|
||||
"lwr $t7, 28(%[src]) \n"
|
||||
"lwl $t7, 31(%[src]) \n"
|
||||
"addiu %[src], %[src], 32 \n"
|
||||
|
||||
"sw $t0, 0(%[dst]) \n"
|
||||
"sw $t1, 4(%[dst]) \n"
|
||||
"sw $t2, 8(%[dst]) \n"
|
||||
"sw $t3, 12(%[dst]) \n"
|
||||
"sw $t4, 16(%[dst]) \n"
|
||||
"sw $t5, 20(%[dst]) \n"
|
||||
"sw $t6, 24(%[dst]) \n"
|
||||
"sw $t7, 28(%[dst]) \n"
|
||||
"addiu %[dst], %[dst], 32 \n"
|
||||
|
||||
"$ua_chk1w: \n"
|
||||
"andi %[count], $t8, 0x3 \n"
|
||||
// now count is the remainder past 1w chunks
|
||||
"beq %[count], $t8, ua_smallCopy \n"
|
||||
"subu $a3, $t8, %[count] \n"
|
||||
// a3 is count of bytes in 1w chunks
|
||||
"addu $a3, %[dst], $a3 \n"
|
||||
// now a3 is the dst address past the 1w chunks
|
||||
|
||||
// copying in words (4-byte chunks)
|
||||
"$ua_wordCopy_loop: \n"
|
||||
"lwr $v1, 0(%[src]) \n"
|
||||
"lwl $v1, 3(%[src]) \n"
|
||||
"addiu %[src], %[src], 4 \n"
|
||||
"addiu %[dst], %[dst], 4 \n"
|
||||
// note: dst=a1 is word aligned here, see NOTE1
|
||||
"bne %[dst], $a3, $ua_wordCopy_loop \n"
|
||||
" sw $v1,-4(%[dst]) \n"
|
||||
|
||||
// Now less than 4 bytes (value in count) left to copy
|
||||
"ua_smallCopy: \n"
|
||||
"beqz %[count], leave \n"
|
||||
" addu $a3, %[dst], %[count] \n" // a3 = last dst address
|
||||
"$ua_smallCopy_loop: \n"
|
||||
"lb $v1, 0(%[src]) \n"
|
||||
"addiu %[src], %[src], 1 \n"
|
||||
"addiu %[dst], %[dst], 1 \n"
|
||||
"bne %[dst],$a3,$ua_smallCopy_loop \n"
|
||||
" sb $v1, -1(%[dst]) \n"
|
||||
|
||||
"j $ra \n"
|
||||
" nop \n"
|
||||
".set at \n"
|
||||
".set reorder \n"
|
||||
: [dst] "+r" (dst), [src] "+r" (src)
|
||||
: [count] "r" (count)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
|
||||
"t8", "t9", "a3", "v1", "at"
|
||||
);
|
||||
}
|
||||
#endif // HAS_COPYROW_MIPS
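
// Functionally, CopyRow_MIPS above is an unrolled, prefetching byte copy.
// The portable reference it accelerates, CopyRow_C in row_common.cc, is
// simply a memcpy; shown here as a comment only, since the real definition
// lives in that file:
//
//   void CopyRow_C(const uint8* src, uint8* dst, int count) {
//     memcpy(dst, src, count);
//   }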
|
||||
|
||||
// DSPR2 functions
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
|
||||
(__mips_dsp_rev >= 2) && \
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
|
||||
|
||||
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"srl $t4, %[width], 4 \n" // multiplies of 16
|
||||
"blez $t4, 2f \n"
|
||||
" andi %[width], %[width], 0xf \n" // residual
|
||||
|
||||
"1: \n"
|
||||
"addiu $t4, $t4, -1 \n"
|
||||
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
|
||||
"lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
|
||||
"lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
|
||||
"lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
|
||||
"lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
|
||||
"lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
|
||||
"lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
|
||||
"lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
|
||||
"addiu %[src_uv], %[src_uv], 32 \n"
|
||||
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
|
||||
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
|
||||
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
|
||||
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
|
||||
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
|
||||
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
|
||||
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
|
||||
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
|
||||
"sw $t9, 0(%[dst_v]) \n"
|
||||
"sw $t0, 0(%[dst_u]) \n"
|
||||
"sw $t1, 4(%[dst_v]) \n"
|
||||
"sw $t2, 4(%[dst_u]) \n"
|
||||
"sw $t3, 8(%[dst_v]) \n"
|
||||
"sw $t5, 8(%[dst_u]) \n"
|
||||
"sw $t6, 12(%[dst_v]) \n"
|
||||
"sw $t7, 12(%[dst_u]) \n"
|
||||
"addiu %[dst_v], %[dst_v], 16 \n"
|
||||
"bgtz $t4, 1b \n"
|
||||
" addiu %[dst_u], %[dst_u], 16 \n"
|
||||
|
||||
"beqz %[width], 3f \n"
|
||||
" nop \n"
|
||||
|
||||
"2: \n"
|
||||
"lbu $t0, 0(%[src_uv]) \n"
|
||||
"lbu $t1, 1(%[src_uv]) \n"
|
||||
"addiu %[src_uv], %[src_uv], 2 \n"
|
||||
"addiu %[width], %[width], -1 \n"
|
||||
"sb $t0, 0(%[dst_u]) \n"
|
||||
"sb $t1, 0(%[dst_v]) \n"
|
||||
"addiu %[dst_u], %[dst_u], 1 \n"
|
||||
"bgtz %[width], 2b \n"
|
||||
" addiu %[dst_v], %[dst_v], 1 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_uv] "+r" (src_uv),
|
||||
[width] "+r" (width),
|
||||
[dst_u] "+r" (dst_u),
|
||||
[dst_v] "+r" (dst_v)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3",
|
||||
"t4", "t5", "t6", "t7", "t8", "t9"
|
||||
);
|
||||
}
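
// SplitUVRow deinterleaves a packed UV row into separate U and V planes; the
// DSPR2 code above handles 16 pixels per iteration with the precr/precrq pack
// instructions and a scalar tail loop.  Editor-added plain-C sketch of the
// same operation (compare SplitUVRow_C in row_common.cc):
static void SplitUVRowSketch_C(const uint8* src_uv, uint8* dst_u,
                               uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      // even bytes are U
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes are V
  }
}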
|
||||
|
||||
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"srl $t4, %[width], 4 \n" // multiplies of 16
|
||||
"andi $t5, %[width], 0xf \n"
|
||||
"blez $t4, 2f \n"
|
||||
" addu %[src], %[src], %[width] \n" // src += width
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
|
||||
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
|
||||
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
|
||||
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
|
||||
"wsbh $t0, $t0 \n" // |2|3|0|1|
|
||||
"wsbh $t1, $t1 \n" // |6|7|4|5|
|
||||
"wsbh $t2, $t2 \n" // |10|11|8|9|
|
||||
"wsbh $t3, $t3 \n" // |14|15|12|13|
|
||||
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
|
||||
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
|
||||
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
|
||||
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
|
||||
"addiu %[src], %[src], -16 \n"
|
||||
"addiu $t4, $t4, -1 \n"
|
||||
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
|
||||
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
|
||||
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
|
||||
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
|
||||
"bgtz $t4, 1b \n"
|
||||
" addiu %[dst], %[dst], 16 \n"
|
||||
"beqz $t5, 3f \n"
|
||||
" nop \n"
|
||||
|
||||
"2: \n"
|
||||
"lbu $t0, -1(%[src]) \n"
|
||||
"addiu $t5, $t5, -1 \n"
|
||||
"addiu %[src], %[src], -1 \n"
|
||||
"sb $t0, 0(%[dst]) \n"
|
||||
"bgez $t5, 2b \n"
|
||||
" addiu %[dst], %[dst], 1 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src] "+r" (src), [dst] "+r" (dst)
|
||||
: [width] "r" (width)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5"
|
||||
);
|
||||
}
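
// MirrorRow reverses the byte order of a row; the DSPR2 code above swaps
// bytes within words (wsbh + rotr) and stores the words in reverse order,
// 16 bytes per iteration.  Editor-added plain-C sketch of the same result:
static void MirrorRowSketch_C(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;    // start from the last source byte
  for (x = 0; x < width; ++x) {
    dst[x] = src[-x];  // write bytes out back to front
  }
}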
|
||||
|
||||
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
int width) {
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"addu $t4, %[width], %[width] \n"
|
||||
"srl %[x], %[width], 4 \n"
|
||||
"andi %[y], %[width], 0xf \n"
|
||||
"blez %[x], 2f \n"
|
||||
" addu %[src_uv], %[src_uv], $t4 \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
|
||||
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
|
||||
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
|
||||
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
|
||||
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
|
||||
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
|
||||
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
|
||||
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
|
||||
|
||||
"rotr $t0, $t0, 16 \n" // |1|0|3|2|
|
||||
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
|
||||
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
|
||||
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
|
||||
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
|
||||
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
|
||||
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
|
||||
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
|
||||
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
|
||||
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
|
||||
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
|
||||
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
|
||||
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
|
||||
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
|
||||
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
|
||||
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
|
||||
"addiu %[src_uv], %[src_uv], -32 \n"
|
||||
"addiu %[x], %[x], -1 \n"
|
||||
"swr $t4, 0(%[dst_u]) \n"
|
||||
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
|
||||
"swr $t6, 0(%[dst_v]) \n"
|
||||
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
|
||||
"swr $t2, 4(%[dst_u]) \n"
|
||||
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
|
||||
"swr $t3, 4(%[dst_v]) \n"
|
||||
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
|
||||
"swr $t0, 8(%[dst_u]) \n"
|
||||
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
|
||||
"swr $t1, 8(%[dst_v]) \n"
|
||||
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
|
||||
"swr $t9, 12(%[dst_u]) \n"
|
||||
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
|
||||
"swr $t5, 12(%[dst_v]) \n"
|
||||
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
|
||||
"addiu %[dst_v], %[dst_v], 16 \n"
|
||||
"bgtz %[x], 1b \n"
|
||||
" addiu %[dst_u], %[dst_u], 16 \n"
|
||||
"beqz %[y], 3f \n"
|
||||
" nop \n"
|
||||
"b 2f \n"
|
||||
" nop \n"
|
||||
|
||||
"2: \n"
|
||||
"lbu $t0, -2(%[src_uv]) \n"
|
||||
"lbu $t1, -1(%[src_uv]) \n"
|
||||
"addiu %[src_uv], %[src_uv], -2 \n"
|
||||
"addiu %[y], %[y], -1 \n"
|
||||
"sb $t0, 0(%[dst_u]) \n"
|
||||
"sb $t1, 0(%[dst_v]) \n"
|
||||
"addiu %[dst_u], %[dst_u], 1 \n"
|
||||
"bgtz %[y], 2b \n"
|
||||
" addiu %[dst_v], %[dst_v], 1 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_uv] "+r" (src_uv),
|
||||
[dst_u] "+r" (dst_u),
|
||||
[dst_v] "+r" (dst_v),
|
||||
[x] "=&r" (x),
|
||||
[y] "+r" (y)
|
||||
: [width] "r" (width)
|
||||
: "t0", "t1", "t2", "t3", "t4",
|
||||
"t5", "t7", "t8", "t9"
|
||||
);
|
||||
}
|
||||
|
||||
// Convert (4 Y and 2 VU) I422 and arrange RGB values into
|
||||
// t5 = | 0 | B0 | 0 | b0 |
|
||||
// t4 = | 0 | B1 | 0 | b1 |
|
||||
// t9 = | 0 | G0 | 0 | g0 |
|
||||
// t8 = | 0 | G1 | 0 | g1 |
|
||||
// t2 = | 0 | R0 | 0 | r0 |
|
||||
// t1 = | 0 | R1 | 0 | r1 |
|
||||
#define YUVTORGB \
|
||||
"lw $t0, 0(%[y_buf]) \n" \
|
||||
"lhu $t1, 0(%[u_buf]) \n" \
|
||||
"lhu $t2, 0(%[v_buf]) \n" \
|
||||
"preceu.ph.qbr $t1, $t1 \n" \
|
||||
"preceu.ph.qbr $t2, $t2 \n" \
|
||||
"preceu.ph.qbra $t3, $t0 \n" \
|
||||
"preceu.ph.qbla $t0, $t0 \n" \
|
||||
"subu.ph $t1, $t1, $s5 \n" \
|
||||
"subu.ph $t2, $t2, $s5 \n" \
|
||||
"subu.ph $t3, $t3, $s4 \n" \
|
||||
"subu.ph $t0, $t0, $s4 \n" \
|
||||
"mul.ph $t3, $t3, $s0 \n" \
|
||||
"mul.ph $t0, $t0, $s0 \n" \
|
||||
"shll.ph $t4, $t1, 0x7 \n" \
|
||||
"subu.ph $t4, $t4, $t1 \n" \
|
||||
"mul.ph $t6, $t1, $s1 \n" \
|
||||
"mul.ph $t1, $t2, $s2 \n" \
|
||||
"addq_s.ph $t5, $t4, $t3 \n" \
|
||||
"addq_s.ph $t4, $t4, $t0 \n" \
|
||||
"shra.ph $t5, $t5, 6 \n" \
|
||||
"shra.ph $t4, $t4, 6 \n" \
|
||||
"addiu %[u_buf], 2 \n" \
|
||||
"addiu %[v_buf], 2 \n" \
|
||||
"addu.ph $t6, $t6, $t1 \n" \
|
||||
"mul.ph $t1, $t2, $s3 \n" \
|
||||
"addu.ph $t9, $t6, $t3 \n" \
|
||||
"addu.ph $t8, $t6, $t0 \n" \
|
||||
"shra.ph $t9, $t9, 6 \n" \
|
||||
"shra.ph $t8, $t8, 6 \n" \
|
||||
"addu.ph $t2, $t1, $t3 \n" \
|
||||
"addu.ph $t1, $t1, $t0 \n" \
|
||||
"shra.ph $t2, $t2, 6 \n" \
|
||||
"shra.ph $t1, $t1, 6 \n" \
|
||||
"subu.ph $t5, $t5, $s5 \n" \
|
||||
"subu.ph $t4, $t4, $s5 \n" \
|
||||
"subu.ph $t9, $t9, $s5 \n" \
|
||||
"subu.ph $t8, $t8, $s5 \n" \
|
||||
"subu.ph $t2, $t2, $s5 \n" \
|
||||
"subu.ph $t1, $t1, $s5 \n" \
|
||||
"shll_s.ph $t5, $t5, 8 \n" \
|
||||
"shll_s.ph $t4, $t4, 8 \n" \
|
||||
"shll_s.ph $t9, $t9, 8 \n" \
|
||||
"shll_s.ph $t8, $t8, 8 \n" \
|
||||
"shll_s.ph $t2, $t2, 8 \n" \
|
||||
"shll_s.ph $t1, $t1, 8 \n" \
|
||||
"shra.ph $t5, $t5, 8 \n" \
|
||||
"shra.ph $t4, $t4, 8 \n" \
|
||||
"shra.ph $t9, $t9, 8 \n" \
|
||||
"shra.ph $t8, $t8, 8 \n" \
|
||||
"shra.ph $t2, $t2, 8 \n" \
|
||||
"shra.ph $t1, $t1, 8 \n" \
|
||||
"addu.ph $t5, $t5, $s5 \n" \
|
||||
"addu.ph $t4, $t4, $s5 \n" \
|
||||
"addu.ph $t9, $t9, $s5 \n" \
|
||||
"addu.ph $t8, $t8, $s5 \n" \
|
||||
"addu.ph $t2, $t2, $s5 \n" \
|
||||
"addu.ph $t1, $t1, $s5 \n"
|
||||
|
||||
// TODO(fbarchard): accept yuv conversion constants.
|
||||
void I422ToARGBRow_DSPR2(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"beqz %[width], 2f \n"
|
||||
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
|
||||
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
|
||||
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
|
||||
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
|
||||
"repl.ph $s4, 16 \n" // |0|16|0|16|
|
||||
"repl.ph $s5, 128 \n" // |128|128| // clipping
|
||||
"lui $s6, 0xff00 \n"
|
||||
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
|
||||
|
||||
"1: \n"
|
||||
YUVTORGB
|
||||
// Arranging into argb format
|
||||
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
|
||||
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
|
||||
"addiu %[width], -4 \n"
|
||||
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
|
||||
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
|
||||
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
|
||||
|
||||
"addiu %[y_buf], 4 \n"
|
||||
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
|
||||
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
|
||||
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
|
||||
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
|
||||
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
|
||||
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
|
||||
"sll $t9, $t9, 16 \n"
|
||||
"sll $t8, $t8, 16 \n"
|
||||
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
|
||||
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
|
||||
// Store results.
|
||||
"sw $t2, 0(%[rgb_buf]) \n"
|
||||
"sw $t0, 4(%[rgb_buf]) \n"
|
||||
"sw $t1, 8(%[rgb_buf]) \n"
|
||||
"sw $t3, 12(%[rgb_buf]) \n"
|
||||
"bnez %[width], 1b \n"
|
||||
" addiu %[rgb_buf], 16 \n"
|
||||
"2: \n"
|
||||
".set pop \n"
|
||||
:[y_buf] "+r" (y_buf),
|
||||
[u_buf] "+r" (u_buf),
|
||||
[v_buf] "+r" (v_buf),
|
||||
[width] "+r" (width),
|
||||
[rgb_buf] "+r" (rgb_buf)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9",
|
||||
"s0", "s1", "s2", "s3",
|
||||
"s4", "s5", "s6"
|
||||
);
|
||||
}
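
// For reference, the fixed-point math evaluated by the YUVTORGB macro and
// I422ToARGBRow_DSPR2 above uses a 6-bit fraction with the constants loaded
// into s0..s5: 74 for Y, 127 for U->B (built as (u << 7) - u), -25/-52 for
// U/V->G, 102 for V->R, and offsets 16 (luma) and 128 (chroma), followed by
// a saturating clamp to 0..255.  The editor-added scalar sketch below shows
// one pixel of that computation; rounding details may differ slightly from
// the packed assembly and from other libyuv conversion paths.
static uint8 ClampSketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * 74;  // luma term, 6-bit fraction
  int u1 = (int)u - 128;
  int v1 = (int)v - 128;
  *b = ClampSketch((y1 + 127 * u1) >> 6);
  *g = ClampSketch((y1 - 25 * u1 - 52 * v1) >> 6);
  *r = ClampSketch((y1 + 102 * v1) >> 6);
}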
|
||||
|
||||
// Bilinear filter 8x2 -> 8x1
|
||||
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
int source_y_fraction) {
|
||||
int y0_fraction = 256 - source_y_fraction;
|
||||
const uint8* src_ptr1 = src_ptr + src_stride;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"replv.ph $t0, %[y0_fraction] \n"
|
||||
"replv.ph $t1, %[source_y_fraction] \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t2, 0(%[src_ptr]) \n"
|
||||
"lw $t3, 0(%[src_ptr1]) \n"
|
||||
"lw $t4, 4(%[src_ptr]) \n"
|
||||
"lw $t5, 4(%[src_ptr1]) \n"
|
||||
"muleu_s.ph.qbl $t6, $t2, $t0 \n"
|
||||
"muleu_s.ph.qbr $t7, $t2, $t0 \n"
|
||||
"muleu_s.ph.qbl $t8, $t3, $t1 \n"
|
||||
"muleu_s.ph.qbr $t9, $t3, $t1 \n"
|
||||
"muleu_s.ph.qbl $t2, $t4, $t0 \n"
|
||||
"muleu_s.ph.qbr $t3, $t4, $t0 \n"
|
||||
"muleu_s.ph.qbl $t4, $t5, $t1 \n"
|
||||
"muleu_s.ph.qbr $t5, $t5, $t1 \n"
|
||||
"addq.ph $t6, $t6, $t8 \n"
|
||||
"addq.ph $t7, $t7, $t9 \n"
|
||||
"addq.ph $t2, $t2, $t4 \n"
|
||||
"addq.ph $t3, $t3, $t5 \n"
|
||||
"shra.ph $t6, $t6, 8 \n"
|
||||
"shra.ph $t7, $t7, 8 \n"
|
||||
"shra.ph $t2, $t2, 8 \n"
|
||||
"shra.ph $t3, $t3, 8 \n"
|
||||
"precr.qb.ph $t6, $t6, $t7 \n"
|
||||
"precr.qb.ph $t2, $t2, $t3 \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 8 \n"
|
||||
"addiu %[src_ptr1], %[src_ptr1], 8 \n"
|
||||
"addiu %[dst_width], %[dst_width], -8 \n"
|
||||
"sw $t6, 0(%[dst_ptr]) \n"
|
||||
"sw $t2, 4(%[dst_ptr]) \n"
|
||||
"bgtz %[dst_width], 1b \n"
|
||||
" addiu %[dst_ptr], %[dst_ptr], 8 \n"
|
||||
|
||||
".set pop \n"
|
||||
: [dst_ptr] "+r" (dst_ptr),
|
||||
[src_ptr1] "+r" (src_ptr1),
|
||||
[src_ptr] "+r" (src_ptr),
|
||||
[dst_width] "+r" (dst_width)
|
||||
: [source_y_fraction] "r" (source_y_fraction),
|
||||
[y0_fraction] "r" (y0_fraction),
|
||||
[src_stride] "r" (src_stride)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9"
|
||||
);
|
||||
}
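
// InterpolateRow blends two source rows with an 8-bit vertical fraction:
// dst = (src * (256 - f) + src1 * f) >> 8.  The assembly above computes
// exactly that, 8 bytes per iteration.  Editor-added plain-C sketch (the
// generic InterpolateRow_C may also special-case f == 0 and f == 128):
static void InterpolateRowSketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int width,
                                   int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction) >> 8);
  }
}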
|
||||
#endif // __mips_dsp_rev >= 2
|
||||
|
||||
#endif // defined(__mips__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
2839
libs/libyuv/source/row_neon.cc
Normal file
File diff suppressed because it is too large
2961
libs/libyuv/source/row_neon64.cc
Normal file
File diff suppressed because it is too large
6241
libs/libyuv/source/row_win.cc
Normal file
File diff suppressed because it is too large
1672
libs/libyuv/source/scale.cc
Normal file
File diff suppressed because it is too large
221
libs/libyuv/source/scale_any.cc
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/scale.h"
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
|
||||
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
|
||||
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
|
||||
int dst_width, int x, int dx) { \
|
||||
int n = dst_width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
|
||||
} \
|
||||
TERP_C(dst_ptr + n * BPP, src_ptr, \
|
||||
dst_width & MASK, x + n * dx, dx); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEFILTERCOLS_NEON
|
||||
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_NEON
|
||||
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
|
||||
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
|
||||
ScaleARGBFilterCols_C, 4, 3)
|
||||
#endif
|
||||
#undef CANY
|
||||
|
||||
// Fixed scale down.
|
||||
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
|
||||
uint8* dst_ptr, int dst_width) { \
|
||||
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
|
||||
int n = dst_width - r; \
|
||||
if (n > 0) { \
|
||||
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
|
||||
} \
|
||||
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
|
||||
dst_ptr + n * BPP, r); \
|
||||
}
|
||||
|
||||
// Fixed scale down for odd source width. Used by I420Blend subsampling.
|
||||
// Since dst_width is (width + 1) / 2, this function scales one less pixel
|
||||
// and copies the last pixel.
|
||||
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
|
||||
uint8* dst_ptr, int dst_width) { \
|
||||
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
|
||||
int n = dst_width - r; \
|
||||
if (n > 0) { \
|
||||
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
|
||||
} \
|
||||
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
|
||||
dst_ptr + n * BPP, r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEROWDOWN2_SSSE3
|
||||
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
|
||||
ScaleRowDown2Linear_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
|
||||
2, 1, 15)
|
||||
SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
|
||||
ScaleRowDown2Box_Odd_C, 2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_AVX2
|
||||
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
|
||||
ScaleRowDown2Linear_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
|
||||
2, 1, 31)
|
||||
SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
|
||||
2, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_NEON
|
||||
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
|
||||
ScaleRowDown2Linear_C, 2, 1, 15)
|
||||
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
|
||||
ScaleRowDown2Box_C, 2, 1, 15)
|
||||
SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
|
||||
ScaleRowDown2Box_Odd_C, 2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_SSSE3
|
||||
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
|
||||
4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_AVX2
|
||||
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
|
||||
4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_NEON
|
||||
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
|
||||
4, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_SSSE3
|
||||
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
|
||||
ScaleRowDown34_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
|
||||
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
|
||||
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_NEON
|
||||
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
|
||||
ScaleRowDown34_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
|
||||
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
|
||||
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
|
||||
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_SSSE3
|
||||
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
|
||||
ScaleRowDown38_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
|
||||
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
|
||||
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_NEON
|
||||
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
|
||||
ScaleRowDown38_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
|
||||
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
|
||||
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
|
||||
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
|
||||
ScaleARGBRowDown2_C, 2, 4, 3)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
|
||||
ScaleARGBRowDown2Linear_C, 2, 4, 3)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
|
||||
ScaleARGBRowDown2Box_C, 2, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_NEON
|
||||
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
|
||||
ScaleARGBRowDown2_C, 2, 4, 7)
|
||||
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
|
||||
ScaleARGBRowDown2Linear_C, 2, 4, 7)
|
||||
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
|
||||
ScaleARGBRowDown2Box_C, 2, 4, 7)
|
||||
#endif
|
||||
#undef SDANY
|
||||
|
||||
// Scale down by even scale factor.
|
||||
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
|
||||
uint8* dst_ptr, int dst_width) { \
|
||||
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
|
||||
int n = dst_width - r; \
|
||||
if (n > 0) { \
|
||||
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
|
||||
} \
|
||||
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
|
||||
src_stepx, dst_ptr + n * BPP, r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
|
||||
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
|
||||
ScaleARGBRowDownEven_C, 4, 3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
|
||||
ScaleARGBRowDownEvenBox_C, 4, 3)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
|
||||
ScaleARGBRowDownEven_C, 4, 3)
|
||||
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
|
||||
ScaleARGBRowDownEvenBox_C, 4, 3)
|
||||
#endif
|
||||
|
||||
// Add rows box filter scale down.
|
||||
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
|
||||
int n = src_width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
|
||||
} \
|
||||
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEADDROW_SSE2
|
||||
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_AVX2
|
||||
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROW_NEON
|
||||
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#undef SAANY
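
// The SAANY wrappers above feed ScaleAddRow, which accumulates a row of
// 8-bit source pixels into a 16-bit sum row for the box filter.  An
// editor-added plain-C sketch of that accumulate step (compare
// ScaleAddRow_C in scale_common.cc):
static void ScaleAddRowSketch_C(const uint8* src_ptr, uint16* dst_ptr,
                                int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];  // accumulate; the caller divides by row count
  }
}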
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
859
libs/libyuv/source/scale_argb.cc
Normal file
@@ -0,0 +1,859 @@
|
||||
/*
|
||||
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/scale.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/planar_functions.h" // For CopyARGB
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static __inline int Abs(int v) {
|
||||
return v >= 0 ? v : -v;
|
||||
}
|
||||
|
||||
// ScaleARGB ARGB, 1/2
|
||||
// This is an optimized version for scaling down a ARGB to 1/2 of
|
||||
// its original size.
|
||||
static void ScaleARGBDown2(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_argb, uint8* dst_argb,
|
||||
int x, int dx, int y, int dy,
|
||||
enum FilterMode filtering) {
|
||||
int j;
|
||||
int row_stride = src_stride * (dy >> 16);
|
||||
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width) =
|
||||
filtering == kFilterNone ? ScaleARGBRowDown2_C :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
|
||||
ScaleARGBRowDown2Box_C);
|
||||
assert(dx == 65536 * 2); // Test scale factor of 2.
|
||||
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
|
||||
// Advance to odd row, even column.
|
||||
if (filtering == kFilterBilinear) {
|
||||
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
|
||||
} else {
|
||||
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
|
||||
}
|
||||
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
|
||||
ScaleARGBRowDown2Box_Any_SSE2);
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
|
||||
ScaleARGBRowDown2Box_SSE2);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
|
||||
ScaleARGBRowDown2Box_Any_NEON);
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
|
||||
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
|
||||
ScaleARGBRowDown2Box_NEON);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (filtering == kFilterLinear) {
|
||||
src_stride = 0;
|
||||
}
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
|
||||
src_argb += row_stride;
|
||||
dst_argb += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
// ScaleARGB ARGB, 1/4
|
||||
// This is an optimized version for scaling down a ARGB to 1/4 of
|
||||
// its original size.
|
||||
static void ScaleARGBDown4Box(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_argb, uint8* dst_argb,
|
||||
int x, int dx, int y, int dy) {
|
||||
int j;
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
int row_stride = src_stride * (dy >> 16);
|
||||
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
|
||||
// Advance to odd row, even column.
|
||||
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
|
||||
assert(dx == 65536 * 4); // Test scale factor of 4.
|
||||
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
|
||||
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
|
||||
row + kRowSize, dst_width * 2);
|
||||
ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
|
||||
src_argb += row_stride;
|
||||
dst_argb += dst_stride;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
}
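
// The scalers in this file walk the source with 16.16 fixed-point
// coordinates: x and dx are source positions scaled by 65536, so the source
// pixel for destination column j is src[(x + j * dx) >> 16].  The function
// below is an editor-added nearest-neighbour sketch of that column step for
// ARGB (compare ScaleARGBCols_C); it is illustrative only:
static void ScaleARGBColsSketch_C(uint8* dst_argb, const uint8* src_argb,
                                  int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // pick the nearest source pixel
    x += dx;                // advance in 16.16 fixed point
  }
}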
|
||||
|
||||
// ScaleARGB ARGB Even
|
||||
// This is an optimized version for scaling down a ARGB to even
|
||||
// multiple of its original size.
|
||||
static void ScaleARGBDownEven(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_argb, uint8* dst_argb,
|
||||
int x, int dx, int y, int dy,
|
||||
enum FilterMode filtering) {
|
||||
int j;
|
||||
int col_step = dx >> 16;
|
||||
int row_stride = (dy >> 16) * src_stride;
|
||||
void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
int src_step, uint8* dst_argb, int dst_width) =
|
||||
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
|
||||
assert(IS_ALIGNED(src_width, 2));
|
||||
assert(IS_ALIGNED(src_height, 2));
|
||||
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
|
||||
ScaleARGBRowDownEven_Any_SSE2;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
|
||||
ScaleARGBRowDownEven_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
|
||||
ScaleARGBRowDownEven_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
|
||||
ScaleARGBRowDownEven_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (filtering == kFilterLinear) {
|
||||
src_stride = 0;
|
||||
}
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
|
||||
src_argb += row_stride;
|
||||
dst_argb += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
// Scale ARGB down with bilinear interpolation.
|
||||
static void ScaleARGBBilinearDown(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_argb, uint8* dst_argb,
|
||||
int x, int dx, int y, int dy,
|
||||
enum FilterMode filtering) {
|
||||
int j;
|
||||
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
|
||||
InterpolateRow_C;
|
||||
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) =
|
||||
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
|
||||
int64 xlast = x + (int64)(dst_width - 1) * dx;
|
||||
int64 xl = (dx >= 0) ? x : xlast;
|
||||
int64 xr = (dx >= 0) ? xlast : x;
|
||||
int clip_src_width;
|
||||
xl = (xl >> 16) & ~3; // Left edge aligned.
|
||||
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
|
||||
xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
|
||||
if (xr > src_width) {
|
||||
xr = src_width;
|
||||
}
|
||||
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
|
||||
src_argb += xl * 4;
|
||||
x -= (int)(xl << 16);
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(clip_src_width, 16)) {
|
||||
InterpolateRow = InterpolateRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
InterpolateRow = InterpolateRow_Any_AVX2;
|
||||
if (IS_ALIGNED(clip_src_width, 32)) {
|
||||
InterpolateRow = InterpolateRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
InterpolateRow = InterpolateRow_Any_NEON;
|
||||
if (IS_ALIGNED(clip_src_width, 16)) {
|
||||
InterpolateRow = InterpolateRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) &&
|
||||
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
|
||||
InterpolateRow = InterpolateRow_Any_DSPR2;
|
||||
if (IS_ALIGNED(clip_src_width, 4)) {
|
||||
InterpolateRow = InterpolateRow_DSPR2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
|
||||
// Allocate a row of ARGB.
|
||||
{
|
||||
align_buffer_64(row, clip_src_width * 4);
|
||||
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
int yi = y >> 16;
|
||||
const uint8* src = src_argb + yi * src_stride;
|
||||
if (filtering == kFilterLinear) {
|
||||
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
|
||||
} else {
|
||||
int yf = (y >> 8) & 255;
|
||||
InterpolateRow(row, src, src_stride, clip_src_width, yf);
|
||||
ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
|
||||
}
|
||||
dst_argb += dst_stride;
|
||||
y += dy;
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
}
|
||||
|
||||
// Scale ARGB up with bilinear interpolation.
|
||||
static void ScaleARGBBilinearUp(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_argb, uint8* dst_argb,
|
||||
int x, int dx, int y, int dy,
|
||||
enum FilterMode filtering) {
|
||||
int j;
|
||||
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
|
||||
InterpolateRow_C;
|
||||
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
InterpolateRow = InterpolateRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
InterpolateRow = InterpolateRow_Any_AVX2;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
InterpolateRow = InterpolateRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
InterpolateRow = InterpolateRow_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
InterpolateRow = InterpolateRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) &&
|
||||
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
|
||||
InterpolateRow = InterpolateRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
if (src_width >= 32768) {
|
||||
ScaleARGBFilterCols = filtering ?
|
||||
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
|
||||
}
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
|
||||
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
|
||||
{
|
||||
int yi = y >> 16;
|
||||
const uint8* src = src_argb + yi * src_stride;
|
||||
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
|
||||
uint8* rowptr = row;
|
||||
int rowstride = kRowSize;
|
||||
int lasty = yi;
|
||||
|
||||
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
|
||||
if (src_height > 1) {
|
||||
src += src_stride;
|
||||
}
|
||||
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
|
||||
src += src_stride;
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
yi = y >> 16;
|
||||
if (yi != lasty) {
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
yi = y >> 16;
|
||||
src = src_argb + yi * src_stride;
|
||||
}
|
||||
if (yi != lasty) {
|
||||
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
|
||||
rowptr += rowstride;
|
||||
rowstride = -rowstride;
|
||||
lasty = yi;
|
||||
src += src_stride;
|
||||
}
|
||||
}
|
||||
if (filtering == kFilterLinear) {
|
||||
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
|
||||
} else {
|
||||
int yf = (y >> 8) & 255;
|
||||
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
|
||||
}
|
||||
dst_argb += dst_stride;
|
||||
y += dy;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
}
|
||||
}
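
// Note on the loop above: ScaleARGBBilinearUp keeps only two column-scaled
// source rows and ping-pongs between them.  When the integer source row (yi)
// advances, the new row is scaled into the buffer half holding the stale row,
// then rowptr is advanced and rowstride negated, so rowptr and
// rowptr + rowstride always name the current (top, bottom) pair in source
// order.  InterpolateRow then blends that pair with the 8-bit fraction
// (y >> 8) & 255.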
|
||||
|
||||
#ifdef YUVSCALEUP
|
||||
// Scale YUV to ARGB up with bilinear interpolation.
|
||||
static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride_y,
|
||||
int src_stride_u,
|
||||
int src_stride_v,
|
||||
int dst_stride_argb,
|
||||
const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_argb,
|
||||
int x, int dx, int y, int dy,
|
||||
enum FilterMode filtering) {
|
||||
int j;
|
||||
void (*I422ToARGBRow)(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width) = I422ToARGBRow_C;
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(src_width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(src_width, 16)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(src_width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
|
||||
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
|
||||
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
|
||||
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
|
||||
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
|
||||
I422ToARGBRow = I422ToARGBRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
|
||||
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
|
||||
InterpolateRow_C;
|
||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
InterpolateRow = InterpolateRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
InterpolateRow = InterpolateRow_Any_AVX2;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
InterpolateRow = InterpolateRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
InterpolateRow = InterpolateRow_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
InterpolateRow = InterpolateRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_INTERPOLATEROW_DSPR2)
|
||||
if (TestCpuFlag(kCpuHasDSPR2) &&
|
||||
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
|
||||
InterpolateRow = InterpolateRow_DSPR2;
|
||||
}
|
||||
#endif
|
||||
|
||||
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) =
|
||||
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
|
||||
if (src_width >= 32768) {
|
||||
ScaleARGBFilterCols = filtering ?
|
||||
ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
|
||||
}
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
|
||||
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
|
||||
if (filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 4)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
const int max_y = (src_height - 1) << 16;
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
}
|
||||
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
|
||||
int yi = y >> 16;
|
||||
int uv_yi = yi >> kYShift;
|
||||
const uint8* src_row_y = src_y + yi * src_stride_y;
|
||||
const uint8* src_row_u = src_u + uv_yi * src_stride_u;
|
||||
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
|
||||
|
||||
// Allocate 2 rows of ARGB.
|
||||
const int kRowSize = (dst_width * 4 + 31) & ~31;
|
||||
align_buffer_64(row, kRowSize * 2);
|
||||
|
||||
// Allocate 1 row of ARGB for source conversion.
|
||||
align_buffer_64(argb_row, src_width * 4);
|
||||
|
||||
uint8* rowptr = row;
|
||||
int rowstride = kRowSize;
|
||||
int lasty = yi;
|
||||
|
||||
// TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
|
||||
ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
|
||||
if (src_height > 1) {
|
||||
src_row_y += src_stride_y;
|
||||
if (yi & 1) {
|
||||
src_row_u += src_stride_u;
|
||||
src_row_v += src_stride_v;
|
||||
}
|
||||
}
|
||||
ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
|
||||
if (src_height > 2) {
|
||||
src_row_y += src_stride_y;
|
||||
if (!(yi & 1)) {
|
||||
src_row_u += src_stride_u;
|
||||
src_row_v += src_stride_v;
|
||||
}
|
||||
}
|
||||
|
||||
for (j = 0; j < dst_height; ++j) {
|
||||
yi = y >> 16;
|
||||
if (yi != lasty) {
|
||||
if (y > max_y) {
|
||||
y = max_y;
|
||||
yi = y >> 16;
|
||||
uv_yi = yi >> kYShift;
|
||||
src_row_y = src_y + yi * src_stride_y;
|
||||
src_row_u = src_u + uv_yi * src_stride_u;
|
||||
src_row_v = src_v + uv_yi * src_stride_v;
|
||||
}
|
||||
if (yi != lasty) {
|
||||
// TODO(fbarchard): Convert the clipped region of row.
|
||||
I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
|
||||
ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
|
||||
rowptr += rowstride;
|
||||
rowstride = -rowstride;
|
||||
lasty = yi;
|
||||
src_row_y += src_stride_y;
|
||||
if (yi & 1) {
|
||||
src_row_u += src_stride_u;
|
||||
src_row_v += src_stride_v;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (filtering == kFilterLinear) {
|
||||
InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
|
||||
} else {
|
||||
int yf = (y >> 8) & 255;
|
||||
InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
|
||||
}
|
||||
dst_argb += dst_stride_argb;
|
||||
y += dy;
|
||||
}
|
||||
  free_aligned_buffer_64(row);
  free_aligned_buffer_64(argb_row);
}
#endif  // YUVSCALEUP

// Scale ARGB to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx are the integer part of the source position and
// the lower 16 bits are the fractional part.

static void ScaleARGBSimple(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_argb, uint8* dst_argb,
                            int x, int dx, int y, int dy) {
  int j;
  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
      int dst_width, int x, int dx) =
      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
#if defined(HAS_SCALEARGBCOLS_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
    ScaleARGBCols = ScaleARGBCols_SSE2;
  }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleARGBCols = ScaleARGBCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleARGBCols = ScaleARGBCols_NEON;
    }
  }
#endif
  if (src_width * 2 == dst_width && x < 0x8000) {
    ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
    }
#endif
  }

  for (j = 0; j < dst_height; ++j) {
    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
                  dst_width, x, dx);
    dst_argb += dst_stride;
    y += dy;
  }
}
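
// Illustrative sketch (not part of the upstream libyuv file): the 16.16
// fixed point stepping described in the comment before ScaleARGBSimple.
// The integer part of x selects the source pixel and dx advances the
// position; for a plain point-sampled resize dx is roughly
// (src_width << 16) / dst_width, so the source span is walked exactly once.
// ScaleARGBColsSketch_C is a hypothetical name for what is essentially the
// nearest-pixel column loop, written with explicit 4-byte pixel loads.
static void ScaleARGBColsSketch_C(uint8* dst_argb, const uint8* src_argb,
                                  int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);  // 1 ARGB pixel == 4 bytes.
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // Integer part picks the source pixel.
    x += dx;                // Lower 16 bits accumulate the fraction.
  }
}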

// Scale an ARGB image to the requested dimensions.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
static void ScaleARGB(const uint8* src, int src_stride,
                      int src_width, int src_height,
                      uint8* dst, int dst_stride,
                      int dst_width, int dst_height,
                      int clip_x, int clip_y, int clip_width, int clip_height,
                      enum FilterMode filtering) {
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  // ARGB does not support box filter yet, but allow the user to pass it.
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height,
                                dst_width, dst_height,
                                filtering);

  // Negative src_height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    src = src + (src_height - 1) * src_stride;
    src_stride = -src_stride;
  }
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
             &x, &y, &dx, &dy);
  src_width = Abs(src_width);
  if (clip_x) {
    int64 clipf = (int64)(clip_x) * dx;
    x += (clipf & 0xffff);
    src += (clipf >> 16) * 4;
    dst += clip_x * 4;
  }
  if (clip_y) {
    int64 clipf = (int64)(clip_y) * dy;
    y += (clipf & 0xffff);
    src += (clipf >> 16) * src_stride;
    dst += clip_y * dst_stride;
  }

  // Special case for integer step values.
  if (((dx | dy) & 0xffff) == 0) {
    if (!dx || !dy) {  // 1 pixel wide and/or tall.
      filtering = kFilterNone;
    } else {
      // Optimized even scale down. i.e. 2, 4, 6, 8, 10x.
      if (!(dx & 0x10000) && !(dy & 0x10000)) {
        if (dx == 0x20000) {
          // Optimized 1/2 downsample.
          ScaleARGBDown2(src_width, src_height,
                         clip_width, clip_height,
                         src_stride, dst_stride, src, dst,
                         x, dx, y, dy, filtering);
          return;
        }
        if (dx == 0x40000 && filtering == kFilterBox) {
          // Optimized 1/4 box downsample.
          ScaleARGBDown4Box(src_width, src_height,
                            clip_width, clip_height,
                            src_stride, dst_stride, src, dst,
                            x, dx, y, dy);
          return;
        }
        ScaleARGBDownEven(src_width, src_height,
                          clip_width, clip_height,
                          src_stride, dst_stride, src, dst,
                          x, dx, y, dy, filtering);
        return;
      }
      // Optimized odd scale down. i.e. 3, 5, 7, 9x.
      if ((dx & 0x10000) && (dy & 0x10000)) {
        filtering = kFilterNone;
        if (dx == 0x10000 && dy == 0x10000) {
          // Straight copy.
          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
                   dst, dst_stride, clip_width, clip_height);
          return;
        }
      }
    }
  }
  if (dx == 0x10000 && (x & 0xffff) == 0) {
    // Arbitrary scale vertically, but unscaled horizontally.
    ScalePlaneVertical(src_height,
                       clip_width, clip_height,
                       src_stride, dst_stride, src, dst,
                       x, y, dy, 4, filtering);
    return;
  }
  if (filtering && dy < 65536) {
    ScaleARGBBilinearUp(src_width, src_height,
                        clip_width, clip_height,
                        src_stride, dst_stride, src, dst,
                        x, dx, y, dy, filtering);
    return;
  }
  if (filtering) {
    ScaleARGBBilinearDown(src_width, src_height,
                          clip_width, clip_height,
                          src_stride, dst_stride, src, dst,
                          x, dx, y, dy, filtering);
    return;
  }
  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
                  src_stride, dst_stride, src, dst,
                  x, dx, y, dy);
}
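
// Illustrative note (not part of the upstream libyuv file): the special
// cases in ScaleARGB compare the 16.16 steps against exact ratios.  A step
// of 0x10000 is 1.0 (straight copy), 0x20000 is 2.0 (every 2nd source
// pixel, a 1/2 downsample) and 0x40000 is 4.0 (a 1/4 downsample).  The step
// is derived from the size ratio (ScaleSlope also applies filter-dependent
// centering); the hypothetical helper below shows the basic relationship.
static int FixedStepSketch(int src_dimension, int dst_dimension) {
  // 16.16 fixed point ratio, e.g. 640 -> 320 gives 0x20000.
  return (int)(((int64)(src_dimension) << 16) / dst_dimension);
}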

LIBYUV_API
int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
                  int src_width, int src_height,
                  uint8* dst_argb, int dst_stride_argb,
                  int dst_width, int dst_height,
                  int clip_x, int clip_y, int clip_width, int clip_height,
                  enum FilterMode filtering) {
  if (!src_argb || src_width == 0 || src_height == 0 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
      clip_x < 0 || clip_y < 0 ||
      clip_width > 32768 || clip_height > 32768 ||
      (clip_x + clip_width) > dst_width ||
      (clip_y + clip_height) > dst_height) {
    return -1;
  }
  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
            dst_argb, dst_stride_argb, dst_width, dst_height,
            clip_x, clip_y, clip_width, clip_height, filtering);
  return 0;
}
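
// Illustrative usage (not part of the upstream libyuv file): ARGBScaleClip
// computes only the destination pixels that fall inside the clip rectangle,
// positioned as if the full dst_width x dst_height scale had been done.
// The hypothetical helper below fills just the right half of a 640x360
// destination from a 1280x720 ARGB source.
static int ScaleRightHalfSketch(const uint8* src_argb, uint8* dst_argb) {
  return ARGBScaleClip(src_argb, 1280 * 4, 1280, 720,
                       dst_argb, 640 * 4, 640, 360,
                       320, 0, 320, 360,  // clip_x, clip_y, clip_w, clip_h.
                       kFilterBilinear);
}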

// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
              int src_width, int src_height,
              uint8* dst_argb, int dst_stride_argb,
              int dst_width, int dst_height,
              enum FilterMode filtering) {
  if (!src_argb || src_width == 0 || src_height == 0 ||
      src_width > 32768 || src_height > 32768 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
            dst_argb, dst_stride_argb, dst_width, dst_height,
            0, 0, dst_width, dst_height, filtering);
  return 0;
}
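
// Illustrative usage (not part of the upstream libyuv file): halving both
// dimensions with ARGBScale.  For even sizes this yields dx == dy ==
// 0x20000, which the dispatcher above routes to the optimized
// ScaleARGBDown2 path.  The helper name and sizes are hypothetical.
static int HalveFrameSketch(const uint8* src_argb, int src_width,
                            int src_height, uint8* dst_argb) {
  // Assumes src_width and src_height are positive and even.
  return ARGBScale(src_argb, src_width * 4, src_width, src_height,
                   dst_argb, (src_width / 2) * 4,
                   src_width / 2, src_height / 2,
                   kFilterBilinear);
}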

// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
                       const uint8* src_u, int src_stride_u,
                       const uint8* src_v, int src_stride_v,
                       uint32 src_fourcc,
                       int src_width, int src_height,
                       uint8* dst_argb, int dst_stride_argb,
                       uint32 dst_fourcc,
                       int dst_width, int dst_height,
                       int clip_x, int clip_y, int clip_width, int clip_height,
                       enum FilterMode filtering) {
  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
  int r;
  I420ToARGB(src_y, src_stride_y,
             src_u, src_stride_u,
             src_v, src_stride_v,
             argb_buffer, src_width * 4,
             src_width, src_height);

  r = ARGBScaleClip(argb_buffer, src_width * 4,
                    src_width, src_height,
                    dst_argb, dst_stride_argb,
                    dst_width, dst_height,
                    clip_x, clip_y, clip_width, clip_height,
                    filtering);
  free(argb_buffer);
  return r;
}
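
// Illustrative note (not part of the upstream libyuv file): the helper
// above converts the entire I420 frame to a temporary ARGB buffer and then
// clips and scales it; the src_fourcc/dst_fourcc arguments are accepted but
// not consulted.  A hypothetical caller for a 1280x720 I420 frame scaled to
// 640x360 ARGB (FOURCC_I420 / FOURCC_ARGB come from libyuv/video_common.h,
// which the caller is assumed to include):
//
//   YUVToARGBScaleClip(y, 1280, u, 640, v, 640,
//                      FOURCC_I420, 1280, 720,
//                      argb, 640 * 4, FOURCC_ARGB, 640, 360,
//                      0, 0, 640, 360, kFilterBilinear);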

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
1151
libs/libyuv/source/scale_common.cc
Normal file
File diff suppressed because it is too large
1292
libs/libyuv/source/scale_gcc.cc
Normal file
File diff suppressed because it is too large
644
libs/libyuv/source/scale_mips.cc
Normal file
@@ -0,0 +1,644 @@
|
||||
/*
|
||||
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// This module is for GCC MIPS DSPR2
|
||||
#if !defined(LIBYUV_DISABLE_MIPS) && \
|
||||
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
|
||||
(_MIPS_SIM == _MIPS_SIM_ABI32)
|
||||
|
||||
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__(
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
|
||||
"beqz $t9, 2f \n"
|
||||
" nop \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
// TODO(fbarchard): Use odd pixels instead of even.
|
||||
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
|
||||
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
|
||||
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
|
||||
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"sw $t8, 0(%[dst]) \n"
|
||||
"sw $t0, 4(%[dst]) \n"
|
||||
"sw $t1, 8(%[dst]) \n"
|
||||
"sw $t2, 12(%[dst]) \n"
|
||||
"bgtz $t9, 1b \n"
|
||||
" addiu %[dst], %[dst], 16 \n"
|
||||
|
||||
"2: \n"
|
||||
"andi $t9, %[dst_width], 0xf \n" // residue
|
||||
"beqz $t9, 3f \n"
|
||||
" nop \n"
|
||||
|
||||
"21: \n"
|
||||
"lbu $t0, 0(%[src_ptr]) \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 2 \n"
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"sb $t0, 0(%[dst]) \n"
|
||||
"bgtz $t9, 21b \n"
|
||||
" addiu %[dst], %[dst], 1 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst)
|
||||
: [dst_width] "r" (dst_width)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9"
|
||||
);
|
||||
}
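
// Illustrative sketch (not part of the upstream libyuv file): the DSPR2
// assembly above packs 32 source bytes per iteration, keeping one of every
// two pixels (the even ones, per the TODO).  A plain C equivalent of the
// same point-sampled 1/2 row downscale, with a hypothetical name:
static void ScaleRowDown2Sketch_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst, int dst_width) {
  int x;
  (void)src_stride;  // Unused: point sampling reads a single row.
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // Keep every second pixel.
  }
}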
|
||||
|
||||
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
const uint8* t = src_ptr + src_stride;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
|
||||
"bltz $t9, 2f \n"
|
||||
" nop \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
|
||||
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
|
||||
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
|
||||
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"srl $t8, $t0, 16 \n" // |X|X|3|2|
|
||||
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
|
||||
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
|
||||
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
|
||||
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
|
||||
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
|
||||
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
|
||||
"srl $t8, $t1, 16 \n" // |X|X|7|6|
|
||||
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
|
||||
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
|
||||
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
|
||||
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
|
||||
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
|
||||
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
|
||||
"srl $t8, $t2, 16 \n" // |X|X|11|10|
|
||||
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
|
||||
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
|
||||
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
|
||||
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
|
||||
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
|
||||
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
|
||||
"srl $t8, $t3, 16 \n" // |X|X|15|14|
|
||||
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
|
||||
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
|
||||
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
|
||||
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
|
||||
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
|
||||
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
|
||||
"addiu %[src_ptr], %[src_ptr], 16 \n"
|
||||
"addiu %[t], %[t], 16 \n"
|
||||
"sb $t0, 0(%[dst]) \n"
|
||||
"sb $t4, 1(%[dst]) \n"
|
||||
"sb $t1, 2(%[dst]) \n"
|
||||
"sb $t5, 3(%[dst]) \n"
|
||||
"sb $t2, 4(%[dst]) \n"
|
||||
"sb $t6, 5(%[dst]) \n"
|
||||
"sb $t3, 6(%[dst]) \n"
|
||||
"sb $t7, 7(%[dst]) \n"
|
||||
"bgtz $t9, 1b \n"
|
||||
" addiu %[dst], %[dst], 8 \n"
|
||||
|
||||
"2: \n"
|
||||
"andi $t9, %[dst_width], 0x7 \n" // x = residue
|
||||
"beqz $t9, 3f \n"
|
||||
" nop \n"
|
||||
|
||||
"21: \n"
|
||||
"lwr $t1, 0(%[src_ptr]) \n"
|
||||
"lwl $t1, 3(%[src_ptr]) \n"
|
||||
"lwr $t2, 0(%[t]) \n"
|
||||
"lwl $t2, 3(%[t]) \n"
|
||||
"srl $t8, $t1, 16 \n"
|
||||
"ins $t1, $t2, 16, 16 \n"
|
||||
"ins $t2, $t8, 0, 16 \n"
|
||||
"raddu.w.qb $t1, $t1 \n"
|
||||
"raddu.w.qb $t2, $t2 \n"
|
||||
"shra_r.w $t1, $t1, 2 \n"
|
||||
"shra_r.w $t2, $t2, 2 \n"
|
||||
"sb $t1, 0(%[dst]) \n"
|
||||
"sb $t2, 1(%[dst]) \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 4 \n"
|
||||
"addiu $t9, $t9, -2 \n"
|
||||
"addiu %[t], %[t], 4 \n"
|
||||
"bgtz $t9, 21b \n"
|
||||
" addiu %[dst], %[dst], 2 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst), [t] "+r" (t)
|
||||
: [dst_width] "r" (dst_width)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9"
|
||||
);
|
||||
}
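
// Illustrative sketch (not part of the upstream libyuv file): the box
// version above averages a 2x2 block of source pixels per destination
// pixel, with the +2 rounding done by shra_r.w.  The same computation in
// plain C, under a hypothetical name:
static void ScaleRowDown2BoxSketch_C(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // Second source row.
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // Rounded mean.
    s += 2;
    t += 2;
  }
}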
|
||||
|
||||
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"srl $t9, %[dst_width], 3 \n"
|
||||
"beqz $t9, 2f \n"
|
||||
" nop \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
|
||||
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
|
||||
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
|
||||
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
|
||||
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
|
||||
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"sw $t1, 0(%[dst]) \n"
|
||||
"sw $t5, 4(%[dst]) \n"
|
||||
"bgtz $t9, 1b \n"
|
||||
" addiu %[dst], %[dst], 8 \n"
|
||||
|
||||
"2: \n"
|
||||
"andi $t9, %[dst_width], 7 \n" // residue
|
||||
"beqz $t9, 3f \n"
|
||||
" nop \n"
|
||||
|
||||
"21: \n"
|
||||
"lbu $t1, 0(%[src_ptr]) \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 4 \n"
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"sb $t1, 0(%[dst]) \n"
|
||||
"bgtz $t9, 21b \n"
|
||||
" addiu %[dst], %[dst], 1 \n"
|
||||
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst)
|
||||
: [dst_width] "r" (dst_width)
|
||||
: "t1", "t2", "t3", "t4", "t5",
|
||||
"t6", "t7", "t8", "t9"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
intptr_t stride = src_stride;
|
||||
const uint8* s1 = src_ptr + stride;
|
||||
const uint8* s2 = s1 + stride;
|
||||
const uint8* s3 = s2 + stride;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"srl $t9, %[dst_width], 1 \n"
|
||||
"andi $t8, %[dst_width], 1 \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
|
||||
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
|
||||
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
|
||||
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
|
||||
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
|
||||
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
|
||||
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
|
||||
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
|
||||
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
|
||||
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
|
||||
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
|
||||
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
|
||||
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
|
||||
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
|
||||
"add $t0, $t0, $t1 \n"
|
||||
"add $t1, $t2, $t3 \n"
|
||||
"add $t0, $t0, $t1 \n"
|
||||
"add $t4, $t4, $t5 \n"
|
||||
"add $t6, $t6, $t7 \n"
|
||||
"add $t4, $t4, $t6 \n"
|
||||
"shra_r.w $t0, $t0, 4 \n"
|
||||
"shra_r.w $t4, $t4, 4 \n"
|
||||
"sb $t0, 0(%[dst]) \n"
|
||||
"sb $t4, 1(%[dst]) \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 8 \n"
|
||||
"addiu %[s1], %[s1], 8 \n"
|
||||
"addiu %[s2], %[s2], 8 \n"
|
||||
"addiu %[s3], %[s3], 8 \n"
|
||||
"addiu $t9, $t9, -1 \n"
|
||||
"bgtz $t9, 1b \n"
|
||||
" addiu %[dst], %[dst], 2 \n"
|
||||
"beqz $t8, 2f \n"
|
||||
" nop \n"
|
||||
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
|
||||
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
|
||||
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
|
||||
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
|
||||
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
|
||||
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
|
||||
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
|
||||
"add $t0, $t0, $t1 \n"
|
||||
"add $t1, $t2, $t3 \n"
|
||||
"add $t0, $t0, $t1 \n"
|
||||
"shra_r.w $t0, $t0, 4 \n"
|
||||
"sb $t0, 0(%[dst]) \n"
|
||||
|
||||
"2: \n"
|
||||
".set pop \n"
|
||||
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst),
|
||||
[s1] "+r" (s1),
|
||||
[s2] "+r" (s2),
|
||||
[s3] "+r" (s3)
|
||||
: [dst_width] "r" (dst_width)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6","t7", "t8", "t9"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"1: \n"
|
||||
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
|
||||
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
|
||||
"addiu %[dst_width], %[dst_width], -24 \n"
|
||||
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
|
||||
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
|
||||
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
|
||||
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
|
||||
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
|
||||
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
|
||||
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
|
||||
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
|
||||
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
|
||||
"sw $t1, 0(%[dst]) \n"
|
||||
"sw $t0, 4(%[dst]) \n"
|
||||
"sw $t3, 8(%[dst]) \n"
|
||||
"sw $t5, 12(%[dst]) \n"
|
||||
"sw $t9, 16(%[dst]) \n"
|
||||
"sw $t7, 20(%[dst]) \n"
|
||||
"bnez %[dst_width], 1b \n"
|
||||
" addiu %[dst], %[dst], 24 \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6","t7", "t8", "t9"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* d, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"repl.ph $t3, 3 \n" // 0x00030003
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
|
||||
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
|
||||
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
|
||||
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
|
||||
"muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
|
||||
"muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
|
||||
"andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
|
||||
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
|
||||
"raddu.w.qb $t0, $t0 \n"
|
||||
"raddu.w.qb $t1, $t1 \n"
|
||||
"shra_r.w $t0, $t0, 1 \n"
|
||||
"shra_r.w $t1, $t1, 1 \n"
|
||||
"preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
|
||||
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
|
||||
"rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
|
||||
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
|
||||
"addu.ph $t2, $t2, $t4 \n"
|
||||
"addu.ph $t6, $t6, $t5 \n"
|
||||
"sll $t5, $t0, 1 \n"
|
||||
"add $t0, $t5, $t0 \n"
|
||||
"shra_r.ph $t2, $t2, 2 \n"
|
||||
"shra_r.ph $t6, $t6, 2 \n"
|
||||
"shll.ph $t4, $t2, 1 \n"
|
||||
"addq.ph $t4, $t4, $t2 \n"
|
||||
"addu $t0, $t0, $t1 \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 4 \n"
|
||||
"shra_r.w $t0, $t0, 2 \n"
|
||||
"addu.ph $t6, $t6, $t4 \n"
|
||||
"shra_r.ph $t6, $t6, 2 \n"
|
||||
"srl $t1, $t6, 16 \n"
|
||||
"addiu %[dst_width], %[dst_width], -3 \n"
|
||||
"sb $t1, 0(%[d]) \n"
|
||||
"sb $t0, 1(%[d]) \n"
|
||||
"sb $t6, 2(%[d]) \n"
|
||||
"bgtz %[dst_width], 1b \n"
|
||||
" addiu %[d], %[d], 3 \n"
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[src_stride] "+r" (src_stride),
|
||||
[d] "+r" (d),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3",
|
||||
"t4", "t5", "t6"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* d, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"repl.ph $t2, 3 \n" // 0x00030003
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
|
||||
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
|
||||
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
|
||||
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
|
||||
"muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
|
||||
"muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
|
||||
"andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
|
||||
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
|
||||
"raddu.w.qb $t0, $t0 \n"
|
||||
"raddu.w.qb $t1, $t1 \n"
|
||||
"shra_r.w $t0, $t0, 1 \n"
|
||||
"shra_r.w $t1, $t1, 1 \n"
|
||||
"preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
|
||||
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
|
||||
"rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
|
||||
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
|
||||
"addu.ph $t4, $t4, $t3 \n"
|
||||
"addu.ph $t6, $t6, $t5 \n"
|
||||
"shra_r.ph $t6, $t6, 2 \n"
|
||||
"shra_r.ph $t4, $t4, 2 \n"
|
||||
"addu.ph $t6, $t6, $t4 \n"
|
||||
"addiu %[src_ptr], %[src_ptr], 4 \n"
|
||||
"shra_r.ph $t6, $t6, 1 \n"
|
||||
"addu $t0, $t0, $t1 \n"
|
||||
"addiu %[dst_width], %[dst_width], -3 \n"
|
||||
"shra_r.w $t0, $t0, 1 \n"
|
||||
"srl $t1, $t6, 16 \n"
|
||||
"sb $t1, 0(%[d]) \n"
|
||||
"sb $t0, 1(%[d]) \n"
|
||||
"sb $t6, 2(%[d]) \n"
|
||||
"bgtz %[dst_width], 1b \n"
|
||||
" addiu %[d], %[d], 3 \n"
|
||||
"3: \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[src_stride] "+r" (src_stride),
|
||||
[d] "+r" (d),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3",
|
||||
"t4", "t5", "t6"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
"wsbh $t0, $t0 \n" // |2|3|0|1|
|
||||
"wsbh $t6, $t6 \n" // |26|27|24|25|
|
||||
"srl $t0, $t0, 8 \n" // |X|2|3|0|
|
||||
"srl $t3, $t3, 16 \n" // |X|X|15|14|
|
||||
"srl $t5, $t5, 16 \n" // |X|X|23|22|
|
||||
"srl $t7, $t7, 16 \n" // |X|X|31|30|
|
||||
"ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
|
||||
"ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
|
||||
"ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
|
||||
"ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
|
||||
"prepend $t2, $t3, 24 \n" // |X|15|14|11|
|
||||
"ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
|
||||
"ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"addiu %[dst_width], %[dst_width], -12 \n"
|
||||
"addiu $t8,%[dst_width], -12 \n"
|
||||
"sw $t1, 0(%[dst]) \n"
|
||||
"sw $t4, 4(%[dst]) \n"
|
||||
"sw $t6, 8(%[dst]) \n"
|
||||
"bgez $t8, 1b \n"
|
||||
" addiu %[dst], %[dst], 12 \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3", "t4",
|
||||
"t5", "t6", "t7", "t8"
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width) {
|
||||
intptr_t stride = src_stride;
|
||||
const uint8* t = src_ptr + stride;
|
||||
const int c = 0x2AAA;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
|
||||
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
|
||||
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
|
||||
"lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
|
||||
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
|
||||
"packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
|
||||
"packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
|
||||
"raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
|
||||
"raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
|
||||
"precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
|
||||
"precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
|
||||
"srl $t4, $t4, 2 \n" // t4 / 4
|
||||
"srl $t6, $t6, 16 \n" // |0|0|S3|T3|
|
||||
"raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
|
||||
"addu $t6, $t5, $t6 \n"
|
||||
"mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
|
||||
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
|
||||
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
|
||||
"raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
|
||||
"raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
|
||||
"addu $t0, $t0, $t2 \n"
|
||||
"mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
|
||||
"addiu %[src_ptr], %[src_ptr], 8 \n"
|
||||
"addiu %[t], %[t], 8 \n"
|
||||
"addiu %[dst_width], %[dst_width], -3 \n"
|
||||
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
|
||||
"srl $t6, $t6, 16 \n"
|
||||
"srl $t0, $t0, 16 \n"
|
||||
"sb $t4, -1(%[dst_ptr]) \n"
|
||||
"sb $t6, -2(%[dst_ptr]) \n"
|
||||
"bgtz %[dst_width], 1b \n"
|
||||
" sb $t0, -3(%[dst_ptr]) \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst_ptr] "+r" (dst_ptr),
|
||||
[t] "+r" (t),
|
||||
[dst_width] "+r" (dst_width)
|
||||
: [c] "r" (c)
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
|
||||
);
|
||||
}
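
// Illustrative note (not part of the upstream libyuv file): 0x2AAA is the
// 16.16 fixed point reciprocal of 6 (0x2AAA / 65536 ~= 1/6), so the
// "mul ...; srl ..., 16" pairs above divide the six-pixel sums without a
// hardware divide (with the same slight truncation as the assembly).  The
// three-row variant below uses 0x1C71 ~= 65536 / 9 the same way for its
// nine-pixel sums.  In C the trick looks like this hypothetical helper:
static inline uint8 DivideBySixSketch(uint32 sum_of_six) {
  return (uint8)((sum_of_six * 0x2AAA) >> 16);  // ~= sum_of_six / 6.
}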
|
||||
|
||||
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width) {
|
||||
intptr_t stride = src_stride;
|
||||
const uint8* s1 = src_ptr + stride;
|
||||
stride += stride;
|
||||
const uint8* s2 = src_ptr + stride;
|
||||
const int c1 = 0x1C71;
|
||||
const int c2 = 0x2AAA;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
|
||||
"1: \n"
|
||||
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
|
||||
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
|
||||
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
|
||||
"lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
|
||||
"lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
|
||||
"lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
|
||||
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
|
||||
"packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
|
||||
"raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
|
||||
"packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
|
||||
"raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
|
||||
"sll $t8, $t5, 16 \n" // |R5|R4|0|0|
|
||||
"raddu.w.qb $t8, $t8 \n" // R5+R4
|
||||
"addu $t7, $t7, $t8 \n"
|
||||
"srl $t8, $t5, 16 \n" // |0|0|R7|R6|
|
||||
"raddu.w.qb $t8, $t8 \n" // R7 + R6
|
||||
"addu $t6, $t6, $t8 \n"
|
||||
"mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
|
||||
"precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
|
||||
"precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
|
||||
"srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
|
||||
"raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
|
||||
"addu $t7, $t7, $t8 \n"
|
||||
"mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
|
||||
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
|
||||
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
|
||||
"sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
|
||||
"raddu.w.qb $t0, $t0 \n"
|
||||
"raddu.w.qb $t2, $t2 \n"
|
||||
"raddu.w.qb $t4, $t4 \n"
|
||||
"addu $t0, $t0, $t2 \n"
|
||||
"addu $t0, $t0, $t4 \n"
|
||||
"mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
|
||||
"addiu %[src_ptr], %[src_ptr], 8 \n"
|
||||
"addiu %[s1], %[s1], 8 \n"
|
||||
"addiu %[s2], %[s2], 8 \n"
|
||||
"addiu %[dst_width], %[dst_width], -3 \n"
|
||||
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
|
||||
"srl $t6, $t6, 16 \n"
|
||||
"srl $t7, $t7, 16 \n"
|
||||
"srl $t0, $t0, 16 \n"
|
||||
"sb $t6, -1(%[dst_ptr]) \n"
|
||||
"sb $t7, -2(%[dst_ptr]) \n"
|
||||
"bgtz %[dst_width], 1b \n"
|
||||
" sb $t0, -3(%[dst_ptr]) \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst_ptr] "+r" (dst_ptr),
|
||||
[s1] "+r" (s1),
|
||||
[s2] "+r" (s2),
|
||||
[dst_width] "+r" (dst_width)
|
||||
: [c1] "r" (c1), [c2] "r" (c2)
|
||||
: "t0", "t1", "t2", "t3", "t4",
|
||||
"t5", "t6", "t7", "t8"
|
||||
);
|
||||
}
|
||||
|
||||
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
1017
libs/libyuv/source/scale_neon.cc
Normal file
File diff suppressed because it is too large
1042
libs/libyuv/source/scale_neon64.cc
Normal file
File diff suppressed because it is too large
1357
libs/libyuv/source/scale_win.cc
Normal file
File diff suppressed because it is too large
64
libs/libyuv/source/video_common.cc
Normal file
@@ -0,0 +1,64 @@
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "libyuv/video_common.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))

struct FourCCAliasEntry {
  uint32 alias;
  uint32 canonical;
};

static const struct FourCCAliasEntry kFourCCAliases[] = {
  {FOURCC_IYUV, FOURCC_I420},
  {FOURCC_YU16, FOURCC_I422},
  {FOURCC_YU24, FOURCC_I444},
  {FOURCC_YUYV, FOURCC_YUY2},
  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
  {FOURCC_HDYC, FOURCC_UYVY},
  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
  {FOURCC_DMB1, FOURCC_MJPG},
  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
  {FOURCC_RGB3, FOURCC_RAW },
  {FOURCC_BGR3, FOURCC_24BG},
  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
// {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA

LIBYUV_API
uint32 CanonicalFourCC(uint32 fourcc) {
  int i;
  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
    if (kFourCCAliases[i].alias == fourcc) {
      return kFourCCAliases[i].canonical;
    }
  }
  // Not an alias, so return it as-is.
  return fourcc;
}
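
// Illustrative usage (not part of the upstream libyuv file): capture code
// typically canonicalizes a driver-reported FOURCC before dispatching on
// it, so vendor aliases collapse to one value, e.g.:
//
//   uint32 canon = CanonicalFourCC(FOURCC_YUVS);  // Returns FOURCC_YUY2.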

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif