Support for Signal calls.

Merge in RedPhone

// FREEBIE
This commit is contained in:
Moxie Marlinspike
2015-09-09 13:54:29 -07:00
parent 3d4ae60d81
commit d83a3d71bc
2585 changed files with 803492 additions and 45 deletions

View File

@@ -0,0 +1,49 @@
# Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
include $(LOCAL_PATH)/../../../../android-webrtc.mk
LOCAL_MODULE_CLASS := STATIC_LIBRARIES
LOCAL_MODULE := libwebrtc_aec
LOCAL_MODULE_TAGS := optional
LOCAL_SRC_FILES := \
echo_cancellation.c \
aec_resampler.c \
aec_core.c \
aec_rdft.c \
ifeq ($(TARGET_ARCH),x86)
LOCAL_SRC_FILES += \
aec_core_sse2.c \
aec_rdft_sse2.c
endif
# Flags passed to both C and C++ files.
LOCAL_CFLAGS := \
$(MY_WEBRTC_COMMON_DEFS)
LOCAL_C_INCLUDES := \
$(LOCAL_PATH)/include \
$(LOCAL_PATH)/../utility \
$(LOCAL_PATH)/../../.. \
$(LOCAL_PATH)/../../../common_audio/signal_processing/include \
$(LOCAL_PATH)/../../../..
LOCAL_SHARED_LIBRARIES := \
libcutils \
libdl \
libstlport
ifndef NDK_ROOT
include external/stlport/libstlport.mk
endif
include $(BUILD_STATIC_LIBRARY)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,120 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* Specifies the interface for the AEC core.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_
#include "webrtc/typedefs.h"
#define FRAME_LEN 80
#define PART_LEN 64 // Length of partition
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
typedef float complex_t[2];
// For performance reasons, some arrays of complex numbers are replaced by twice
// as long arrays of float, all the real parts followed by all the imaginary
// ones (complex_t[SIZE] -> float[2][SIZE]). This allows SIMD optimizations and
// is better than two arrays (one for the real parts and one for the imaginary
// parts) as this other way would require two pointers instead of one and cause
// extra register spilling. This also allows the offsets to be calculated at
// compile time.
// Metrics
enum {
kOffsetLevel = -100
};
typedef struct Stats {
float instant;
float average;
float min;
float max;
float sum;
float hisum;
float himean;
int counter;
int hicounter;
} Stats;
typedef struct AecCore AecCore;
int WebRtcAec_CreateAec(AecCore** aec);
int WebRtcAec_FreeAec(AecCore* aec);
int WebRtcAec_InitAec(AecCore* aec, int sampFreq);
void WebRtcAec_InitAec_SSE2(void);
#if defined(MIPS_FPU_LE)
void WebRtcAec_InitAec_mips(void);
#endif
void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend);
void WebRtcAec_ProcessFrame(AecCore* aec,
const short* nearend,
const short* nearendH,
int knownDelay,
int16_t* out,
int16_t* outH);
// A helper function to call WebRtc_MoveReadPtr() for all far-end buffers.
// Returns the number of elements moved, and adjusts |system_delay| by the
// corresponding amount in ms.
int WebRtcAec_MoveFarReadPtr(AecCore* aec, int elements);
// Calculates the median and standard deviation among the delay estimates
// collected since the last call to this function.
int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std);
// Returns the echo state (1: echo, 0: no echo).
int WebRtcAec_echo_state(AecCore* self);
// Gets statistics of the echo metrics ERL, ERLE, A_NLP.
void WebRtcAec_GetEchoStats(AecCore* self,
Stats* erl,
Stats* erle,
Stats* a_nlp);
#ifdef WEBRTC_AEC_DEBUG_DUMP
void* WebRtcAec_far_time_buf(AecCore* self);
#endif
// Sets local configuration modes.
void WebRtcAec_SetConfigCore(AecCore* self,
int nlp_mode,
int metrics_mode,
int delay_logging);
// Non-zero enables, zero disables.
void WebRtcAec_enable_reported_delay(AecCore* self, int enable);
// Returns non-zero if reported delay is enabled and zero if disabled.
int WebRtcAec_reported_delay_enabled(AecCore* self);
// We now interpret delay correction to mean an extended filter length feature.
// We reuse the delay correction infrastructure to avoid changes through to
// libjingle. See details along with |DelayCorrection| in
// echo_cancellation_impl.h. Non-zero enables, zero disables.
void WebRtcAec_enable_delay_correction(AecCore* self, int enable);
// Returns non-zero if delay correction is enabled and zero if disabled.
int WebRtcAec_delay_correction_enabled(AecCore* self);
// Returns the current |system_delay|, i.e., the buffered difference between
// far-end and near-end.
int WebRtcAec_system_delay(AecCore* self);
// Sets the |system_delay| to |value|. Note that if the value is changed
// improperly, there can be a performance regression. So it should be used with
// care.
void WebRtcAec_SetSystemDelay(AecCore* self, int delay);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_H_

View File

@@ -0,0 +1,173 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_
#ifdef WEBRTC_AEC_DEBUG_DUMP
#include <stdio.h>
#endif
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
#include "webrtc/typedefs.h"
// Number of partitions for the extended filter mode. The first one is an enum
// to be used in array declarations, as it represents the maximum filter length.
enum {
kExtendedNumPartitions = 32
};
static const int kNormalNumPartitions = 12;
// Delay estimator constants, used for logging.
enum {
kMaxDelayBlocks = 60
};
enum {
kLookaheadBlocks = 15
};
enum {
kHistorySizeBlocks = kMaxDelayBlocks + kLookaheadBlocks
};
// Extended filter adaptation parameters.
// TODO(ajm): No narrowband tuning yet.
static const float kExtendedMu = 0.4f;
static const float kExtendedErrorThreshold = 1.0e-6f;
typedef struct PowerLevel {
float sfrsum;
int sfrcounter;
float framelevel;
float frsum;
int frcounter;
float minlevel;
float averagelevel;
} PowerLevel;
struct AecCore {
int farBufWritePos, farBufReadPos;
int knownDelay;
int inSamples, outSamples;
int delayEstCtr;
RingBuffer* nearFrBuf;
RingBuffer* outFrBuf;
RingBuffer* nearFrBufH;
RingBuffer* outFrBufH;
float dBuf[PART_LEN2]; // nearend
float eBuf[PART_LEN2]; // error
float dBufH[PART_LEN2]; // nearend
float xPow[PART_LEN1];
float dPow[PART_LEN1];
float dMinPow[PART_LEN1];
float dInitMinPow[PART_LEN1];
float* noisePow;
float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer
float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft
complex_t sde[PART_LEN1]; // cross-psd of nearend and error
complex_t sxd[PART_LEN1]; // cross-psd of farend and nearend
// Farend windowed fft buffer.
complex_t xfwBuf[kExtendedNumPartitions * PART_LEN1];
float sx[PART_LEN1], sd[PART_LEN1], se[PART_LEN1]; // far, near, error psd
float hNs[PART_LEN1];
float hNlFbMin, hNlFbLocalMin;
float hNlXdAvgMin;
int hNlNewMin, hNlMinCtr;
float overDrive, overDriveSm;
int nlp_mode;
float outBuf[PART_LEN];
int delayIdx;
short stNearState, echoState;
short divergeState;
int xfBufBlockPos;
RingBuffer* far_buf;
RingBuffer* far_buf_windowed;
int system_delay; // Current system delay buffered in AEC.
int mult; // sampling frequency multiple
int sampFreq;
uint32_t seed;
float normal_mu; // stepsize
float normal_error_threshold; // error threshold
int noiseEstCtr;
PowerLevel farlevel;
PowerLevel nearlevel;
PowerLevel linoutlevel;
PowerLevel nlpoutlevel;
int metricsMode;
int stateCounter;
Stats erl;
Stats erle;
Stats aNlp;
Stats rerl;
// Quantities to control H band scaling for SWB input
int freq_avg_ic; // initial bin for averaging nlp gain
int flag_Hband_cn; // for comfort noise
float cn_scale_Hband; // scale for comfort noise in H band
int delay_histogram[kHistorySizeBlocks];
int delay_logging_enabled;
void* delay_estimator_farend;
void* delay_estimator;
int reported_delay_enabled; // 0 = disabled, otherwise enabled.
// 1 = extended filter mode enabled, 0 = disabled.
int extended_filter_enabled;
// Runtime selection of number of filter partitions.
int num_partitions;
#ifdef WEBRTC_AEC_DEBUG_DUMP
RingBuffer* far_time_buf;
FILE* farFile;
FILE* nearFile;
FILE* outFile;
FILE* outLinearFile;
#endif
};
typedef void (*WebRtcAec_FilterFar_t)(AecCore* aec, float yf[2][PART_LEN1]);
extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
typedef void (*WebRtcAec_ScaleErrorSignal_t)(AecCore* aec,
float ef[2][PART_LEN1]);
extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
typedef void (*WebRtcAec_FilterAdaptation_t)(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]);
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAec_OverdriveAndSuppress_t)(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]);
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
typedef void (*WebRtcAec_ComfortNoise_t)(AecCore* aec,
float efw[2][PART_LEN1],
complex_t* comfortNoiseHband,
const float* noisePow,
const float* lambda);
extern WebRtcAec_ComfortNoise_t WebRtcAec_ComfortNoise;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_

View File

@@ -0,0 +1,774 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The core AEC algorithm, which is presented with time-aligned signals.
*/
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include <math.h>
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
static const int flagHbandCn = 1; // flag for adding comfort noise in H band
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
void WebRtcAec_ComfortNoise_mips(AecCore* aec,
float efw[2][PART_LEN1],
complex_t* comfortNoiseHband,
const float* noisePow,
const float* lambda) {
int i, num;
float rand[PART_LEN];
float noise, noiseAvg, tmp, tmpAvg;
int16_t randW16[PART_LEN];
complex_t u[PART_LEN1];
const float pi2 = 6.28318530717959f;
const float pi2t = pi2 / 32768;
// Generate a uniform random array on [0 1]
WebRtcSpl_RandUArray(randW16, PART_LEN, &aec->seed);
int16_t *randWptr = randW16;
float randTemp, randTemp2, randTemp3, randTemp4;
short tmp1s, tmp2s, tmp3s, tmp4s;
for (i = 0; i < PART_LEN; i+=4) {
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lh %[tmp1s], 0(%[randWptr]) \n\t"
"lh %[tmp2s], 2(%[randWptr]) \n\t"
"lh %[tmp3s], 4(%[randWptr]) \n\t"
"lh %[tmp4s], 6(%[randWptr]) \n\t"
"mtc1 %[tmp1s], %[randTemp] \n\t"
"mtc1 %[tmp2s], %[randTemp2] \n\t"
"mtc1 %[tmp3s], %[randTemp3] \n\t"
"mtc1 %[tmp4s], %[randTemp4] \n\t"
"cvt.s.w %[randTemp], %[randTemp] \n\t"
"cvt.s.w %[randTemp2], %[randTemp2] \n\t"
"cvt.s.w %[randTemp3], %[randTemp3] \n\t"
"cvt.s.w %[randTemp4], %[randTemp4] \n\t"
"addiu %[randWptr], %[randWptr], 8 \n\t"
"mul.s %[randTemp], %[randTemp], %[pi2t] \n\t"
"mul.s %[randTemp2], %[randTemp2], %[pi2t] \n\t"
"mul.s %[randTemp3], %[randTemp3], %[pi2t] \n\t"
"mul.s %[randTemp4], %[randTemp4], %[pi2t] \n\t"
".set pop \n\t"
: [randWptr] "+r" (randWptr), [randTemp] "=&f" (randTemp),
[randTemp2] "=&f" (randTemp2), [randTemp3] "=&f" (randTemp3),
[randTemp4] "=&f" (randTemp4), [tmp1s] "=&r" (tmp1s),
[tmp2s] "=&r" (tmp2s), [tmp3s] "=&r" (tmp3s),
[tmp4s] "=&r" (tmp4s)
: [pi2t] "f" (pi2t)
: "memory"
);
u[i+1][0] = (float)cos(randTemp);
u[i+1][1] = (float)sin(randTemp);
u[i+2][0] = (float)cos(randTemp2);
u[i+2][1] = (float)sin(randTemp2);
u[i+3][0] = (float)cos(randTemp3);
u[i+3][1] = (float)sin(randTemp3);
u[i+4][0] = (float)cos(randTemp4);
u[i+4][1] = (float)sin(randTemp4);
}
// Reject LF noise
float *u_ptr = &u[1][0];
float noise2, noise3, noise4;
float tmp1f, tmp2f, tmp3f, tmp4f, tmp5f, tmp6f, tmp7f, tmp8f;
u[0][0] = 0;
u[0][1] = 0;
for (i = 1; i < PART_LEN1; i+=4) {
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lwc1 %[noise], 4(%[noisePow]) \n\t"
"lwc1 %[noise2], 8(%[noisePow]) \n\t"
"lwc1 %[noise3], 12(%[noisePow]) \n\t"
"lwc1 %[noise4], 16(%[noisePow]) \n\t"
"sqrt.s %[noise], %[noise] \n\t"
"sqrt.s %[noise2], %[noise2] \n\t"
"sqrt.s %[noise3], %[noise3] \n\t"
"sqrt.s %[noise4], %[noise4] \n\t"
"lwc1 %[tmp1f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp2f], 4(%[u_ptr]) \n\t"
"lwc1 %[tmp3f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 12(%[u_ptr]) \n\t"
"lwc1 %[tmp5f], 16(%[u_ptr]) \n\t"
"lwc1 %[tmp6f], 20(%[u_ptr]) \n\t"
"lwc1 %[tmp7f], 24(%[u_ptr]) \n\t"
"lwc1 %[tmp8f], 28(%[u_ptr]) \n\t"
"addiu %[noisePow], %[noisePow], 16 \n\t"
"mul.s %[tmp1f], %[tmp1f], %[noise] \n\t"
"mul.s %[tmp2f], %[tmp2f], %[noise] \n\t"
"mul.s %[tmp3f], %[tmp3f], %[noise2] \n\t"
"mul.s %[tmp4f], %[tmp4f], %[noise2] \n\t"
"mul.s %[tmp5f], %[tmp5f], %[noise3] \n\t"
"mul.s %[tmp6f], %[tmp6f], %[noise3] \n\t"
"swc1 %[tmp1f], 0(%[u_ptr]) \n\t"
"swc1 %[tmp3f], 8(%[u_ptr]) \n\t"
"mul.s %[tmp8f], %[tmp8f], %[noise4] \n\t"
"mul.s %[tmp7f], %[tmp7f], %[noise4] \n\t"
"neg.s %[tmp2f] \n\t"
"neg.s %[tmp4f] \n\t"
"neg.s %[tmp6f] \n\t"
"neg.s %[tmp8f] \n\t"
"swc1 %[tmp5f], 16(%[u_ptr]) \n\t"
"swc1 %[tmp7f], 24(%[u_ptr]) \n\t"
"swc1 %[tmp2f], 4(%[u_ptr]) \n\t"
"swc1 %[tmp4f], 12(%[u_ptr]) \n\t"
"swc1 %[tmp6f], 20(%[u_ptr]) \n\t"
"swc1 %[tmp8f], 28(%[u_ptr]) \n\t"
"addiu %[u_ptr], %[u_ptr], 32 \n\t"
".set pop \n\t"
: [u_ptr] "+r" (u_ptr), [noisePow] "+r" (noisePow),
[noise] "=&f" (noise), [noise2] "=&f" (noise2),
[noise3] "=&f" (noise3), [noise4] "=&f" (noise4),
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f),
[tmp3f] "=&f" (tmp3f), [tmp4f] "=&f" (tmp4f),
[tmp5f] "=&f" (tmp5f), [tmp6f] "=&f" (tmp6f),
[tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f)
:
: "memory"
);
}
u[PART_LEN][1] = 0;
noisePow -= PART_LEN;
u_ptr = &u[0][0];
float *u_ptr_end = &u[PART_LEN][0];
float *efw_ptr_0 = &efw[0][0];
float *efw_ptr_1 = &efw[1][0];
float tmp9f, tmp10f;
const float tmp1c = 1.0;
const float tmp2c = 0.0;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[tmp1f], 0(%[lambda]) \n\t"
"lwc1 %[tmp6f], 4(%[lambda]) \n\t"
"addiu %[lambda], %[lambda], 8 \n\t"
"c.lt.s %[tmp1f], %[tmp1c] \n\t"
"bc1f 4f \n\t"
" nop \n\t"
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
"bc1f 3f \n\t"
" nop \n\t"
"2: \n\t"
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"b 5f \n\t"
" swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"3: \n\t"
"mul.s %[tmp1f], %[tmp1f], %[tmp1f] \n\t"
"sub.s %[tmp1f], %[tmp1c], %[tmp1f] \n\t"
"sqrt.s %[tmp1f], %[tmp1f] \n\t"
"lwc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp3f], 0(%[u_ptr]) \n\t"
"lwc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp5f], 4(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp1f], %[tmp3f] \n\t"
"add.s %[tmp2f], %[tmp2f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp1f], %[tmp5f] \n\t"
"add.s %[tmp4f], %[tmp4f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp2f], %[tmp2f], %[tmp1f], %[tmp3f] \n\t"
"madd.s %[tmp4f], %[tmp4f], %[tmp1f], %[tmp5f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp2f], 0(%[efw_ptr_0]) \n\t"
"b 5f \n\t"
" swc1 %[tmp4f], 0(%[efw_ptr_1]) \n\t"
"4: \n\t"
"c.lt.s %[tmp6f], %[tmp1c] \n\t"
"bc1f 5f \n\t"
" nop \n\t"
"mul.s %[tmp6f], %[tmp6f], %[tmp6f] \n\t"
"sub.s %[tmp6f], %[tmp1c], %[tmp6f] \n\t"
"sqrt.s %[tmp6f], %[tmp6f] \n\t"
"lwc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"lwc1 %[tmp8f], 8(%[u_ptr]) \n\t"
"lwc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"lwc1 %[tmp10f], 12(%[u_ptr]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[tmp3f], %[tmp6f], %[tmp8f] \n\t"
"add.s %[tmp7f], %[tmp7f], %[tmp3f] \n\t"
"mul.s %[tmp3f], %[tmp6f], %[tmp10f] \n\t"
"add.s %[tmp9f], %[tmp9f], %[tmp3f] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[tmp7f], %[tmp7f], %[tmp6f], %[tmp8f] \n\t"
"madd.s %[tmp9f], %[tmp9f], %[tmp6f], %[tmp10f] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[tmp7f], 4(%[efw_ptr_0]) \n\t"
"swc1 %[tmp9f], 4(%[efw_ptr_1]) \n\t"
"5: \n\t"
"addiu %[u_ptr], %[u_ptr], 16 \n\t"
"addiu %[efw_ptr_0], %[efw_ptr_0], 8 \n\t"
"bne %[u_ptr], %[u_ptr_end], 1b \n\t"
" addiu %[efw_ptr_1], %[efw_ptr_1], 8 \n\t"
".set pop \n\t"
: [lambda] "+r" (lambda), [u_ptr] "+r" (u_ptr),
[efw_ptr_0] "+r" (efw_ptr_0), [efw_ptr_1] "+r" (efw_ptr_1),
[tmp1f] "=&f" (tmp1f), [tmp2f] "=&f" (tmp2f), [tmp3f] "=&f" (tmp3f),
[tmp4f] "=&f" (tmp4f), [tmp5f] "=&f" (tmp5f),
[tmp6f] "=&f" (tmp6f), [tmp7f] "=&f" (tmp7f), [tmp8f] "=&f" (tmp8f),
[tmp9f] "=&f" (tmp9f), [tmp10f] "=&f" (tmp10f)
: [tmp1c] "f" (tmp1c), [tmp2c] "f" (tmp2c), [u_ptr_end] "r" (u_ptr_end)
: "memory"
);
lambda -= PART_LEN;
tmp = sqrtf(WEBRTC_SPL_MAX(1 - lambda[PART_LEN] * lambda[PART_LEN], 0));
//tmp = 1 - lambda[i];
efw[0][PART_LEN] += tmp * u[PART_LEN][0];
efw[1][PART_LEN] += tmp * u[PART_LEN][1];
// For H band comfort noise
// TODO: don't compute noise and "tmp" twice. Use the previous results.
noiseAvg = 0.0;
tmpAvg = 0.0;
num = 0;
if (aec->sampFreq == 32000 && flagHbandCn == 1) {
for (i = 0; i < PART_LEN; i++) {
rand[i] = ((float)randW16[i]) / 32768;
}
// average noise scale
// average over second half of freq spectrum (i.e., 4->8khz)
// TODO: we shouldn't need num. We know how many elements we're summing.
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
num++;
noiseAvg += sqrtf(noisePow[i]);
}
noiseAvg /= (float)num;
// average nlp scale
// average over second half of freq spectrum (i.e., 4->8khz)
// TODO: we shouldn't need num. We know how many elements we're summing.
num = 0;
for (i = PART_LEN1 >> 1; i < PART_LEN1; i++) {
num++;
tmpAvg += sqrtf(WEBRTC_SPL_MAX(1 - lambda[i] * lambda[i], 0));
}
tmpAvg /= (float)num;
// Use average noise for H band
// TODO: we should probably have a new random vector here.
// Reject LF noise
u[0][0] = 0;
u[0][1] = 0;
for (i = 1; i < PART_LEN1; i++) {
tmp = pi2 * rand[i - 1];
// Use average noise for H band
u[i][0] = noiseAvg * (float)cos(tmp);
u[i][1] = -noiseAvg * (float)sin(tmp);
}
u[PART_LEN][1] = 0;
for (i = 0; i < PART_LEN1; i++) {
// Use average NLP weight for H band
comfortNoiseHband[i][0] = tmpAvg * u[i][0];
comfortNoiseHband[i][1] = tmpAvg * u[i][1];
}
}
}
void WebRtcAec_FilterFar_mips(AecCore *aec, float yf[2][PART_LEN1]) {
int i;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * (PART_LEN1);
}
float *yf0 = yf[0];
float *yf1 = yf[1];
float *aRe = aec->xfBuf[0] + xPos;
float *aIm = aec->xfBuf[1] + xPos;
float *bRe = aec->wfBuf[0] + pos;
float *bIm = aec->wfBuf[1] + pos;
float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;
int len = PART_LEN1 >> 1;
int len1 = PART_LEN1 & 1;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 4(%[bRe]) \n\t"
"lwc1 %[f6], 4(%[bIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
"mul.s %[f9], %[f4], %[f5] \n\t"
"mul.s %[f4], %[f4], %[f6] \n\t"
"lwc1 %[f7], 4(%[aIm]) \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f12], %[f2], %[f3] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"mul.s %[f11], %[f6], %[f7] \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"sub.s %[f8], %[f8], %[f12] \n\t"
"mul.s %[f12], %[f7], %[f5] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"add.s %[f1], %[f0], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
"sub.s %[f9], %[f9], %[f11] \n\t"
"lwc1 %[f6], 4(%[yf0]) \n\t"
"add.s %[f4], %[f4], %[f12] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
"nmsub.s %[f9], %[f9], %[f6], %[f7] \n\t"
"lwc1 %[f6], 4(%[yf0]) \n\t"
"madd.s %[f4], %[f4], %[f7], %[f5] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"lwc1 %[f5], 4(%[yf1]) \n\t"
"add.s %[f2], %[f2], %[f8] \n\t"
"addiu %[bRe], %[bRe], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"add.s %[f3], %[f3], %[f1] \n\t"
"add.s %[f6], %[f6], %[f9] \n\t"
"add.s %[f5], %[f5], %[f4] \n\t"
"swc1 %[f2], 0(%[yf0]) \n\t"
"swc1 %[f3], 0(%[yf1]) \n\t"
"swc1 %[f6], 4(%[yf0]) \n\t"
"swc1 %[f5], 4(%[yf1]) \n\t"
"addiu %[yf0], %[yf0], 8 \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[yf1], %[yf1], 8 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f12], %[f2], %[f3] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"sub.s %[f8], %[f8], %[f12] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"add.s %[f1], %[f0], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
#else // #if !defined(MIPS32_R2_LE)
"nmsub.s %[f8], %[f8], %[f2], %[f3] \n\t"
"lwc1 %[f2], 0(%[yf0]) \n\t"
"madd.s %[f1], %[f0], %[f3], %[f1] \n\t"
"lwc1 %[f3], 0(%[yf1]) \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"add.s %[f2], %[f2], %[f8] \n\t"
"add.s %[f3], %[f3], %[f1] \n\t"
"swc1 %[f2], 0(%[yf0]) \n\t"
"swc1 %[f3], 0(%[yf1]) \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
[f12] "=&f" (f12), [f13] "=&f" (f13), [aRe] "+r" (aRe),
[aIm] "+r" (aIm), [bRe] "+r" (bRe), [bIm] "+r" (bIm),
[yf0] "+r" (yf0), [yf1] "+r" (yf1), [len] "+r" (len)
: [len1] "r" (len1)
: "memory"
);
}
}
void WebRtcAec_FilterAdaptation_mips(AecCore *aec,
float *fft,
float ef[2][PART_LEN1]) {
int i;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
int pos;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * PART_LEN1;
}
pos = i * PART_LEN1;
float *aRe = aec->xfBuf[0] + xPos;
float *aIm = aec->xfBuf[1] + xPos;
float *bRe = ef[0];
float *bIm = ef[1];
float *fft_tmp = fft;
float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
int len = PART_LEN >> 1;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 4(%[bRe]) \n\t"
"lwc1 %[f6], 4(%[bIm]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"addiu %[bRe], %[bRe], 8 \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
"mul.s %[f0], %[f0], %[f2] \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f9], %[f4], %[f5] \n\t"
"lwc1 %[f7], 4(%[aIm]) \n\t"
"mul.s %[f4], %[f4], %[f6] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f10], %[f3], %[f2] \n\t"
"mul.s %[f1], %[f3], %[f1] \n\t"
"mul.s %[f11], %[f7], %[f6] \n\t"
"mul.s %[f5], %[f7], %[f5] \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"add.s %[f8], %[f8], %[f10] \n\t"
"sub.s %[f1], %[f0], %[f1] \n\t"
"add.s %[f9], %[f9], %[f11] \n\t"
"sub.s %[f5], %[f4], %[f5] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"addiu %[aIm], %[aIm], 8 \n\t"
"addiu %[bIm], %[bIm], 8 \n\t"
"addiu %[len], %[len], -1 \n\t"
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
"nmsub.s %[f1], %[f0], %[f3], %[f1] \n\t"
"madd.s %[f9], %[f9], %[f7], %[f6] \n\t"
"nmsub.s %[f5], %[f4], %[f7], %[f5] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[f8], 0(%[fft_tmp]) \n\t"
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
"swc1 %[f9], 8(%[fft_tmp]) \n\t"
"swc1 %[f5], 12(%[fft_tmp]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[bRe]) \n\t"
"lwc1 %[f2], 0(%[bIm]) \n\t"
"lwc1 %[f3], 0(%[aIm]) \n\t"
"mul.s %[f8], %[f0], %[f1] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[f10], %[f3], %[f2] \n\t"
"add.s %[f8], %[f8], %[f10] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[f8], %[f8], %[f3], %[f2] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[f8], 4(%[fft]) \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [f8] "=&f" (f8),
[f9] "=&f" (f9), [f10] "=&f" (f10), [f11] "=&f" (f11),
[f12] "=&f" (f12), [aRe] "+r" (aRe), [aIm] "+r" (aIm),
[bRe] "+r" (bRe), [bIm] "+r" (bIm), [fft_tmp] "+r" (fft_tmp),
[len] "+r" (len), [fft] "=&r" (fft)
:
: "memory"
);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
float scale = 2.0f / PART_LEN2;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[fft_tmp], %[fft], 0 \n\t"
"addiu %[len], $zero, 8 \n\t"
"1: \n\t"
"addiu %[len], %[len], -1 \n\t"
"lwc1 %[f0], 0(%[fft_tmp]) \n\t"
"lwc1 %[f1], 4(%[fft_tmp]) \n\t"
"lwc1 %[f2], 8(%[fft_tmp]) \n\t"
"lwc1 %[f3], 12(%[fft_tmp]) \n\t"
"mul.s %[f0], %[f0], %[scale] \n\t"
"mul.s %[f1], %[f1], %[scale] \n\t"
"mul.s %[f2], %[f2], %[scale] \n\t"
"mul.s %[f3], %[f3], %[scale] \n\t"
"lwc1 %[f4], 16(%[fft_tmp]) \n\t"
"lwc1 %[f5], 20(%[fft_tmp]) \n\t"
"lwc1 %[f6], 24(%[fft_tmp]) \n\t"
"lwc1 %[f7], 28(%[fft_tmp]) \n\t"
"mul.s %[f4], %[f4], %[scale] \n\t"
"mul.s %[f5], %[f5], %[scale] \n\t"
"mul.s %[f6], %[f6], %[scale] \n\t"
"mul.s %[f7], %[f7], %[scale] \n\t"
"swc1 %[f0], 0(%[fft_tmp]) \n\t"
"swc1 %[f1], 4(%[fft_tmp]) \n\t"
"swc1 %[f2], 8(%[fft_tmp]) \n\t"
"swc1 %[f3], 12(%[fft_tmp]) \n\t"
"swc1 %[f4], 16(%[fft_tmp]) \n\t"
"swc1 %[f5], 20(%[fft_tmp]) \n\t"
"swc1 %[f6], 24(%[fft_tmp]) \n\t"
"swc1 %[f7], 28(%[fft_tmp]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[fft_tmp], %[fft_tmp], 32 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
[fft_tmp] "=&r" (fft_tmp)
: [scale] "f" (scale), [fft] "r" (fft)
: "memory"
);
}
aec_rdft_forward_128(fft);
aRe = aec->wfBuf[0] + pos;
aIm = aec->wfBuf[1] + pos;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[fft_tmp], %[fft], 0 \n\t"
"addiu %[len], $zero, 31 \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
"lwc1 %[f2], 256(%[aRe]) \n\t"
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
"lwc1 %[f6], 4(%[aIm]) \n\t"
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
"add.s %[f0], %[f0], %[f1] \n\t"
"add.s %[f2], %[f2], %[f3] \n\t"
"add.s %[f4], %[f4], %[f5] \n\t"
"add.s %[f6], %[f6], %[f7] \n\t"
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"swc1 %[f0], 0(%[aRe]) \n\t"
"swc1 %[f2], 256(%[aRe]) \n\t"
"swc1 %[f4], 4(%[aRe]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"swc1 %[f6], 4(%[aIm]) \n\t"
"addiu %[aIm], %[aIm], 8 \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[aRe]) \n\t"
"lwc1 %[f1], 0(%[fft_tmp]) \n\t"
"lwc1 %[f2], 0(%[aIm]) \n\t"
"lwc1 %[f3], 4(%[fft_tmp]) \n\t"
"lwc1 %[f4], 4(%[aRe]) \n\t"
"lwc1 %[f5], 8(%[fft_tmp]) \n\t"
"lwc1 %[f6], 4(%[aIm]) \n\t"
"lwc1 %[f7], 12(%[fft_tmp]) \n\t"
"add.s %[f0], %[f0], %[f1] \n\t"
"add.s %[f2], %[f2], %[f3] \n\t"
"add.s %[f4], %[f4], %[f5] \n\t"
"add.s %[f6], %[f6], %[f7] \n\t"
"addiu %[len], %[len], -1 \n\t"
"addiu %[fft_tmp], %[fft_tmp], 16 \n\t"
"swc1 %[f0], 0(%[aRe]) \n\t"
"swc1 %[f2], 0(%[aIm]) \n\t"
"swc1 %[f4], 4(%[aRe]) \n\t"
"addiu %[aRe], %[aRe], 8 \n\t"
"swc1 %[f6], 4(%[aIm]) \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[aIm], %[aIm], 8 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
[f3] "=&f" (f3), [f4] "=&f" (f4), [f5] "=&f" (f5),
[f6] "=&f" (f6), [f7] "=&f" (f7), [len] "=&r" (len),
[fft_tmp] "=&r" (fft_tmp)
: [aRe] "r" (aRe), [aIm] "r" (aIm), [fft] "r" (fft)
: "memory"
);
}
}
void WebRtcAec_OverdriveAndSuppress_mips(AecCore *aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const float one = 1.0;
float *p_hNl, *p_efw0, *p_efw1;
float *p_WebRtcAec_wC;
float temp1, temp2, temp3, temp4;
p_hNl = &hNl[0];
p_efw0 = &efw[0][0];
p_efw1 = &efw[1][0];
p_WebRtcAec_wC = (float*)&WebRtcAec_weightCurve[0];
for (i = 0; i < PART_LEN1; i++) {
// Weight subbands
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
"lwc1 %[temp2], 0(%[p_wC]) \n\t"
"c.lt.s %[hNlFb], %[temp1] \n\t"
"bc1f 1f \n\t"
" mul.s %[temp3], %[temp2], %[hNlFb] \n\t"
"sub.s %[temp4], %[one], %[temp2] \n\t"
#if !defined(MIPS32_R2_LE)
"mul.s %[temp1], %[temp1], %[temp4] \n\t"
"add.s %[temp1], %[temp3], %[temp1] \n\t"
#else // #if !defined(MIPS32_R2_LE)
"madd.s %[temp1], %[temp3], %[temp1], %[temp4] \n\t"
#endif // #if !defined(MIPS32_R2_LE)
"swc1 %[temp1], 0(%[p_hNl]) \n\t"
"1: \n\t"
"addiu %[p_wC], %[p_wC], 4 \n\t"
".set pop \n\t"
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
[temp4] "=&f" (temp4), [p_wC] "+r" (p_WebRtcAec_wC)
: [hNlFb] "f" (hNlFb), [one] "f" (one), [p_hNl] "r" (p_hNl)
: "memory"
);
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
__asm __volatile (
"lwc1 %[temp1], 0(%[p_hNl]) \n\t"
"lwc1 %[temp3], 0(%[p_efw1]) \n\t"
"lwc1 %[temp2], 0(%[p_efw0]) \n\t"
"addiu %[p_hNl], %[p_hNl], 4 \n\t"
"mul.s %[temp3], %[temp3], %[temp1] \n\t"
"mul.s %[temp2], %[temp2], %[temp1] \n\t"
"addiu %[p_efw0], %[p_efw0], 4 \n\t"
"addiu %[p_efw1], %[p_efw1], 4 \n\t"
"neg.s %[temp4], %[temp3] \n\t"
"swc1 %[temp2], -4(%[p_efw0]) \n\t"
"swc1 %[temp4], -4(%[p_efw1]) \n\t"
: [temp1] "=&f" (temp1), [temp2] "=&f" (temp2), [temp3] "=&f" (temp3),
[temp4] "=&f" (temp4), [p_efw0] "+r" (p_efw0), [p_efw1] "+r" (p_efw1),
[p_hNl] "+r" (p_hNl)
:
: "memory"
);
}
}
void WebRtcAec_ScaleErrorSignal_mips(AecCore *aec, float ef[2][PART_LEN1]) {
const float mu = aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled
? kExtendedErrorThreshold
: aec->normal_error_threshold;
int len = (PART_LEN1);
float *ef0 = ef[0];
float *ef1 = ef[1];
float *xPow = aec->xPow;
float fac1 = 1e-10f;
float err_th2 = error_threshold * error_threshold;
float f0, f1, f2;
#if !defined(MIPS32_R2_LE)
float f3;
#endif
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"lwc1 %[f0], 0(%[xPow]) \n\t"
"lwc1 %[f1], 0(%[ef0]) \n\t"
"lwc1 %[f2], 0(%[ef1]) \n\t"
"add.s %[f0], %[f0], %[fac1] \n\t"
"div.s %[f1], %[f1], %[f0] \n\t"
"div.s %[f2], %[f2], %[f0] \n\t"
"mul.s %[f0], %[f1], %[f1] \n\t"
#if defined(MIPS32_R2_LE)
"madd.s %[f0], %[f0], %[f2], %[f2] \n\t"
#else
"mul.s %[f3], %[f2], %[f2] \n\t"
"add.s %[f0], %[f0], %[f3] \n\t"
#endif
"c.le.s %[f0], %[err_th2] \n\t"
"nop \n\t"
"bc1t 2f \n\t"
" nop \n\t"
"sqrt.s %[f0], %[f0] \n\t"
"add.s %[f0], %[f0], %[fac1] \n\t"
"div.s %[f0], %[err_th], %[f0] \n\t"
"mul.s %[f1], %[f1], %[f0] \n\t"
"mul.s %[f2], %[f2], %[f0] \n\t"
"2: \n\t"
"mul.s %[f1], %[f1], %[mu] \n\t"
"mul.s %[f2], %[f2], %[mu] \n\t"
"swc1 %[f1], 0(%[ef0]) \n\t"
"swc1 %[f2], 0(%[ef1]) \n\t"
"addiu %[len], %[len], -1 \n\t"
"addiu %[xPow], %[xPow], 4 \n\t"
"addiu %[ef0], %[ef0], 4 \n\t"
"bgtz %[len], 1b \n\t"
" addiu %[ef1], %[ef1], 4 \n\t"
".set pop \n\t"
: [f0] "=&f" (f0), [f1] "=&f" (f1), [f2] "=&f" (f2),
#if !defined(MIPS32_R2_LE)
[f3] "=&f" (f3),
#endif
[xPow] "+r" (xPow), [ef0] "+r" (ef0), [ef1] "+r" (ef1),
[len] "+r" (len)
: [fac1] "f" (fac1), [err_th2] "f" (err_th2), [mu] "f" (mu),
[err_th] "f" (error_threshold)
: "memory"
);
}
void WebRtcAec_InitAec_mips(void)
{
WebRtcAec_FilterFar = WebRtcAec_FilterFar_mips;
WebRtcAec_FilterAdaptation = WebRtcAec_FilterAdaptation_mips;
WebRtcAec_ScaleErrorSignal = WebRtcAec_ScaleErrorSignal_mips;
WebRtcAec_ComfortNoise = WebRtcAec_ComfortNoise_mips;
WebRtcAec_OverdriveAndSuppress = WebRtcAec_OverdriveAndSuppress_mips;
}

View File

@@ -0,0 +1,431 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* The core AEC algorithm, SSE2 version of speed-critical functions.
*/
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include <emmintrin.h>
#include <math.h>
#include <string.h> // memset
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
return aRe * bRe - aIm * bIm;
}
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
return aRe * bIm + aIm * bRe;
}
static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
int i;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int j;
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * (PART_LEN1);
}
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
const __m128 wfBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
const __m128 wfBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
const __m128 yf_re = _mm_loadu_ps(&yf[0][j]);
const __m128 yf_im = _mm_loadu_ps(&yf[1][j]);
const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re);
const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im);
const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im);
const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re);
const __m128 e = _mm_sub_ps(a, b);
const __m128 f = _mm_add_ps(c, d);
const __m128 g = _mm_add_ps(yf_re, e);
const __m128 h = _mm_add_ps(yf_im, f);
_mm_storeu_ps(&yf[0][j], g);
_mm_storeu_ps(&yf[1][j], h);
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
}
}
}
static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
const __m128 k1e_10f = _mm_set1_ps(1e-10f);
const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
: _mm_set1_ps(aec->normal_mu);
const __m128 kThresh = aec->extended_filter_enabled
? _mm_set1_ps(kExtendedErrorThreshold)
: _mm_set1_ps(aec->normal_error_threshold);
int i;
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i += 4) {
const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);
const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
__m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
__m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
const __m128 absEf = _mm_sqrt_ps(ef_sum2);
const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
__m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
__m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
__m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
ef_re_if = _mm_and_ps(bigger, ef_re_if);
ef_im_if = _mm_and_ps(bigger, ef_im_if);
ef_re = _mm_andnot_ps(bigger, ef_re);
ef_im = _mm_andnot_ps(bigger, ef_im);
ef_re = _mm_or_ps(ef_re, ef_re_if);
ef_im = _mm_or_ps(ef_im, ef_im_if);
ef_re = _mm_mul_ps(ef_re, kMu);
ef_im = _mm_mul_ps(ef_im, kMu);
_mm_storeu_ps(&ef[0][i], ef_re);
_mm_storeu_ps(&ef[1][i], ef_im);
}
// scalar code for the remaining items.
{
const float mu =
aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled
? kExtendedErrorThreshold
: aec->normal_error_threshold;
for (; i < (PART_LEN1); i++) {
float abs_ef;
ef[0][i] /= (aec->xPow[i] + 1e-10f);
ef[1][i] /= (aec->xPow[i] + 1e-10f);
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
if (abs_ef > error_threshold) {
abs_ef = error_threshold / (abs_ef + 1e-10f);
ef[0][i] *= abs_ef;
ef[1][i] *= abs_ef;
}
// Stepsize factor
ef[0][i] *= mu;
ef[1][i] *= mu;
}
}
}
static void FilterAdaptationSSE2(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i, j;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef.
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
// Calculate the product of conjugate(xfBuf) by ef.
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe
const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
const __m128 e = _mm_add_ps(a, b);
const __m128 f = _mm_sub_ps(c, d);
// Interleave real and imaginary parts.
const __m128 g = _mm_unpacklo_ps(e, f);
const __m128 h = _mm_unpackhi_ps(e, f);
// Store
_mm_storeu_ps(&fft[2 * j + 0], g);
_mm_storeu_ps(&fft[2 * j + 4], h);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling
{
float scale = 2.0f / PART_LEN2;
const __m128 scale_ps = _mm_load_ps1(&scale);
for (j = 0; j < PART_LEN; j += 4) {
const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
_mm_storeu_ps(&fft[j], fft_scale);
}
}
aec_rdft_forward_128(fft);
{
float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j += 4) {
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
const __m128 fft_re =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 fft_im =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
_mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
}
aec->wfBuf[1][pos] = wt1;
}
}
}
static __m128 mm_pow_ps(__m128 a, __m128 b) {
// a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations.
__m128 log2_a, b_log2_a, a_exp_b;
// Calculate log2(x), x = a.
{
// To calculate log2(x), we decompose x like this:
// x = y * 2^n
// n is an integer
// y is in the [1.0, 2.0) range
//
// log2(x) = log2(y) + n
// n can be evaluated by playing with float representation.
// log2(y) in a small range can be approximated, this code uses an order
// five polynomial approximation. The coefficients have been
// estimated with the Remez algorithm and the resulting
// polynomial has a maximum relative error of 0.00086%.
// Compute n.
// This is done by masking the exponent, shifting it into the top bit of
// the mantissa, putting eight into the biased exponent (to shift/
// compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one
// from the mantissa by substracting it out.
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
0x43800000, 0x43800000, 0x43800000, 0x43800000};
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
static const int shift_exponent_into_top_mantissa = 8;
const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
_mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));
// Compute y.
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
const __m128 y =
_mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
static const ALIGN16_BEG float ALIGN16_END C5[4] = {
-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
static const ALIGN16_BEG float ALIGN16_END
C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
static const ALIGN16_BEG float ALIGN16_END
C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
static const ALIGN16_BEG float ALIGN16_END
C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
static const ALIGN16_BEG float ALIGN16_END
C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
static const ALIGN16_BEG float ALIGN16_END
C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
const __m128 y_minus_one =
_mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
// Combine parts.
log2_a = _mm_add_ps(n, log2_y);
}
// b * log2(a)
b_log2_a = _mm_mul_ps(b, log2_a);
// Calculate exp2(x), x = b * log2(a).
{
// To calculate 2^x, we decompose x like this:
// x = n + y
// n is an integer, the value of x - 0.5 rounded down, therefore
// y is in the [0.5, 1.5) range
//
// 2^x = 2^n * 2^y
// 2^n can be evaluated by playing with float representation.
// 2^y in a small range can be approximated, this code uses an order two
// polynomial approximation. The coefficients have been estimated
// with the Remez algorithm and the resulting polynomial has a
// maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129].
static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
129.f, 129.f};
static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
-126.99999f, -126.99999f, -126.99999f, -126.99999f};
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
// Compute n.
static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
0.5f, 0.5f};
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
// Compute 2^n.
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
127, 127, 127, 127};
static const int float_exponent_shift = 23;
const __m128i two_n_exponent =
_mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
const __m128 two_n =
_mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
// Compute y.
const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
static const ALIGN16_BEG float C2[4] ALIGN16_END = {
3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
static const ALIGN16_BEG float C1[4] ALIGN16_END = {
6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
1.0017247f, 1.0017247f};
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));
// Combine parts.
a_exp_b = _mm_mul_ps(exp2_y, two_n);
}
return a_exp_b;
}
extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65];
static void OverdriveAndSuppressSSE2(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb,
float efw[2][PART_LEN1]) {
int i;
const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
const __m128 vec_one = _mm_set1_ps(1.0f);
const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
// vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i += 4) {
// Weight subbands
__m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
const __m128 vec_one_weightCurve_hNl =
_mm_mul_ps(vec_one_weightCurve, vec_hNl);
const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
const __m128 vec_if1 = _mm_and_ps(
bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
vec_hNl = _mm_or_ps(vec_if0, vec_if1);
{
const __m128 vec_overDriveCurve =
_mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
const __m128 vec_overDriveSm_overDriveCurve =
_mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
_mm_storeu_ps(&hNl[i], vec_hNl);
}
// Suppress error signal
{
__m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
__m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
_mm_storeu_ps(&efw[0][i], vec_efw_re);
_mm_storeu_ps(&efw[1][i], vec_efw_im);
}
}
// scalar code for the remaining items.
for (; i < PART_LEN1; i++) {
// Weight subbands
if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i];
}
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
// Suppress error signal
efw[0][i] *= hNl[i];
efw[1][i] *= hNl[i];
// Ooura fft returns incorrect sign on imaginary component. It matters
// here because we are making an additive change with comfort noise.
efw[1][i] *= -1;
}
}
void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_FilterFar = FilterFarSSE2;
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
}

View File

@@ -0,0 +1,666 @@
/*
* http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
* Copyright Takuya OOURA, 1996-2001
*
* You may use, copy, modify and distribute this code for any purpose (include
* commercial use) and without fee. Please refer to this package when you modify
* this code.
*
* Changes by the WebRTC authors:
* - Trivial type modifications.
* - Minimal code subset to do rdft of length 128.
* - Optimizations because of known length.
*
* All changes are covered by the WebRTC license and IP grant:
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#include <math.h>
#include "webrtc/system_wrappers/interface/cpu_features_wrapper.h"
#include "webrtc/typedefs.h"
// constants shared by all paths (C, SSE2).
float rdft_w[64];
// constants used by the C path.
float rdft_wk3ri_first[32];
float rdft_wk3ri_second[32];
// constants used by SSE2 but initialized in C path.
ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
static int ip[16];
static void bitrv2_32(int* ip, float* a) {
const int n = 32;
int j, j1, k, k1, m, m2;
float xr, xi, yr, yi;
ip[0] = 0;
{
int l = n;
m = 1;
while ((m << 3) < l) {
l >>= 1;
for (j = 0; j < m; j++) {
ip[m + j] = ip[j] + l;
}
m <<= 1;
}
}
m2 = 2 * m;
for (k = 0; k < m; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 -= m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
j1 += m2;
k1 += 2 * m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
}
j1 = 2 * k + m2 + ip[k];
k1 = j1 + m2;
xr = a[j1];
xi = a[j1 + 1];
yr = a[k1];
yi = a[k1 + 1];
a[j1] = yr;
a[j1 + 1] = yi;
a[k1] = xr;
a[k1 + 1] = xi;
}
}
static void bitrv2_128_C(float* a) {
/*
Following things have been attempted but are no faster:
(a) Storing the swap indexes in a LUT (index calculations are done
for 'free' while waiting on memory/L1).
(b) Consolidate the load/store of two consecutive floats by a 64 bit
integer (execution is memory/L1 bound).
(c) Do a mix of floats and 64 bit integer to maximize register
utilization (execution is memory/L1 bound).
(d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
(e) Hard-coding of the offsets to completely eliminates index
calculations.
*/
unsigned int j, j1, k, k1;
float xr, xi, yr, yi;
static const int ip[4] = {0, 64, 32, 96};
for (k = 0; k < 4; k++) {
for (j = 0; j < k; j++) {
j1 = 2 * j + ip[k];
k1 = 2 * k + ip[j];
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 -= 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
j1 += 8;
k1 += 16;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
j1 = 2 * k + 8 + ip[k];
k1 = j1 + 8;
xr = a[j1 + 0];
xi = a[j1 + 1];
yr = a[k1 + 0];
yi = a[k1 + 1];
a[j1 + 0] = yr;
a[j1 + 1] = yi;
a[k1 + 0] = xr;
a[k1 + 1] = xi;
}
}
static void makewt_32(void) {
const int nw = 32;
int j, nwh;
float delta, x, y;
ip[0] = nw;
ip[1] = 1;
nwh = nw >> 1;
delta = atanf(1.0f) / nwh;
rdft_w[0] = 1;
rdft_w[1] = 0;
rdft_w[nwh] = cosf(delta * nwh);
rdft_w[nwh + 1] = rdft_w[nwh];
for (j = 2; j < nwh; j += 2) {
x = cosf(delta * j);
y = sinf(delta * j);
rdft_w[j] = x;
rdft_w[j + 1] = y;
rdft_w[nw - j] = y;
rdft_w[nw - j + 1] = x;
}
bitrv2_32(ip + 2, rdft_w);
// pre-calculate constants used by cft1st_128 and cftmdl_128...
cftmdl_wk1r[0] = rdft_w[2];
cftmdl_wk1r[1] = rdft_w[2];
cftmdl_wk1r[2] = rdft_w[2];
cftmdl_wk1r[3] = -rdft_w[2];
{
int k1;
for (k1 = 0, j = 0; j < 128; j += 16, k1 += 2) {
const int k2 = 2 * k1;
const float wk2r = rdft_w[k1 + 0];
const float wk2i = rdft_w[k1 + 1];
float wk1r, wk1i;
// ... scalar version.
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
rdft_wk3ri_first[k1 + 0] = wk1r - 2 * wk2i * wk1i;
rdft_wk3ri_first[k1 + 1] = 2 * wk2i * wk1r - wk1i;
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
rdft_wk3ri_second[k1 + 0] = wk1r - 2 * wk2r * wk1i;
rdft_wk3ri_second[k1 + 1] = 2 * wk2r * wk1r - wk1i;
// ... vector version.
rdft_wk1r[k2 + 0] = rdft_w[k2 + 0];
rdft_wk1r[k2 + 1] = rdft_w[k2 + 0];
rdft_wk1r[k2 + 2] = rdft_w[k2 + 2];
rdft_wk1r[k2 + 3] = rdft_w[k2 + 2];
rdft_wk2r[k2 + 0] = rdft_w[k1 + 0];
rdft_wk2r[k2 + 1] = rdft_w[k1 + 0];
rdft_wk2r[k2 + 2] = -rdft_w[k1 + 1];
rdft_wk2r[k2 + 3] = -rdft_w[k1 + 1];
rdft_wk3r[k2 + 0] = rdft_wk3ri_first[k1 + 0];
rdft_wk3r[k2 + 1] = rdft_wk3ri_first[k1 + 0];
rdft_wk3r[k2 + 2] = rdft_wk3ri_second[k1 + 0];
rdft_wk3r[k2 + 3] = rdft_wk3ri_second[k1 + 0];
rdft_wk1i[k2 + 0] = -rdft_w[k2 + 1];
rdft_wk1i[k2 + 1] = rdft_w[k2 + 1];
rdft_wk1i[k2 + 2] = -rdft_w[k2 + 3];
rdft_wk1i[k2 + 3] = rdft_w[k2 + 3];
rdft_wk2i[k2 + 0] = -rdft_w[k1 + 1];
rdft_wk2i[k2 + 1] = rdft_w[k1 + 1];
rdft_wk2i[k2 + 2] = -rdft_w[k1 + 0];
rdft_wk2i[k2 + 3] = rdft_w[k1 + 0];
rdft_wk3i[k2 + 0] = -rdft_wk3ri_first[k1 + 1];
rdft_wk3i[k2 + 1] = rdft_wk3ri_first[k1 + 1];
rdft_wk3i[k2 + 2] = -rdft_wk3ri_second[k1 + 1];
rdft_wk3i[k2 + 3] = rdft_wk3ri_second[k1 + 1];
}
}
}
static void makect_32(void) {
float* c = rdft_w + 32;
const int nc = 32;
int j, nch;
float delta;
ip[1] = nc;
nch = nc >> 1;
delta = atanf(1.0f) / nch;
c[0] = cosf(delta * nch);
c[nch] = 0.5f * c[0];
for (j = 1; j < nch; j++) {
c[j] = 0.5f * cosf(delta * j);
c[nc - j] = 0.5f * sinf(delta * j);
}
}
static void cft1st_128_C(float* a) {
const int n = 128;
int j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
x0r = a[0] + a[2];
x0i = a[1] + a[3];
x1r = a[0] - a[2];
x1i = a[1] - a[3];
x2r = a[4] + a[6];
x2i = a[5] + a[7];
x3r = a[4] - a[6];
x3i = a[5] - a[7];
a[0] = x0r + x2r;
a[1] = x0i + x2i;
a[4] = x0r - x2r;
a[5] = x0i - x2i;
a[2] = x1r - x3i;
a[3] = x1i + x3r;
a[6] = x1r + x3i;
a[7] = x1i - x3r;
wk1r = rdft_w[2];
x0r = a[8] + a[10];
x0i = a[9] + a[11];
x1r = a[8] - a[10];
x1i = a[9] - a[11];
x2r = a[12] + a[14];
x2i = a[13] + a[15];
x3r = a[12] - a[14];
x3i = a[13] - a[15];
a[8] = x0r + x2r;
a[9] = x0i + x2i;
a[12] = x2i - x0i;
a[13] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[10] = wk1r * (x0r - x0i);
a[11] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[14] = wk1r * (x0i - x0r);
a[15] = wk1r * (x0i + x0r);
k1 = 0;
for (j = 16; j < n; j += 16) {
k1 += 2;
k2 = 2 * k1;
wk2r = rdft_w[k1 + 0];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1];
x0r = a[j + 0] + a[j + 2];
x0i = a[j + 1] + a[j + 3];
x1r = a[j + 0] - a[j + 2];
x1i = a[j + 1] - a[j + 3];
x2r = a[j + 4] + a[j + 6];
x2i = a[j + 5] + a[j + 7];
x3r = a[j + 4] - a[j + 6];
x3i = a[j + 5] - a[j + 7];
a[j + 0] = x0r + x2r;
a[j + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 4] = wk2r * x0r - wk2i * x0i;
a[j + 5] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 2] = wk1r * x0r - wk1i * x0i;
a[j + 3] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 6] = wk3r * x0r - wk3i * x0i;
a[j + 7] = wk3r * x0i + wk3i * x0r;
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1];
x0r = a[j + 8] + a[j + 10];
x0i = a[j + 9] + a[j + 11];
x1r = a[j + 8] - a[j + 10];
x1i = a[j + 9] - a[j + 11];
x2r = a[j + 12] + a[j + 14];
x2i = a[j + 13] + a[j + 15];
x3r = a[j + 12] - a[j + 14];
x3i = a[j + 13] - a[j + 15];
a[j + 8] = x0r + x2r;
a[j + 9] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j + 12] = -wk2i * x0r - wk2r * x0i;
a[j + 13] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j + 10] = wk1r * x0r - wk1i * x0i;
a[j + 11] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j + 14] = wk3r * x0r - wk3i * x0i;
a[j + 15] = wk3r * x0i + wk3i * x0r;
}
}
static void cftmdl_128_C(float* a) {
const int l = 8;
const int n = 128;
const int m = 32;
int j0, j1, j2, j3, k, k1, k2, m2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
for (j0 = 0; j0 < l; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
a[j2 + 0] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1 + 0] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3 + 0] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
wk1r = rdft_w[2];
for (j0 = m; j0 < l + m; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
a[j2 + 0] = x2i - x0i;
a[j2 + 1] = x0r - x2r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * (x0r - x0i);
a[j1 + 1] = wk1r * (x0r + x0i);
x0r = x3i + x1r;
x0i = x3r - x1i;
a[j3 + 0] = wk1r * (x0i - x0r);
a[j3 + 1] = wk1r * (x0i + x0r);
}
k1 = 0;
m2 = 2 * m;
for (k = m2; k < n; k += m2) {
k1 += 2;
k2 = 2 * k1;
wk2r = rdft_w[k1 + 0];
wk2i = rdft_w[k1 + 1];
wk1r = rdft_w[k2 + 0];
wk1i = rdft_w[k2 + 1];
wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1];
for (j0 = k; j0 < l + k; j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2 + 0] = wk2r * x0r - wk2i * x0i;
a[j2 + 1] = wk2r * x0i + wk2i * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
wk1r = rdft_w[k2 + 2];
wk1i = rdft_w[k2 + 3];
wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1];
for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
j1 = j0 + 8;
j2 = j0 + 16;
j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0];
x0i = a[j0 + 1] + a[j1 + 1];
x1r = a[j0 + 0] - a[j1 + 0];
x1i = a[j0 + 1] - a[j1 + 1];
x2r = a[j2 + 0] + a[j3 + 0];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2 + 0] - a[j3 + 0];
x3i = a[j2 + 1] - a[j3 + 1];
a[j0 + 0] = x0r + x2r;
a[j0 + 1] = x0i + x2i;
x0r -= x2r;
x0i -= x2i;
a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
x0r = x1r - x3i;
x0i = x1i + x3r;
a[j1 + 0] = wk1r * x0r - wk1i * x0i;
a[j1 + 1] = wk1r * x0i + wk1i * x0r;
x0r = x1r + x3i;
x0i = x1i - x3r;
a[j3 + 0] = wk3r * x0r - wk3i * x0i;
a[j3 + 1] = wk3r * x0i + wk3i * x0r;
}
}
}
static void cftfsub_128_C(float* a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a);
cftmdl_128(a);
l = 32;
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = a[j + 1] + a[j1 + 1];
x1r = a[j] - a[j1];
x1i = a[j + 1] - a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i + x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i - x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i + x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i - x3r;
}
}
static void cftbsub_128_C(float* a) {
int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
cft1st_128(a);
cftmdl_128(a);
l = 32;
for (j = 0; j < l; j += 2) {
j1 = j + l;
j2 = j1 + l;
j3 = j2 + l;
x0r = a[j] + a[j1];
x0i = -a[j + 1] - a[j1 + 1];
x1r = a[j] - a[j1];
x1i = -a[j + 1] + a[j1 + 1];
x2r = a[j2] + a[j3];
x2i = a[j2 + 1] + a[j3 + 1];
x3r = a[j2] - a[j3];
x3i = a[j2 + 1] - a[j3 + 1];
a[j] = x0r + x2r;
a[j + 1] = x0i - x2i;
a[j2] = x0r - x2r;
a[j2 + 1] = x0i + x2i;
a[j1] = x1r - x3i;
a[j1 + 1] = x1i - x3r;
a[j3] = x1r + x3i;
a[j3 + 1] = x1i + x3r;
}
}
static void rftfsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr - wki * xi;
yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
static void rftbsub_128_C(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
a[1] = -a[1];
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr + wki * xi;
yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
void aec_rdft_forward_128(float* a) {
float xi;
bitrv2_128(a);
cftfsub_128(a);
rftfsub_128(a);
xi = a[0] - a[1];
a[0] += a[1];
a[1] = xi;
}
void aec_rdft_inverse_128(float* a) {
a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1];
rftbsub_128(a);
bitrv2_128(a);
cftbsub_128(a);
}
// code path selection
rft_sub_128_t cft1st_128;
rft_sub_128_t cftmdl_128;
rft_sub_128_t rftfsub_128;
rft_sub_128_t rftbsub_128;
rft_sub_128_t cftfsub_128;
rft_sub_128_t cftbsub_128;
rft_sub_128_t bitrv2_128;
void aec_rdft_init(void) {
cft1st_128 = cft1st_128_C;
cftmdl_128 = cftmdl_128_C;
rftfsub_128 = rftfsub_128_C;
rftbsub_128 = rftbsub_128_C;
cftfsub_128 = cftfsub_128_C;
cftbsub_128 = cftbsub_128_C;
bitrv2_128 = bitrv2_128_C;
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2)) {
aec_rdft_init_sse2();
}
#endif
#if defined(MIPS_FPU_LE)
aec_rdft_init_mips();
#endif
// init library constants.
makewt_32();
makect_32();
}

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
// These intrinsics were unavailable before VS 2008.
// TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
#include <emmintrin.h>
static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
#endif
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#else /* gcc or icc */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#endif
// constants shared by all paths (C, SSE2).
extern float rdft_w[64];
// constants used by the C path.
extern float rdft_wk3ri_first[32];
extern float rdft_wk3ri_second[32];
// constants used by SSE2 but initialized in C path.
extern ALIGN16_BEG float ALIGN16_END rdft_wk1r[32];
extern ALIGN16_BEG float ALIGN16_END rdft_wk2r[32];
extern ALIGN16_BEG float ALIGN16_END rdft_wk3r[32];
extern ALIGN16_BEG float ALIGN16_END rdft_wk1i[32];
extern ALIGN16_BEG float ALIGN16_END rdft_wk2i[32];
extern ALIGN16_BEG float ALIGN16_END rdft_wk3i[32];
extern ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
// code path selection function pointers
typedef void (*rft_sub_128_t)(float* a);
extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128;
extern rft_sub_128_t cft1st_128;
extern rft_sub_128_t cftmdl_128;
extern rft_sub_128_t cftfsub_128;
extern rft_sub_128_t cftbsub_128;
extern rft_sub_128_t bitrv2_128;
// entry points
void aec_rdft_init(void);
void aec_rdft_init_sse2(void);
void aec_rdft_forward_128(float* a);
void aec_rdft_inverse_128(float* a);
#if defined(MIPS_FPU_LE)
void aec_rdft_init_mips(void);
#endif
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,427 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
#include <emmintrin.h>
static const ALIGN16_BEG float ALIGN16_END
k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
static void cft1st_128_SSE2(float* a) {
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
__m128 a00v = _mm_loadu_ps(&a[j + 0]);
__m128 a04v = _mm_loadu_ps(&a[j + 4]);
__m128 a08v = _mm_loadu_ps(&a[j + 8]);
__m128 a12v = _mm_loadu_ps(&a[j + 12]);
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
__m128 x0v = _mm_add_ps(a01v, a23v);
const __m128 x1v = _mm_sub_ps(a01v, a23v);
const __m128 x2v = _mm_add_ps(a45v, a67v);
const __m128 x3v = _mm_sub_ps(a45v, a67v);
__m128 x0w;
a01v = _mm_add_ps(x0v, x2v);
x0v = _mm_sub_ps(x0v, x2v);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
{
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
a45v = _mm_add_ps(a45_0v, a45_1v);
}
{
__m128 a23_0v, a23_1v;
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
x0v = _mm_add_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
a23_0v = _mm_mul_ps(wk1rv, x0v);
a23_1v = _mm_mul_ps(wk1iv, x0w);
a23v = _mm_add_ps(a23_0v, a23_1v);
x0v = _mm_sub_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
}
{
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
a67v = _mm_add_ps(a67_0v, a67_1v);
}
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storeu_ps(&a[j + 0], a00v);
_mm_storeu_ps(&a[j + 4], a04v);
_mm_storeu_ps(&a[j + 8], a08v);
_mm_storeu_ps(&a[j + 12], a12v);
}
}
static void cftmdl_128_SSE2(float* a) {
const int l = 8;
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j0;
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
for (j0 = 0; j0 < l; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 yy0 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 yy1 =
_mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
const __m128 yy3 = _mm_add_ps(yy0, yy2);
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
a[j0 + 48] = -a[j0 + 48];
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
}
{
int k = 64;
int k1 = 2;
int k2 = 2 * k1;
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
for (j0 = k; j0 < l + k; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
const __m128 xx3 =
_mm_mul_ps(wk2iv,
_mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx4 = _mm_add_ps(xx2, xx3);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
const __m128 xx11 = _mm_mul_ps(
wk1iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx12 = _mm_add_ps(xx10, xx11);
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
const __m128 xx21 = _mm_mul_ps(
wk3iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
_MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx22 = _mm_add_ps(xx20, xx21);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
_mm_storel_epi64(
(__m128i*)&a[j0 + 32],
_mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
_mm_storel_epi64(
(__m128i*)&a[j0 + 48],
_mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
_mm_storel_epi64(
(__m128i*)&a[j0 + 40],
_mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
_mm_storel_epi64(
(__m128i*)&a[j0 + 56],
_mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
}
}
}
static void rftfsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr;
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] -= yr;
// a[j2 + 1] -= yi;
// a[k2 + 0] += yr;
// a[k2 + 1] -= yi;
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr - wki * xi;
yi = wkr * xi + wki * xr;
a[j2 + 0] -= yr;
a[j2 + 1] -= yi;
a[k2 + 0] += yr;
a[k2 + 1] -= yi;
}
}
static void rftbsub_128_SSE2(float* a) {
const float* c = rdft_w + 32;
int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END
k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half);
a[1] = -a[1];
// Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(
a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(
a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'.
// yr = wkr * xr + wki * xi;
// yi = wkr * xi - wki * xr;
const __m128 a_ = _mm_mul_ps(wkr_, xr_);
const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'.
// a[j2 + 0] = a[j2 + 0] - yr;
// a[j2 + 1] = yi - a[j2 + 1];
// a[k2 + 0] = yr + a[k2 + 0];
// a[k2 + 1] = yi - a[k2 + 1];
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
// Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(
a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(
a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n);
}
// Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2;
k1 = 32 - j1;
wkr = 0.5f - c[k1];
wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0];
xi = a[j2 + 1] + a[k2 + 1];
yr = wkr * xr + wki * xi;
yi = wkr * xi - wki * xr;
a[j2 + 0] = a[j2 + 0] - yr;
a[j2 + 1] = yi - a[j2 + 1];
a[k2 + 0] = yr + a[k2 + 0];
a[k2 + 1] = yi - a[k2 + 1];
}
a[65] = -a[65];
}
void aec_rdft_init_sse2(void) {
cft1st_128 = cft1st_128_SSE2;
cftmdl_128 = cftmdl_128_SSE2;
rftfsub_128 = rftfsub_128_SSE2;
rftbsub_128 = rftbsub_128_SSE2;
}

View File

@@ -0,0 +1,225 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
* clock skew by resampling the farend signal.
*/
#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "webrtc/modules/audio_processing/aec/aec_core.h"
enum {
kEstimateLengthFrames = 400
};
typedef struct {
short buffer[kResamplerBufferSize];
float position;
int deviceSampleRateHz;
int skewData[kEstimateLengthFrames];
int skewDataIndex;
float skewEstimate;
} resampler_t;
static int EstimateSkew(const int* rawSkew,
int size,
int absLimit,
float* skewEst);
int WebRtcAec_CreateResampler(void** resampInst) {
resampler_t* obj = malloc(sizeof(resampler_t));
*resampInst = obj;
if (obj == NULL) {
return -1;
}
return 0;
}
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
resampler_t* obj = (resampler_t*)resampInst;
memset(obj->buffer, 0, sizeof(obj->buffer));
obj->position = 0.0;
obj->deviceSampleRateHz = deviceSampleRateHz;
memset(obj->skewData, 0, sizeof(obj->skewData));
obj->skewDataIndex = 0;
obj->skewEstimate = 0.0;
return 0;
}
int WebRtcAec_FreeResampler(void* resampInst) {
resampler_t* obj = (resampler_t*)resampInst;
free(obj);
return 0;
}
void WebRtcAec_ResampleLinear(void* resampInst,
const short* inspeech,
int size,
float skew,
short* outspeech,
int* size_out) {
resampler_t* obj = (resampler_t*)resampInst;
short* y;
float be, tnew, interp;
int tn, mm;
assert(!(size < 0 || size > 2 * FRAME_LEN));
assert(resampInst != NULL);
assert(inspeech != NULL);
assert(outspeech != NULL);
assert(size_out != NULL);
// Add new frame data in lookahead
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
inspeech,
size * sizeof(short));
// Sample rate ratio
be = 1 + skew;
// Loop over input frame
mm = 0;
y = &obj->buffer[FRAME_LEN]; // Point at current frame
tnew = be * mm + obj->position;
tn = (int)tnew;
while (tn < size) {
// Interpolation
interp = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
if (interp > 32767) {
interp = 32767;
} else if (interp < -32768) {
interp = -32768;
}
outspeech[mm] = (short)interp;
mm++;
tnew = be * mm + obj->position;
tn = (int)tnew;
}
*size_out = mm;
obj->position += (*size_out) * be - size;
// Shift buffer
memmove(obj->buffer,
&obj->buffer[size],
(kResamplerBufferSize - size) * sizeof(short));
}
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
resampler_t* obj = (resampler_t*)resampInst;
int err = 0;
if (obj->skewDataIndex < kEstimateLengthFrames) {
obj->skewData[obj->skewDataIndex] = rawSkew;
obj->skewDataIndex++;
} else if (obj->skewDataIndex == kEstimateLengthFrames) {
err = EstimateSkew(
obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
obj->skewEstimate = *skewEst;
obj->skewDataIndex++;
} else {
*skewEst = obj->skewEstimate;
}
return err;
}
int EstimateSkew(const int* rawSkew,
int size,
int deviceSampleRateHz,
float* skewEst) {
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
int i = 0;
int n = 0;
float rawAvg = 0;
float err = 0;
float rawAbsDev = 0;
int upperLimit = 0;
int lowerLimit = 0;
float cumSum = 0;
float x = 0;
float x2 = 0;
float y = 0;
float xy = 0;
float xAvg = 0;
float denom = 0;
float skew = 0;
*skewEst = 0; // Set in case of error below.
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
n++;
rawAvg += rawSkew[i];
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
rawAvg /= n;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
err = rawSkew[i] - rawAvg;
rawAbsDev += err >= 0 ? err : -err;
}
}
assert(n > 0);
rawAbsDev /= n;
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
n = 0;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
n++;
cumSum += rawSkew[i];
x += n;
x2 += n * n;
y += cumSum;
xy += n * cumSum;
}
}
if (n == 0) {
return -1;
}
assert(n > 0);
xAvg = x / n;
denom = x2 - xAvg * x;
if (denom != 0) {
skew = (xy - xAvg * y) / denom;
}
*skewEst = skew;
return 0;
}

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_
#include "webrtc/modules/audio_processing/aec/aec_core.h"
enum {
kResamplingDelay = 1
};
enum {
kResamplerBufferSize = FRAME_LEN * 4
};
// Unless otherwise specified, functions return 0 on success and -1 on error
int WebRtcAec_CreateResampler(void** resampInst);
int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
int WebRtcAec_FreeResampler(void* resampInst);
// Estimates skew from raw measurement.
int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
// Resamples input using linear interpolation.
void WebRtcAec_ResampleLinear(void* resampInst,
const short* inspeech,
int size,
float skew,
short* outspeech,
int* size_out);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_

View File

@@ -0,0 +1,974 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* Contains the API functions for the AEC.
*/
#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
#include <math.h>
#ifdef WEBRTC_AEC_DEBUG_DUMP
#include <stdio.h>
#endif
#include <stdlib.h>
#include <string.h>
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include "webrtc/modules/audio_processing/aec/aec_resampler.h"
#include "webrtc/modules/audio_processing/aec/echo_cancellation_internal.h"
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
#include "webrtc/typedefs.h"
// Measured delays [ms]
// Device Chrome GTP
// MacBook Air 10
// MacBook Retina 10 100
// MacPro 30?
//
// Win7 Desktop 70 80?
// Win7 T430s 110
// Win8 T420s 70
//
// Daisy 50
// Pixel (w/ preproc?) 240
// Pixel (w/o preproc?) 110 110
// The extended filter mode gives us the flexibility to ignore the system's
// reported delays. We do this for platforms which we believe provide results
// which are incompatible with the AEC's expectations. Based on measurements
// (some provided above) we set a conservative (i.e. lower than measured)
// fixed delay.
//
// WEBRTC_UNTRUSTED_DELAY will only have an impact when |extended_filter_mode|
// is enabled. See the note along with |DelayCorrection| in
// echo_cancellation_impl.h for more details on the mode.
//
// Justification:
// Chromium/Mac: Here, the true latency is so low (~10-20 ms), that it plays
// havoc with the AEC's buffering. To avoid this, we set a fixed delay of 20 ms
// and then compensate by rewinding by 10 ms (in wideband) through
// kDelayDiffOffsetSamples. This trick does not seem to work for larger rewind
// values, but fortunately this is sufficient.
//
// Chromium/Linux(ChromeOS): The values we get on this platform don't correspond
// well to reality. The variance doesn't match the AEC's buffer changes, and the
// bulk values tend to be too low. However, the range across different hardware
// appears to be too large to choose a single value.
//
// GTP/Linux(ChromeOS): TBD, but for the moment we will trust the values.
#if defined(WEBRTC_CHROMIUM_BUILD) && defined(WEBRTC_MAC)
#define WEBRTC_UNTRUSTED_DELAY
#if defined(WEBRTC_MAC)
static const int kDelayDiffOffsetSamples = -160;
#else
// Not enabled for now.
static const int kDelayDiffOffsetSamples = 0;
#endif
#endif
#if defined(WEBRTC_MAC)
static const int kFixedDelayMs = 20;
#else
static const int kFixedDelayMs = 50;
#endif
#if !defined(WEBRTC_UNTRUSTED_DELAY)
static const int kMinTrustedDelayMs = 20;
#endif
static const int kMaxTrustedDelayMs = 500;
// Maximum length of resampled signal. Must be an integer multiple of frames
// (ceil(1/(1 + MIN_SKEW)*2) + 1)*FRAME_LEN
// The factor of 2 handles wb, and the + 1 is as a safety margin
// TODO(bjornv): Replace with kResamplerBufferSize
#define MAX_RESAMP_LEN (5 * FRAME_LEN)
static const int kMaxBufSizeStart = 62; // In partitions
static const int sampMsNb = 8; // samples per ms in nb
static const int initCheck = 42;
#ifdef WEBRTC_AEC_DEBUG_DUMP
int webrtc_aec_instance_count = 0;
#endif
// Estimates delay to set the position of the far-end buffer read pointer
// (controlled by knownDelay)
static void EstBufDelayNormal(aecpc_t* aecInst);
static void EstBufDelayExtended(aecpc_t* aecInst);
static int ProcessNormal(aecpc_t* self,
const int16_t* near,
const int16_t* near_high,
int16_t* out,
int16_t* out_high,
int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew);
static void ProcessExtended(aecpc_t* self,
const int16_t* near,
const int16_t* near_high,
int16_t* out,
int16_t* out_high,
int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew);
int32_t WebRtcAec_Create(void** aecInst) {
aecpc_t* aecpc;
if (aecInst == NULL) {
return -1;
}
aecpc = malloc(sizeof(aecpc_t));
*aecInst = aecpc;
if (aecpc == NULL) {
return -1;
}
if (WebRtcAec_CreateAec(&aecpc->aec) == -1) {
WebRtcAec_Free(aecpc);
aecpc = NULL;
return -1;
}
if (WebRtcAec_CreateResampler(&aecpc->resampler) == -1) {
WebRtcAec_Free(aecpc);
aecpc = NULL;
return -1;
}
// Create far-end pre-buffer. The buffer size has to be large enough for
// largest possible drift compensation (kResamplerBufferSize) + "almost" an
// FFT buffer (PART_LEN2 - 1).
aecpc->far_pre_buf =
WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, sizeof(float));
if (!aecpc->far_pre_buf) {
WebRtcAec_Free(aecpc);
aecpc = NULL;
return -1;
}
aecpc->initFlag = 0;
aecpc->lastError = 0;
#ifdef WEBRTC_AEC_DEBUG_DUMP
aecpc->far_pre_buf_s16 =
WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, sizeof(int16_t));
if (!aecpc->far_pre_buf_s16) {
WebRtcAec_Free(aecpc);
aecpc = NULL;
return -1;
}
{
char filename[64];
sprintf(filename, "aec_buf%d.dat", webrtc_aec_instance_count);
aecpc->bufFile = fopen(filename, "wb");
sprintf(filename, "aec_skew%d.dat", webrtc_aec_instance_count);
aecpc->skewFile = fopen(filename, "wb");
sprintf(filename, "aec_delay%d.dat", webrtc_aec_instance_count);
aecpc->delayFile = fopen(filename, "wb");
webrtc_aec_instance_count++;
}
#endif
return 0;
}
int32_t WebRtcAec_Free(void* aecInst) {
aecpc_t* aecpc = aecInst;
if (aecpc == NULL) {
return -1;
}
WebRtc_FreeBuffer(aecpc->far_pre_buf);
#ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_FreeBuffer(aecpc->far_pre_buf_s16);
fclose(aecpc->bufFile);
fclose(aecpc->skewFile);
fclose(aecpc->delayFile);
#endif
WebRtcAec_FreeAec(aecpc->aec);
WebRtcAec_FreeResampler(aecpc->resampler);
free(aecpc);
return 0;
}
int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq) {
aecpc_t* aecpc = aecInst;
AecConfig aecConfig;
if (sampFreq != 8000 && sampFreq != 16000 && sampFreq != 32000) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
aecpc->sampFreq = sampFreq;
if (scSampFreq < 1 || scSampFreq > 96000) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
aecpc->scSampFreq = scSampFreq;
// Initialize echo canceller core
if (WebRtcAec_InitAec(aecpc->aec, aecpc->sampFreq) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1;
}
if (WebRtcAec_InitResampler(aecpc->resampler, aecpc->scSampFreq) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1;
}
if (WebRtc_InitBuffer(aecpc->far_pre_buf) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1;
}
WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN); // Start overlap.
aecpc->initFlag = initCheck; // indicates that initialization has been done
if (aecpc->sampFreq == 32000) {
aecpc->splitSampFreq = 16000;
} else {
aecpc->splitSampFreq = sampFreq;
}
aecpc->delayCtr = 0;
aecpc->sampFactor = (aecpc->scSampFreq * 1.0f) / aecpc->splitSampFreq;
// Sampling frequency multiplier (SWB is processed as 160 frame size).
aecpc->rate_factor = aecpc->splitSampFreq / 8000;
aecpc->sum = 0;
aecpc->counter = 0;
aecpc->checkBuffSize = 1;
aecpc->firstVal = 0;
aecpc->startup_phase = WebRtcAec_reported_delay_enabled(aecpc->aec);
aecpc->bufSizeStart = 0;
aecpc->checkBufSizeCtr = 0;
aecpc->msInSndCardBuf = 0;
aecpc->filtDelay = -1; // -1 indicates an initialized state.
aecpc->timeForDelayChange = 0;
aecpc->knownDelay = 0;
aecpc->lastDelayDiff = 0;
aecpc->skewFrCtr = 0;
aecpc->resample = kAecFalse;
aecpc->highSkewCtr = 0;
aecpc->skew = 0;
aecpc->farend_started = 0;
// Default settings.
aecConfig.nlpMode = kAecNlpModerate;
aecConfig.skewMode = kAecFalse;
aecConfig.metricsMode = kAecFalse;
aecConfig.delay_logging = kAecFalse;
if (WebRtcAec_set_config(aecpc, aecConfig) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1;
}
#ifdef WEBRTC_AEC_DEBUG_DUMP
if (WebRtc_InitBuffer(aecpc->far_pre_buf_s16) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1;
}
WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN); // Start overlap.
#endif
return 0;
}
// only buffer L band for farend
int32_t WebRtcAec_BufferFarend(void* aecInst,
const int16_t* farend,
int16_t nrOfSamples) {
aecpc_t* aecpc = aecInst;
int32_t retVal = 0;
int newNrOfSamples = (int)nrOfSamples;
short newFarend[MAX_RESAMP_LEN];
const int16_t* farend_ptr = farend;
float tmp_farend[MAX_RESAMP_LEN];
const float* farend_float = tmp_farend;
float skew;
int i = 0;
if (farend == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (aecpc->initFlag != initCheck) {
aecpc->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
// number of samples == 160 for SWB input
if (nrOfSamples != 80 && nrOfSamples != 160) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
skew = aecpc->skew;
if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
// Resample and get a new number of samples
WebRtcAec_ResampleLinear(aecpc->resampler,
farend,
nrOfSamples,
skew,
newFarend,
&newNrOfSamples);
farend_ptr = (const int16_t*)newFarend;
}
aecpc->farend_started = 1;
WebRtcAec_SetSystemDelay(aecpc->aec,
WebRtcAec_system_delay(aecpc->aec) + newNrOfSamples);
#ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_WriteBuffer(
aecpc->far_pre_buf_s16, farend_ptr, (size_t)newNrOfSamples);
#endif
// Cast to float and write the time-domain data to |far_pre_buf|.
for (i = 0; i < newNrOfSamples; i++) {
tmp_farend[i] = (float)farend_ptr[i];
}
WebRtc_WriteBuffer(aecpc->far_pre_buf, farend_float, (size_t)newNrOfSamples);
// Transform to frequency domain if we have enough data.
while (WebRtc_available_read(aecpc->far_pre_buf) >= PART_LEN2) {
// We have enough data to pass to the FFT, hence read PART_LEN2 samples.
WebRtc_ReadBuffer(
aecpc->far_pre_buf, (void**)&farend_float, tmp_farend, PART_LEN2);
WebRtcAec_BufferFarendPartition(aecpc->aec, farend_float);
// Rewind |far_pre_buf| PART_LEN samples for overlap before continuing.
WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN);
#ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_ReadBuffer(
aecpc->far_pre_buf_s16, (void**)&farend_ptr, newFarend, PART_LEN2);
WebRtc_WriteBuffer(
WebRtcAec_far_time_buf(aecpc->aec), &farend_ptr[PART_LEN], 1);
WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN);
#endif
}
return retVal;
}
int32_t WebRtcAec_Process(void* aecInst,
const int16_t* nearend,
const int16_t* nearendH,
int16_t* out,
int16_t* outH,
int16_t nrOfSamples,
int16_t msInSndCardBuf,
int32_t skew) {
aecpc_t* aecpc = aecInst;
int32_t retVal = 0;
if (nearend == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (out == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (aecpc->initFlag != initCheck) {
aecpc->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
// number of samples == 160 for SWB input
if (nrOfSamples != 80 && nrOfSamples != 160) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
// Check for valid pointers based on sampling rate
if (aecpc->sampFreq == 32000 && nearendH == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (msInSndCardBuf < 0) {
msInSndCardBuf = 0;
aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
retVal = -1;
} else if (msInSndCardBuf > kMaxTrustedDelayMs) {
// The clamping is now done in ProcessExtended/Normal().
aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
retVal = -1;
}
// This returns the value of aec->extended_filter_enabled.
if (WebRtcAec_delay_correction_enabled(aecpc->aec)) {
ProcessExtended(
aecpc, nearend, nearendH, out, outH, nrOfSamples, msInSndCardBuf, skew);
} else {
if (ProcessNormal(aecpc,
nearend,
nearendH,
out,
outH,
nrOfSamples,
msInSndCardBuf,
skew) != 0) {
retVal = -1;
}
}
#ifdef WEBRTC_AEC_DEBUG_DUMP
{
int16_t far_buf_size_ms = (int16_t)(WebRtcAec_system_delay(aecpc->aec) /
(sampMsNb * aecpc->rate_factor));
(void)fwrite(&far_buf_size_ms, 2, 1, aecpc->bufFile);
(void)fwrite(
&aecpc->knownDelay, sizeof(aecpc->knownDelay), 1, aecpc->delayFile);
}
#endif
return retVal;
}
int WebRtcAec_set_config(void* handle, AecConfig config) {
aecpc_t* self = (aecpc_t*)handle;
if (self->initFlag != initCheck) {
self->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
if (config.skewMode != kAecFalse && config.skewMode != kAecTrue) {
self->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
self->skewMode = config.skewMode;
if (config.nlpMode != kAecNlpConservative &&
config.nlpMode != kAecNlpModerate &&
config.nlpMode != kAecNlpAggressive) {
self->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
if (config.metricsMode != kAecFalse && config.metricsMode != kAecTrue) {
self->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
if (config.delay_logging != kAecFalse && config.delay_logging != kAecTrue) {
self->lastError = AEC_BAD_PARAMETER_ERROR;
return -1;
}
WebRtcAec_SetConfigCore(
self->aec, config.nlpMode, config.metricsMode, config.delay_logging);
return 0;
}
int WebRtcAec_get_echo_status(void* handle, int* status) {
aecpc_t* self = (aecpc_t*)handle;
if (status == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (self->initFlag != initCheck) {
self->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
*status = WebRtcAec_echo_state(self->aec);
return 0;
}
int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics) {
const float kUpWeight = 0.7f;
float dtmp;
int stmp;
aecpc_t* self = (aecpc_t*)handle;
Stats erl;
Stats erle;
Stats a_nlp;
if (handle == NULL) {
return -1;
}
if (metrics == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (self->initFlag != initCheck) {
self->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
WebRtcAec_GetEchoStats(self->aec, &erl, &erle, &a_nlp);
// ERL
metrics->erl.instant = (int)erl.instant;
if ((erl.himean > kOffsetLevel) && (erl.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average.
dtmp = kUpWeight * erl.himean + (1 - kUpWeight) * erl.average;
metrics->erl.average = (int)dtmp;
} else {
metrics->erl.average = kOffsetLevel;
}
metrics->erl.max = (int)erl.max;
if (erl.min < (kOffsetLevel * (-1))) {
metrics->erl.min = (int)erl.min;
} else {
metrics->erl.min = kOffsetLevel;
}
// ERLE
metrics->erle.instant = (int)erle.instant;
if ((erle.himean > kOffsetLevel) && (erle.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average.
dtmp = kUpWeight * erle.himean + (1 - kUpWeight) * erle.average;
metrics->erle.average = (int)dtmp;
} else {
metrics->erle.average = kOffsetLevel;
}
metrics->erle.max = (int)erle.max;
if (erle.min < (kOffsetLevel * (-1))) {
metrics->erle.min = (int)erle.min;
} else {
metrics->erle.min = kOffsetLevel;
}
// RERL
if ((metrics->erl.average > kOffsetLevel) &&
(metrics->erle.average > kOffsetLevel)) {
stmp = metrics->erl.average + metrics->erle.average;
} else {
stmp = kOffsetLevel;
}
metrics->rerl.average = stmp;
// No other statistics needed, but returned for completeness.
metrics->rerl.instant = stmp;
metrics->rerl.max = stmp;
metrics->rerl.min = stmp;
// A_NLP
metrics->aNlp.instant = (int)a_nlp.instant;
if ((a_nlp.himean > kOffsetLevel) && (a_nlp.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average.
dtmp = kUpWeight * a_nlp.himean + (1 - kUpWeight) * a_nlp.average;
metrics->aNlp.average = (int)dtmp;
} else {
metrics->aNlp.average = kOffsetLevel;
}
metrics->aNlp.max = (int)a_nlp.max;
if (a_nlp.min < (kOffsetLevel * (-1))) {
metrics->aNlp.min = (int)a_nlp.min;
} else {
metrics->aNlp.min = kOffsetLevel;
}
return 0;
}
int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std) {
aecpc_t* self = handle;
if (median == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (std == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (self->initFlag != initCheck) {
self->lastError = AEC_UNINITIALIZED_ERROR;
return -1;
}
if (WebRtcAec_GetDelayMetricsCore(self->aec, median, std) == -1) {
// Logging disabled.
self->lastError = AEC_UNSUPPORTED_FUNCTION_ERROR;
return -1;
}
return 0;
}
int32_t WebRtcAec_get_error_code(void* aecInst) {
aecpc_t* aecpc = aecInst;
return aecpc->lastError;
}
AecCore* WebRtcAec_aec_core(void* handle) {
if (!handle) {
return NULL;
}
return ((aecpc_t*)handle)->aec;
}
static int ProcessNormal(aecpc_t* aecpc,
const int16_t* nearend,
const int16_t* nearendH,
int16_t* out,
int16_t* outH,
int16_t nrOfSamples,
int16_t msInSndCardBuf,
int32_t skew) {
int retVal = 0;
short i;
short nBlocks10ms;
short nFrames;
// Limit resampling to doubling/halving of signal
const float minSkewEst = -0.5f;
const float maxSkewEst = 1.0f;
msInSndCardBuf =
msInSndCardBuf > kMaxTrustedDelayMs ? kMaxTrustedDelayMs : msInSndCardBuf;
// TODO(andrew): we need to investigate if this +10 is really wanted.
msInSndCardBuf += 10;
aecpc->msInSndCardBuf = msInSndCardBuf;
if (aecpc->skewMode == kAecTrue) {
if (aecpc->skewFrCtr < 25) {
aecpc->skewFrCtr++;
} else {
retVal = WebRtcAec_GetSkew(aecpc->resampler, skew, &aecpc->skew);
if (retVal == -1) {
aecpc->skew = 0;
aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
}
aecpc->skew /= aecpc->sampFactor * nrOfSamples;
if (aecpc->skew < 1.0e-3 && aecpc->skew > -1.0e-3) {
aecpc->resample = kAecFalse;
} else {
aecpc->resample = kAecTrue;
}
if (aecpc->skew < minSkewEst) {
aecpc->skew = minSkewEst;
} else if (aecpc->skew > maxSkewEst) {
aecpc->skew = maxSkewEst;
}
#ifdef WEBRTC_AEC_DEBUG_DUMP
(void)fwrite(&aecpc->skew, sizeof(aecpc->skew), 1, aecpc->skewFile);
#endif
}
}
nFrames = nrOfSamples / FRAME_LEN;
nBlocks10ms = nFrames / aecpc->rate_factor;
if (aecpc->startup_phase) {
// Only needed if they don't already point to the same place.
if (nearend != out) {
memcpy(out, nearend, sizeof(short) * nrOfSamples);
}
if (nearendH != outH) {
memcpy(outH, nearendH, sizeof(short) * nrOfSamples);
}
// The AEC is in the start up mode
// AEC is disabled until the system delay is OK
// Mechanism to ensure that the system delay is reasonably stable.
if (aecpc->checkBuffSize) {
aecpc->checkBufSizeCtr++;
// Before we fill up the far-end buffer we require the system delay
// to be stable (+/-8 ms) compared to the first value. This
// comparison is made during the following 6 consecutive 10 ms
// blocks. If it seems to be stable then we start to fill up the
// far-end buffer.
if (aecpc->counter == 0) {
aecpc->firstVal = aecpc->msInSndCardBuf;
aecpc->sum = 0;
}
if (abs(aecpc->firstVal - aecpc->msInSndCardBuf) <
WEBRTC_SPL_MAX(0.2 * aecpc->msInSndCardBuf, sampMsNb)) {
aecpc->sum += aecpc->msInSndCardBuf;
aecpc->counter++;
} else {
aecpc->counter = 0;
}
if (aecpc->counter * nBlocks10ms >= 6) {
// The far-end buffer size is determined in partitions of
// PART_LEN samples. Use 75% of the average value of the system
// delay as buffer size to start with.
aecpc->bufSizeStart =
WEBRTC_SPL_MIN((3 * aecpc->sum * aecpc->rate_factor * 8) /
(4 * aecpc->counter * PART_LEN),
kMaxBufSizeStart);
// Buffer size has now been determined.
aecpc->checkBuffSize = 0;
}
if (aecpc->checkBufSizeCtr * nBlocks10ms > 50) {
// For really bad systems, don't disable the echo canceller for
// more than 0.5 sec.
aecpc->bufSizeStart = WEBRTC_SPL_MIN(
(aecpc->msInSndCardBuf * aecpc->rate_factor * 3) / 40,
kMaxBufSizeStart);
aecpc->checkBuffSize = 0;
}
}
// If |checkBuffSize| changed in the if-statement above.
if (!aecpc->checkBuffSize) {
// The system delay is now reasonably stable (or has been unstable
// for too long). When the far-end buffer is filled with
// approximately the same amount of data as reported by the system
// we end the startup phase.
int overhead_elements =
WebRtcAec_system_delay(aecpc->aec) / PART_LEN - aecpc->bufSizeStart;
if (overhead_elements == 0) {
// Enable the AEC
aecpc->startup_phase = 0;
} else if (overhead_elements > 0) {
// TODO(bjornv): Do we need a check on how much we actually
// moved the read pointer? It should always be possible to move
// the pointer |overhead_elements| since we have only added data
// to the buffer and no delay compensation nor AEC processing
// has been done.
WebRtcAec_MoveFarReadPtr(aecpc->aec, overhead_elements);
// Enable the AEC
aecpc->startup_phase = 0;
}
}
} else {
// AEC is enabled.
if (WebRtcAec_reported_delay_enabled(aecpc->aec)) {
EstBufDelayNormal(aecpc);
}
// Note that 1 frame is supported for NB and 2 frames for WB.
for (i = 0; i < nFrames; i++) {
// Call the AEC.
WebRtcAec_ProcessFrame(aecpc->aec,
&nearend[FRAME_LEN * i],
&nearendH[FRAME_LEN * i],
aecpc->knownDelay,
&out[FRAME_LEN * i],
&outH[FRAME_LEN * i]);
// TODO(bjornv): Re-structure such that we don't have to pass
// |aecpc->knownDelay| as input. Change name to something like
// |system_buffer_diff|.
}
}
return retVal;
}
static void ProcessExtended(aecpc_t* self,
const int16_t* near,
const int16_t* near_high,
int16_t* out,
int16_t* out_high,
int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew) {
int i;
const int num_frames = num_samples / FRAME_LEN;
#if defined(WEBRTC_UNTRUSTED_DELAY)
const int delay_diff_offset = kDelayDiffOffsetSamples;
reported_delay_ms = kFixedDelayMs;
#else
// This is the usual mode where we trust the reported system delay values.
const int delay_diff_offset = 0;
// Due to the longer filter, we no longer add 10 ms to the reported delay
// to reduce chance of non-causality. Instead we apply a minimum here to avoid
// issues with the read pointer jumping around needlessly.
reported_delay_ms = reported_delay_ms < kMinTrustedDelayMs
? kMinTrustedDelayMs
: reported_delay_ms;
// If the reported delay appears to be bogus, we attempt to recover by using
// the measured fixed delay values. We use >= here because higher layers
// may already clamp to this maximum value, and we would otherwise not
// detect it here.
reported_delay_ms = reported_delay_ms >= kMaxTrustedDelayMs
? kFixedDelayMs
: reported_delay_ms;
#endif
self->msInSndCardBuf = reported_delay_ms;
if (!self->farend_started) {
// Only needed if they don't already point to the same place.
if (near != out) {
memcpy(out, near, sizeof(short) * num_samples);
}
if (near_high != out_high) {
memcpy(out_high, near_high, sizeof(short) * num_samples);
}
return;
}
if (self->startup_phase) {
// In the extended mode, there isn't a startup "phase", just a special
// action on the first frame. In the trusted delay case, we'll take the
// current reported delay, unless it's less then our conservative
// measurement.
int startup_size_ms =
reported_delay_ms < kFixedDelayMs ? kFixedDelayMs : reported_delay_ms;
int overhead_elements = (WebRtcAec_system_delay(self->aec) -
startup_size_ms / 2 * self->rate_factor * 8) /
PART_LEN;
WebRtcAec_MoveFarReadPtr(self->aec, overhead_elements);
self->startup_phase = 0;
}
if (WebRtcAec_reported_delay_enabled(self->aec)) {
EstBufDelayExtended(self);
}
{
// |delay_diff_offset| gives us the option to manually rewind the delay on
// very low delay platforms which can't be expressed purely through
// |reported_delay_ms|.
const int adjusted_known_delay =
WEBRTC_SPL_MAX(0, self->knownDelay + delay_diff_offset);
for (i = 0; i < num_frames; ++i) {
WebRtcAec_ProcessFrame(self->aec,
&near[FRAME_LEN * i],
&near_high[FRAME_LEN * i],
adjusted_known_delay,
&out[FRAME_LEN * i],
&out_high[FRAME_LEN * i]);
}
}
}
static void EstBufDelayNormal(aecpc_t* aecpc) {
int nSampSndCard = aecpc->msInSndCardBuf * sampMsNb * aecpc->rate_factor;
int current_delay = nSampSndCard - WebRtcAec_system_delay(aecpc->aec);
int delay_difference = 0;
// Before we proceed with the delay estimate filtering we:
// 1) Compensate for the frame that will be read.
// 2) Compensate for drift resampling.
// 3) Compensate for non-causality if needed, since the estimated delay can't
// be negative.
// 1) Compensating for the frame(s) that will be read/processed.
current_delay += FRAME_LEN * aecpc->rate_factor;
// 2) Account for resampling frame delay.
if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
current_delay -= kResamplingDelay;
}
// 3) Compensate for non-causality, if needed, by flushing one block.
if (current_delay < PART_LEN) {
current_delay += WebRtcAec_MoveFarReadPtr(aecpc->aec, 1) * PART_LEN;
}
// We use -1 to signal an initialized state in the "extended" implementation;
// compensate for that.
aecpc->filtDelay = aecpc->filtDelay < 0 ? 0 : aecpc->filtDelay;
aecpc->filtDelay =
WEBRTC_SPL_MAX(0, (short)(0.8 * aecpc->filtDelay + 0.2 * current_delay));
delay_difference = aecpc->filtDelay - aecpc->knownDelay;
if (delay_difference > 224) {
if (aecpc->lastDelayDiff < 96) {
aecpc->timeForDelayChange = 0;
} else {
aecpc->timeForDelayChange++;
}
} else if (delay_difference < 96 && aecpc->knownDelay > 0) {
if (aecpc->lastDelayDiff > 224) {
aecpc->timeForDelayChange = 0;
} else {
aecpc->timeForDelayChange++;
}
} else {
aecpc->timeForDelayChange = 0;
}
aecpc->lastDelayDiff = delay_difference;
if (aecpc->timeForDelayChange > 25) {
aecpc->knownDelay = WEBRTC_SPL_MAX((int)aecpc->filtDelay - 160, 0);
}
}
static void EstBufDelayExtended(aecpc_t* self) {
int reported_delay = self->msInSndCardBuf * sampMsNb * self->rate_factor;
int current_delay = reported_delay - WebRtcAec_system_delay(self->aec);
int delay_difference = 0;
// Before we proceed with the delay estimate filtering we:
// 1) Compensate for the frame that will be read.
// 2) Compensate for drift resampling.
// 3) Compensate for non-causality if needed, since the estimated delay can't
// be negative.
// 1) Compensating for the frame(s) that will be read/processed.
current_delay += FRAME_LEN * self->rate_factor;
// 2) Account for resampling frame delay.
if (self->skewMode == kAecTrue && self->resample == kAecTrue) {
current_delay -= kResamplingDelay;
}
// 3) Compensate for non-causality, if needed, by flushing two blocks.
if (current_delay < PART_LEN) {
current_delay += WebRtcAec_MoveFarReadPtr(self->aec, 2) * PART_LEN;
}
if (self->filtDelay == -1) {
self->filtDelay = WEBRTC_SPL_MAX(0, 0.5 * current_delay);
} else {
self->filtDelay = WEBRTC_SPL_MAX(
0, (short)(0.95 * self->filtDelay + 0.05 * current_delay));
}
delay_difference = self->filtDelay - self->knownDelay;
if (delay_difference > 384) {
if (self->lastDelayDiff < 128) {
self->timeForDelayChange = 0;
} else {
self->timeForDelayChange++;
}
} else if (delay_difference < 128 && self->knownDelay > 0) {
if (self->lastDelayDiff > 384) {
self->timeForDelayChange = 0;
} else {
self->timeForDelayChange++;
}
} else {
self->timeForDelayChange = 0;
}
self->lastDelayDiff = delay_difference;
if (self->timeForDelayChange > 25) {
self->knownDelay = WEBRTC_SPL_MAX((int)self->filtDelay - 256, 0);
}
}

View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_
#include "webrtc/modules/audio_processing/aec/aec_core.h"
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
typedef struct {
int delayCtr;
int sampFreq;
int splitSampFreq;
int scSampFreq;
float sampFactor; // scSampRate / sampFreq
short skewMode;
int bufSizeStart;
int knownDelay;
int rate_factor;
short initFlag; // indicates if AEC has been initialized
// Variables used for averaging far end buffer size
short counter;
int sum;
short firstVal;
short checkBufSizeCtr;
// Variables used for delay shifts
short msInSndCardBuf;
short filtDelay; // Filtered delay estimate.
int timeForDelayChange;
int startup_phase;
int checkBuffSize;
short lastDelayDiff;
#ifdef WEBRTC_AEC_DEBUG_DUMP
RingBuffer* far_pre_buf_s16; // Time domain far-end pre-buffer in int16_t.
FILE* bufFile;
FILE* delayFile;
FILE* skewFile;
#endif
// Structures
void* resampler;
int skewFrCtr;
int resample; // if the skew is small enough we don't resample
int highSkewCtr;
float skew;
RingBuffer* far_pre_buf; // Time domain far-end pre-buffer.
int lastError;
int farend_started;
AecCore* aec;
} aecpc_t;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_ECHO_CANCELLATION_INTERNAL_H_

View File

@@ -0,0 +1,50 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
// TODO(bjornv): Make this a comprehensive test.
#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
#include <stdlib.h>
#include <time.h>
extern "C" {
#include "webrtc/modules/audio_processing/aec/aec_core.h"
}
#include "testing/gtest/include/gtest/gtest.h"
namespace webrtc {
TEST(EchoCancellationTest, CreateAndFreeHandlesErrors) {
EXPECT_EQ(-1, WebRtcAec_Create(NULL));
void* handle = NULL;
ASSERT_EQ(0, WebRtcAec_Create(&handle));
EXPECT_TRUE(handle != NULL);
EXPECT_EQ(-1, WebRtcAec_Free(NULL));
EXPECT_EQ(0, WebRtcAec_Free(handle));
}
TEST(EchoCancellationTest, ApplyAecCoreHandle) {
void* handle = NULL;
ASSERT_EQ(0, WebRtcAec_Create(&handle));
EXPECT_TRUE(handle != NULL);
EXPECT_TRUE(WebRtcAec_aec_core(NULL) == NULL);
AecCore* aec_core = WebRtcAec_aec_core(handle);
EXPECT_TRUE(aec_core != NULL);
// A simple test to verify that we can set and get a value from the lower
// level |aec_core| handle.
int delay = 111;
WebRtcAec_SetSystemDelay(aec_core, delay);
EXPECT_EQ(delay, WebRtcAec_system_delay(aec_core));
EXPECT_EQ(0, WebRtcAec_Free(handle));
}
} // namespace webrtc

View File

@@ -0,0 +1,256 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_
#include "webrtc/typedefs.h"
// Errors
#define AEC_UNSPECIFIED_ERROR 12000
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
#define AEC_UNINITIALIZED_ERROR 12002
#define AEC_NULL_POINTER_ERROR 12003
#define AEC_BAD_PARAMETER_ERROR 12004
// Warnings
#define AEC_BAD_PARAMETER_WARNING 12050
enum {
kAecNlpConservative = 0,
kAecNlpModerate,
kAecNlpAggressive
};
enum {
kAecFalse = 0,
kAecTrue
};
typedef struct {
int16_t nlpMode; // default kAecNlpModerate
int16_t skewMode; // default kAecFalse
int16_t metricsMode; // default kAecFalse
int delay_logging; // default kAecFalse
// float realSkew;
} AecConfig;
typedef struct {
int instant;
int average;
int max;
int min;
} AecLevel;
typedef struct {
AecLevel rerl;
AecLevel erl;
AecLevel erle;
AecLevel aNlp;
} AecMetrics;
struct AecCore;
#ifdef __cplusplus
extern "C" {
#endif
/*
* Allocates the memory needed by the AEC. The memory needs to be initialized
* separately using the WebRtcAec_Init() function.
*
* Inputs Description
* -------------------------------------------------------------------
* void** aecInst Pointer to the AEC instance to be created
* and initialized
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Create(void** aecInst);
/*
* This function releases the memory allocated by WebRtcAec_Create().
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Free(void* aecInst);
/*
* Initializes an AEC instance.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* int32_t sampFreq Sampling frequency of data
* int32_t scSampFreq Soundcard sampling frequency
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
/*
* Inserts an 80 or 160 sample block of data into the farend buffer.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* int16_t* farend In buffer containing one frame of
* farend signal for L band
* int16_t nrOfSamples Number of samples in farend buffer
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_BufferFarend(void* aecInst,
const int16_t* farend,
int16_t nrOfSamples);
/*
* Runs the echo canceller on an 80 or 160 sample blocks of data.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
* int16_t* nearend In buffer containing one frame of
* nearend+echo signal for L band
* int16_t* nearendH In buffer containing one frame of
* nearend+echo signal for H band
* int16_t nrOfSamples Number of samples in nearend buffer
* int16_t msInSndCardBuf Delay estimate for sound card and
* system buffers
* int16_t skew Difference between number of samples played
* and recorded at the soundcard (for clock skew
* compensation)
*
* Outputs Description
* -------------------------------------------------------------------
* int16_t* out Out buffer, one frame of processed nearend
* for L band
* int16_t* outH Out buffer, one frame of processed nearend
* for H band
* int32_t return 0: OK
* -1: error
*/
int32_t WebRtcAec_Process(void* aecInst,
const int16_t* nearend,
const int16_t* nearendH,
int16_t* out,
int16_t* outH,
int16_t nrOfSamples,
int16_t msInSndCardBuf,
int32_t skew);
/*
* This function enables the user to set certain parameters on-the-fly.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
* AecConfig config Config instance that contains all
* properties to be set
*
* Outputs Description
* -------------------------------------------------------------------
* int return 0: OK
* -1: error
*/
int WebRtcAec_set_config(void* handle, AecConfig config);
/*
* Gets the current echo status of the nearend signal.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int* status 0: Almost certainly nearend single-talk
* 1: Might not be neared single-talk
* int return 0: OK
* -1: error
*/
int WebRtcAec_get_echo_status(void* handle, int* status);
/*
* Gets the current echo metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* AecMetrics* metrics Struct which will be filled out with the
* current echo metrics.
* int return 0: OK
* -1: error
*/
int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics);
/*
* Gets the current delay metrics for the session.
*
* Inputs Description
* -------------------------------------------------------------------
* void* handle Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int* median Delay median value.
* int* std Delay standard deviation.
*
* int return 0: OK
* -1: error
*/
int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
/*
* Gets the last error code.
*
* Inputs Description
* -------------------------------------------------------------------
* void* aecInst Pointer to the AEC instance
*
* Outputs Description
* -------------------------------------------------------------------
* int32_t return 11000-11100: error code
*/
int32_t WebRtcAec_get_error_code(void* aecInst);
// Returns a pointer to the low level AEC handle.
//
// Input:
// - handle : Pointer to the AEC instance.
//
// Return value:
// - AecCore pointer : NULL for error.
//
struct AecCore* WebRtcAec_aec_core(void* handle);
#ifdef __cplusplus
}
#endif
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_INCLUDE_ECHO_CANCELLATION_H_

View File

@@ -0,0 +1,482 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "testing/gtest/include/gtest/gtest.h"
extern "C" {
#include "webrtc/modules/audio_processing/aec/aec_core.h"
}
#include "webrtc/modules/audio_processing/aec/echo_cancellation_internal.h"
#include "webrtc/modules/audio_processing/aec/include/echo_cancellation.h"
#include "webrtc/typedefs.h"
namespace {
class SystemDelayTest : public ::testing::Test {
protected:
SystemDelayTest();
virtual void SetUp();
virtual void TearDown();
// Initialization of AEC handle with respect to |sample_rate_hz|. Since the
// device sample rate is unimportant we set that value to 48000 Hz.
void Init(int sample_rate_hz);
// Makes one render call and one capture call in that specific order.
void RenderAndCapture(int device_buffer_ms);
// Fills up the far-end buffer with respect to the default device buffer size.
int BufferFillUp();
// Runs and verifies the behavior in a stable startup procedure.
void RunStableStartup();
// Maps buffer size in ms into samples, taking the unprocessed frame into
// account.
int MapBufferSizeToSamples(int size_in_ms);
void* handle_;
aecpc_t* self_;
int samples_per_frame_;
// Dummy input/output speech data.
int16_t far_[160];
int16_t near_[160];
int16_t out_[160];
};
SystemDelayTest::SystemDelayTest()
: handle_(NULL), self_(NULL), samples_per_frame_(0) {
// Dummy input data are set with more or less arbitrary non-zero values.
memset(far_, 1, sizeof(far_));
memset(near_, 2, sizeof(near_));
memset(out_, 0, sizeof(out_));
}
void SystemDelayTest::SetUp() {
ASSERT_EQ(0, WebRtcAec_Create(&handle_));
self_ = reinterpret_cast<aecpc_t*>(handle_);
}
void SystemDelayTest::TearDown() {
// Free AEC
ASSERT_EQ(0, WebRtcAec_Free(handle_));
handle_ = NULL;
}
// In SWB mode nothing is added to the buffer handling with respect to
// functionality compared to WB. We therefore only verify behavior in NB and WB.
static const int kSampleRateHz[] = {8000, 16000};
static const size_t kNumSampleRates =
sizeof(kSampleRateHz) / sizeof(*kSampleRateHz);
// Default audio device buffer size used.
static const int kDeviceBufMs = 100;
// Requirement for a stable device convergence time in ms. Should converge in
// less than |kStableConvergenceMs|.
static const int kStableConvergenceMs = 100;
// Maximum convergence time in ms. This means that we should leave the startup
// phase after |kMaxConvergenceMs| independent of device buffer stability
// conditions.
static const int kMaxConvergenceMs = 500;
void SystemDelayTest::Init(int sample_rate_hz) {
// Initialize AEC
EXPECT_EQ(0, WebRtcAec_Init(handle_, sample_rate_hz, 48000));
// One frame equals 10 ms of data.
samples_per_frame_ = sample_rate_hz / 100;
}
void SystemDelayTest::RenderAndCapture(int device_buffer_ms) {
EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
EXPECT_EQ(0,
WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
device_buffer_ms,
0));
}
int SystemDelayTest::BufferFillUp() {
// To make sure we have a full buffer when we verify stability we first fill
// up the far-end buffer with the same amount as we will report in through
// Process().
int buffer_size = 0;
for (int i = 0; i < kDeviceBufMs / 10; i++) {
EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
buffer_size += samples_per_frame_;
EXPECT_EQ(buffer_size, WebRtcAec_system_delay(self_->aec));
}
return buffer_size;
}
void SystemDelayTest::RunStableStartup() {
// To make sure we have a full buffer when we verify stability we first fill
// up the far-end buffer with the same amount as we will report in through
// Process().
int buffer_size = BufferFillUp();
// A stable device should be accepted and put in a regular process mode within
// |kStableConvergenceMs|.
int process_time_ms = 0;
for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) {
RenderAndCapture(kDeviceBufMs);
buffer_size += samples_per_frame_;
if (self_->startup_phase == 0) {
// We have left the startup phase.
break;
}
}
// Verify convergence time.
EXPECT_GT(kStableConvergenceMs, process_time_ms);
// Verify that the buffer has been flushed.
EXPECT_GE(buffer_size, WebRtcAec_system_delay(self_->aec));
}
int SystemDelayTest::MapBufferSizeToSamples(int size_in_ms) {
// The extra 10 ms corresponds to the unprocessed frame.
return (size_in_ms + 10) * samples_per_frame_ / 10;
}
// The tests should meet basic requirements and not be adjusted to what is
// actually implemented. If we don't get good code coverage this way we either
// lack in tests or have unnecessary code.
// General requirements:
// 1) If we add far-end data the system delay should be increased with the same
// amount we add.
// 2) If the far-end buffer is full we should flush the oldest data to make room
// for the new. In this case the system delay is unaffected.
// 3) There should exist a startup phase in which the buffer size is to be
// determined. In this phase no cancellation should be performed.
// 4) Under stable conditions (small variations in device buffer sizes) the AEC
// should determine an appropriate local buffer size within
// |kStableConvergenceMs| ms.
// 5) Under unstable conditions the AEC should make a decision within
// |kMaxConvergenceMs| ms.
// 6) If the local buffer runs out of data we should stuff the buffer with older
// frames.
// 7) The system delay should within |kMaxConvergenceMs| ms heal from
// disturbances like drift, data glitches, toggling events and outliers.
// 8) The system delay should never become negative.
TEST_F(SystemDelayTest, CorrectIncreaseWhenBufferFarend) {
// When we add data to the AEC buffer the internal system delay should be
// incremented with the same amount as the size of data.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
// Loop through a couple of calls to make sure the system delay increments
// correctly.
for (int j = 1; j <= 5; j++) {
EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
EXPECT_EQ(j * samples_per_frame_, WebRtcAec_system_delay(self_->aec));
}
}
}
// TODO(bjornv): Add a test to verify behavior if the far-end buffer is full
// when adding new data.
TEST_F(SystemDelayTest, CorrectDelayAfterStableStartup) {
// We run the system in a stable startup. After that we verify that the system
// delay meets the requirements.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
// Verify system delay with respect to requirements, i.e., the
// |system_delay| is in the interval [75%, 100%] of what's reported on the
// average.
int average_reported_delay = kDeviceBufMs * samples_per_frame_ / 10;
EXPECT_GE(average_reported_delay, WebRtcAec_system_delay(self_->aec));
EXPECT_LE(average_reported_delay * 3 / 4,
WebRtcAec_system_delay(self_->aec));
}
}
TEST_F(SystemDelayTest, CorrectDelayAfterUnstableStartup) {
// In an unstable system we would start processing after |kMaxConvergenceMs|.
// On the last frame the AEC buffer is adjusted to 60% of the last reported
// device buffer size.
// We construct an unstable system by altering the device buffer size between
// two values |kDeviceBufMs| +- 25 ms.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
// To make sure we have a full buffer when we verify stability we first fill
// up the far-end buffer with the same amount as we will report in on the
// average through Process().
int buffer_size = BufferFillUp();
int buffer_offset_ms = 25;
int reported_delay_ms = 0;
int process_time_ms = 0;
for (; process_time_ms <= kMaxConvergenceMs; process_time_ms += 10) {
reported_delay_ms = kDeviceBufMs + buffer_offset_ms;
RenderAndCapture(reported_delay_ms);
buffer_size += samples_per_frame_;
buffer_offset_ms = -buffer_offset_ms;
if (self_->startup_phase == 0) {
// We have left the startup phase.
break;
}
}
// Verify convergence time.
EXPECT_GE(kMaxConvergenceMs, process_time_ms);
// Verify that the buffer has been flushed.
EXPECT_GE(buffer_size, WebRtcAec_system_delay(self_->aec));
// Verify system delay with respect to requirements, i.e., the
// |system_delay| is in the interval [60%, 100%] of what's last reported.
EXPECT_GE(reported_delay_ms * samples_per_frame_ / 10,
WebRtcAec_system_delay(self_->aec));
EXPECT_LE(reported_delay_ms * samples_per_frame_ / 10 * 3 / 5,
WebRtcAec_system_delay(self_->aec));
}
}
TEST_F(SystemDelayTest, CorrectDelayAfterStableBufferBuildUp) {
// In this test we start by establishing the device buffer size during stable
// conditions, but with an empty internal far-end buffer. Once that is done we
// verify that the system delay is increased correctly until we have reach an
// internal buffer size of 75% of what's been reported.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
// We assume that running |kStableConvergenceMs| calls will put the
// algorithm in a state where the device buffer size has been determined. We
// can make that assumption since we have a separate stability test.
int process_time_ms = 0;
for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) {
EXPECT_EQ(0,
WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
kDeviceBufMs,
0));
}
// Verify that a buffer size has been established.
EXPECT_EQ(0, self_->checkBuffSize);
// We now have established the required buffer size. Let us verify that we
// fill up before leaving the startup phase for normal processing.
int buffer_size = 0;
int target_buffer_size = kDeviceBufMs * samples_per_frame_ / 10 * 3 / 4;
process_time_ms = 0;
for (; process_time_ms <= kMaxConvergenceMs; process_time_ms += 10) {
RenderAndCapture(kDeviceBufMs);
buffer_size += samples_per_frame_;
if (self_->startup_phase == 0) {
// We have left the startup phase.
break;
}
}
// Verify convergence time.
EXPECT_GT(kMaxConvergenceMs, process_time_ms);
// Verify that the buffer has reached the desired size.
EXPECT_LE(target_buffer_size, WebRtcAec_system_delay(self_->aec));
// Verify normal behavior (system delay is kept constant) after startup by
// running a couple of calls to BufferFarend() and Process().
for (int j = 0; j < 6; j++) {
int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
RenderAndCapture(kDeviceBufMs);
EXPECT_EQ(system_delay_before_calls, WebRtcAec_system_delay(self_->aec));
}
}
}
TEST_F(SystemDelayTest, CorrectDelayWhenBufferUnderrun) {
// Here we test a buffer under run scenario. If we keep on calling
// WebRtcAec_Process() we will finally run out of data, but should
// automatically stuff the buffer. We verify this behavior by checking if the
// system delay goes negative.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
// The AEC has now left the Startup phase. We now have at most
// |kStableConvergenceMs| in the buffer. Keep on calling Process() until
// we run out of data and verify that the system delay is non-negative.
for (int j = 0; j <= kStableConvergenceMs; j += 10) {
EXPECT_EQ(0,
WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
kDeviceBufMs,
0));
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
}
}
}
TEST_F(SystemDelayTest, CorrectDelayDuringDrift) {
// This drift test should verify that the system delay is never exceeding the
// device buffer. The drift is simulated by decreasing the reported device
// buffer size by 1 ms every 100 ms. If the device buffer size goes below 30
// ms we jump (add) 10 ms to give a repeated pattern.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
// We have now left the startup phase and proceed with normal processing.
int jump = 0;
for (int j = 0; j < 1000; j++) {
// Drift = -1 ms per 100 ms of data.
int device_buf_ms = kDeviceBufMs - (j / 10) + jump;
int device_buf = MapBufferSizeToSamples(device_buf_ms);
if (device_buf_ms < 30) {
// Add 10 ms data, taking affect next frame.
jump += 10;
}
RenderAndCapture(device_buf_ms);
// Verify that the system delay does not exceed the device buffer.
EXPECT_GE(device_buf, WebRtcAec_system_delay(self_->aec));
// Verify that the system delay is non-negative.
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
}
}
}
TEST_F(SystemDelayTest, ShouldRecoverAfterGlitch) {
// This glitch test should verify that the system delay recovers if there is
// a glitch in data. The data glitch is constructed as 200 ms of buffering
// after which the stable procedure continues. The glitch is never reported by
// the device.
// The system is said to be in a non-causal state if the difference between
// the device buffer and system delay is less than a block (64 samples).
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
int device_buf = MapBufferSizeToSamples(kDeviceBufMs);
// Glitch state.
for (int j = 0; j < 20; j++) {
EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
// No need to verify system delay, since that is done in a separate test.
}
// Verify that we are in a non-causal state, i.e.,
// |system_delay| > |device_buf|.
EXPECT_LT(device_buf, WebRtcAec_system_delay(self_->aec));
// Recover state. Should recover at least 4 ms of data per 10 ms, hence a
// glitch of 200 ms will take at most 200 * 10 / 4 = 500 ms to recover from.
bool non_causal = true; // We are currently in a non-causal state.
for (int j = 0; j < 50; j++) {
int system_delay_before = WebRtcAec_system_delay(self_->aec);
RenderAndCapture(kDeviceBufMs);
int system_delay_after = WebRtcAec_system_delay(self_->aec);
// We have recovered if |device_buf| - |system_delay_after| >= 64 (one
// block). During recovery |system_delay_after| < |system_delay_before|,
// otherwise they are equal.
if (non_causal) {
EXPECT_LT(system_delay_after, system_delay_before);
if (device_buf - system_delay_after >= 64) {
non_causal = false;
}
} else {
EXPECT_EQ(system_delay_before, system_delay_after);
}
// Verify that the system delay is non-negative.
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
}
// Check that we have recovered.
EXPECT_FALSE(non_causal);
}
}
TEST_F(SystemDelayTest, UnaffectedWhenSpuriousDeviceBufferValues) {
// This spurious device buffer data test aims at verifying that the system
// delay is unaffected by large outliers.
// The system is said to be in a non-causal state if the difference between
// the device buffer and system delay is less than a block (64 samples).
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
int device_buf = MapBufferSizeToSamples(kDeviceBufMs);
// Normal state. We are currently not in a non-causal state.
bool non_causal = false;
// Run 1 s and replace device buffer size with 500 ms every 100 ms.
for (int j = 0; j < 100; j++) {
int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
int device_buf_ms = kDeviceBufMs;
if (j % 10 == 0) {
device_buf_ms = 500;
}
RenderAndCapture(device_buf_ms);
// Check for non-causality.
if (device_buf - WebRtcAec_system_delay(self_->aec) < 64) {
non_causal = true;
}
EXPECT_FALSE(non_causal);
EXPECT_EQ(system_delay_before_calls, WebRtcAec_system_delay(self_->aec));
// Verify that the system delay is non-negative.
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
}
}
}
TEST_F(SystemDelayTest, CorrectImpactWhenTogglingDeviceBufferValues) {
// This test aims at verifying that the system delay is "unaffected" by
// toggling values reported by the device.
// The test is constructed such that every other device buffer value is zero
// and then 2 * |kDeviceBufMs|, hence the size is constant on the average. The
// zero values will force us into a non-causal state and thereby lowering the
// system delay until we basically runs out of data. Once that happens the
// buffer will be stuffed.
// TODO(bjornv): This test will have a better impact if we verified that the
// delay estimate goes up when the system delay goes done to meet the average
// device buffer size.
for (size_t i = 0; i < kNumSampleRates; i++) {
Init(kSampleRateHz[i]);
RunStableStartup();
int device_buf = MapBufferSizeToSamples(kDeviceBufMs);
// Normal state. We are currently not in a non-causal state.
bool non_causal = false;
// Loop through 100 frames (both render and capture), which equals 1 s of
// data. Every odd frame we set the device buffer size to 2 * |kDeviceBufMs|
// and even frames we set the device buffer size to zero.
for (int j = 0; j < 100; j++) {
int system_delay_before_calls = WebRtcAec_system_delay(self_->aec);
int device_buf_ms = 2 * (j % 2) * kDeviceBufMs;
RenderAndCapture(device_buf_ms);
// Check for non-causality, compared with the average device buffer size.
non_causal |= (device_buf - WebRtcAec_system_delay(self_->aec) < 64);
EXPECT_GE(system_delay_before_calls, WebRtcAec_system_delay(self_->aec));
// Verify that the system delay is non-negative.
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
}
// Verify we are not in a non-causal state.
EXPECT_FALSE(non_causal);
}
}
} // namespace