mayhem-firmware/firmware/baseband/dsp_decimate.cpp
2015-07-08 08:39:24 -07:00

461 lines
14 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (C) 2014 Jared Boone, ShareBrained Technology, Inc.
*
* This file is part of PortaPack.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#include "dsp_decimate.hpp"
#include <hal.h>
namespace dsp {
namespace decimate {
buffer_c16_t TranslateByFSOver4AndDecimateBy2CIC3::execute(buffer_c8_t src, buffer_c16_t dst) {
/* Translates incoming complex<int8_t> samples by -fs/4,
* decimates by two using a non-recursive third-order CIC filter.
*/
/* Derivation of algorithm:
* Original CIC filter (decimating by two):
* D_I0 = i3 * 1 + i2 * 3 + i1 * 3 + i0 * 1
* D_Q0 = q3 * 1 + q2 * 3 + q1 * 3 + q0 * 1
*
* D_I1 = i5 * 1 + i4 * 3 + i3 * 3 + i2 * 1
* D_Q1 = q5 * 1 + q4 * 3 + q3 * 3 + q2 * 1
*
* Translate -fs/4, phased 180 degrees, accomplished by complex multiplication
* of complex length-4 sequence:
*
* Substitute:
* i0 = -i0, q0 = -q0
* i1 = -q1, q1 = i1
* i2 = i2, q2 = q2
* i3 = q3, q3 = -i3
* i4 = -i4, q4 = -q4
* i5 = -q5, q5 = i5
*
* Resulting taps (with decimation by 2, four samples in, two samples out):
* D_I0 = q3 * 1 + i2 * 3 + -q1 * 3 + -i0 * 1
* D_Q0 = -i3 * 1 + q2 * 3 + i1 * 3 + -q0 * 1
*
* D_I1 = -q5 * 1 + -i4 * 3 + q3 * 3 + i2 * 1
* D_Q1 = i5 * 1 + -q4 * 3 + -i3 * 3 + q2 * 1
*/
// 6 cycles per complex input sample, not including loop overhead.
uint32_t q1_i0 = _q1_i0;
uint32_t q0_i1 = _q0_i1;
/* 3:1 Scaled by 32 to normalize output to +/-32768-ish. */
constexpr uint32_t scale_factor = 32;
const uint32_t k_3_1 = 0x00030001 * scale_factor;
uint32_t* src_p = reinterpret_cast<uint32_t*>(&src.p[0]);
uint32_t* const src_end = reinterpret_cast<uint32_t*>(&src.p[src.count]);
uint32_t* dst_p = reinterpret_cast<uint32_t*>(&dst.p[0]);
while(src_p < src_end) {
const uint32_t q3_i3_q2_i2 = *(src_p++); // 3
const uint32_t q5_i5_q4_i4 = *(src_p++);
const uint32_t i2_i3 = __SXTB16(q3_i3_q2_i2, 16); // 1: (q3_i3_q2_i2 ror 16)[23:16]:(q3_i3_q2_i2 ror 16)[7:0]
const uint32_t q3_q2 = __SXTB16(q3_i3_q2_i2, 8); // 1: (q3_i3_q2_i2 ror 8)[23:16]:(q3_i3_q2_i2 ror 8)[7:0]
const uint32_t i2_q3 = __PKHTB(i2_i3, q3_q2, 16); // 1: Rn[31:16]:(Rm>>16)[15:0]
const uint32_t i3_q2 = __PKHBT(q3_q2, i2_i3, 16); // 1:(Rm<<16)[31:16]:Rn[15:0]
// D_I0 = 3 * (i2 - q1) + (q3 - i0)
const uint32_t i2_m_q1_q3_m_i0 = __QSUB16(i2_q3, q1_i0); // 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
const uint32_t d_i0 = __SMUAD(k_3_1, i2_m_q1_q3_m_i0); // 1: Rm[15:0]*Rs[15:0]+Rm[31:16]*Rs[31:16]
// D_Q0 = 3 * (q2 + i1) - (i3 + q0)
const uint32_t i3_p_q0_q2_p_i1 = __QADD16(i3_q2, q0_i1); // 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
const uint32_t d_q0 = __SMUSDX(i3_p_q0_q2_p_i1, k_3_1); // 1: Rm[15:0]*Rs[31:16]Rm[31:16]*RsX[15:0]
const uint32_t d_q0_i0 = __PKHBT(d_i0, d_q0, 16); // 1: (Rm<<16)[31:16]:Rn[15:0]
const uint32_t i5_i4 = __SXTB16(q5_i5_q4_i4, 0); // 1: (q5_i5_q4_i4 ror 0)[23:16]:(q5_i5_q4_i4 ror 0)[7:0]
const uint32_t q4_q5 = __SXTB16(q5_i5_q4_i4, 24); // 1: (q5_i5_q4_i4 ror 24)[23:16]:(q5_i5_q4_i4 ror 24)[7:0]
const uint32_t q4_i5 = __PKHTB(q4_q5, i5_i4, 16); // 1: Rn[31:16]:(Rm>>16)[15:0]
const uint32_t q5_i4 = __PKHBT(i5_i4, q4_q5, 16); // 1: (Rm<<16)[31:16]:Rn[15:0]
// D_I1 = (i2 - q5) + 3 * (q3 - i4)
const uint32_t i2_m_q5_q3_m_i4 = __QSUB16(i2_q3, q5_i4); // 1: Rn[31:16]-Rm[31:16]:Rn[15:0]-Rm[15:0]
const uint32_t d_i1 = __SMUADX(i2_m_q5_q3_m_i4, k_3_1); // 1: Rm[15:0]*Rs[31:16]+Rm[31:16]*Rs[15:0]
// D_Q1 = (i5 + q2) - 3 * (q4 + i3)
const uint32_t q4_p_i3_i5_p_q2 = __QADD16(q4_i5, i3_q2); // 1: Rn[31:16]+Rm[31:16]:Rn[15:0]+Rm[15:0]
const uint32_t d_q1 = __SMUSD(k_3_1, q4_p_i3_i5_p_q2); // 1: Rm[15:0]*Rs[15:0]Rm[31:16]*Rs[31:16]
const uint32_t d_q1_i1 = __PKHBT(d_i1, d_q1, 16); // 1: (Rm<<16)[31:16]:Rn[15:0]
*(dst_p++) = d_q0_i0; // 3
*(dst_p++) = d_q1_i1;
q1_i0 = q5_i4;
q0_i1 = q4_i5;
}
_q1_i0 = q1_i0;
_q0_i1 = q0_i1;
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
buffer_c16_t DecimateBy2CIC3::execute(
buffer_c16_t src,
buffer_c16_t dst
) {
/* Complex non-recursive 3rd-order CIC filter (taps 1,3,3,1).
* Gain of 8.
* Consumes 16 bytes (4 s16:s16 samples) per loop iteration,
* Produces 8 bytes (2 s16:s16 samples) per loop iteration.
*/
uint32_t t1 = _iq0;
uint32_t t2 = _iq1;
uint32_t t3, t4;
const uint32_t taps = 0x00000003;
auto s = src.p;
auto d = dst.p;
const auto d_end = &dst.p[src.count / 2];
uint32_t i, q;
while(d < d_end) {
i = __SXTH(t1, 0); /* 1: I0 */
q = __SXTH(t1, 16); /* 1: Q0 */
i = __SMLABB(t2, taps, i); /* 1: I1*3 + I0 */
q = __SMLATB(t2, taps, q); /* 1: Q1*3 + Q0 */
t3 = *__SIMD32(s)++; /* 3: Q2:I2 */
t4 = *__SIMD32(s)++; /* Q3:I3 */
i = __SMLABB(t3, taps, i); /* 1: I2*3 + I1*3 + I0 */
q = __SMLATB(t3, taps, q); /* 1: Q2*3 + Q1*3 + Q0 */
int32_t si0 = __SXTAH(i, t4, 0); /* 1: I3 + Q2*3 + Q1*3 + Q0 */
int32_t sq0 = __SXTAH(q, t4, 16); /* 1: Q3 + Q2*3 + Q1*3 + Q0 */
i = __BFI(si0 / 8, sq0 / 8, 16, 16); /* 1: D2_Q0:D2_I0 */
*__SIMD32(d)++ = i; /* D2_Q0:D2_I0 */
i = __SXTH(t3, 0); /* 1: I2 */
q = __SXTH(t3, 16); /* 1: Q2 */
i = __SMLABB(t4, taps, i); /* 1: I3*3 + I2 */
q = __SMLATB(t4, taps, q); /* 1: Q3*3 + Q2 */
t1 = *__SIMD32(s)++; /* 3: Q4:I4 */
t2 = *__SIMD32(s)++; /* Q5:I5 */
i = __SMLABB(t1, taps, i); /* 1: I4*3 + I3*3 + I2 */
q = __SMLATB(t1, taps, q); /* 1: Q4*3 + Q3*3 + Q2 */
int32_t si1 = __SXTAH(i, t2, 0) ; /* 1: I5 + Q4*3 + Q3*3 + Q2 */
int32_t sq1 = __SXTAH(q, t2, 16); /* 1: Q5 + Q4*3 + Q3*3 + Q2 */
i = __BFI(si1 / 8, sq1 / 8, 16, 16); /* 1: D2_Q1:D2_I1 */
*__SIMD32(d)++ = i; /* D2_Q1:D2_I1 */
}
_iq0 = t1;
_iq1 = t2;
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
buffer_s16_t FIR64AndDecimateBy2Real::execute(
buffer_s16_t src,
buffer_s16_t dst
) {
/* int16_t input (sample count "n" must be multiple of 4)
* -> int16_t output, decimated by 2.
* taps are normalized to 1 << 16 == 1.0.
*/
auto src_p = src.p;
auto dst_p = dst.p;
int32_t n = src.count;
for(; n>0; n-=2) {
z[taps_count-2] = *(src_p++);
z[taps_count-1] = *(src_p++);
int32_t t = 0;
for(size_t j=0; j<taps_count; j+=4) {
t += z[j+0] * taps[j+0];
t += z[j+1] * taps[j+1];
t += z[j+2] * taps[j+2];
t += z[j+3] * taps[j+3];
z[j+0] = z[j+0+2];
z[j+1] = z[j+1+2];
z[j+2] = z[j+2+2];
z[j+3] = z[j+3+2];
}
*(dst_p++) = t / 65536;
}
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
#if 0
size_t fir_and_decimate_by_2_complex(
const complex16_t* const src_start,
const size_t src_count,
complex16_t* const dst_start,
complex16_t* const z,
const complex16_t* const taps,
const size_t taps_count
) {
/* int16_t input (sample count "n" must be multiple of 4)
* -> int16_t output, decimated by 2.
* taps are normalized to 1 << 16 == 1.0.
*/
auto src_p = src_start;
const auto src_end = &src_start[src_count];
auto dst_p = dst_start;
auto z_p = &z[0];
while(src_p < src_end) {
/* Put two new samples into delay buffer */
*__SIMD32(z_p)++ = *__SIMD32(src_p)++;
*__SIMD32(z_p)++ = *__SIMD32(src_p)++;
int64_t t_real = 0;
int64_t t_imag = 0;
auto t_p = &taps[0];
const auto z_end = &z[taps_count];
while(z_p < z_end) {
const auto tap0 = *__SIMD32(t_p)++;
const auto sample0 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample0, tap0, t_real);
t_imag = __SMLALDX(sample0, tap0, t_imag);
const auto tap1 = *__SIMD32(t_p)++;
const auto sample1 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample1, tap1, t_real);
t_imag = __SMLALDX(sample1, tap1, t_imag);
}
z_p = &z[0];
const auto t_end = &taps[taps_count];
while(t_p < t_end) {
const auto tap0 = *__SIMD32(t_p)++;
const auto sample0 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample0, tap0, t_real);
t_imag = __SMLALDX(sample0, tap0, t_imag);
const auto tap1 = *__SIMD32(t_p)++;
const auto sample1 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample1, tap1, t_real);
t_imag = __SMLALDX(sample1, tap1, t_imag);
}
if( z_p == z_end ) {
z_p = &z[0];
}
/* TODO: No rounding taking place here, so might be adding a bit of
* noise. Enough to be significant?
*/
*__SIMD32(dst_p)++ = __PKHBT(
t_real / 131072,
t_imag / 131072,
16
);
/*
*__SIMD32(dst_p)++ = __PKHBT(
__SSAT((t_real / 131072), 16),
__SSAT((t_imag / 131072), 16),
16
);
*/
}
return src_count / 2;
}
#endif
size_t fir_and_decimate_by_2_complex_fast(
const complex16_t* const src_start,
const size_t src_count,
complex16_t* const dst_start,
complex16_t* const z,
const complex16_t* const taps,
const size_t taps_count
) {
/* int16_t input (sample count "n" must be multiple of 4)
* -> int16_t output, decimated by 2.
* taps are normalized to 1 << 16 == 1.0.
*/
auto src_p = src_start;
auto dst_p = dst_start;
auto z_new_p = &z[0];
auto t_p = &taps[taps_count * 2];
while(src_p < &src_start[src_count]) {
/* Put two new samples into delay buffer */
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
*__SIMD32(z_new_p)++ = *__SIMD32(src_p)++;
t_p -= (taps_count + 2);
if( z_new_p == &z[taps_count] ) {
z_new_p = &z[0];
t_p = &taps[taps_count];
}
int64_t t_real = 0;
int64_t t_imag = 0;
auto z_p = &z[0];
while(z_p < &z[taps_count]) {
const auto tap0 = *__SIMD32(t_p)++;
const auto sample0 = *__SIMD32(z_p)++;
const auto tap1 = *__SIMD32(t_p)++;
const auto sample1 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample0, tap0, t_real);
t_imag = __SMLALDX(sample0, tap0, t_imag);
t_real = __SMLSLD(sample1, tap1, t_real);
t_imag = __SMLALDX(sample1, tap1, t_imag);
const auto tap2 = *__SIMD32(t_p)++;
const auto sample2 = *__SIMD32(z_p)++;
const auto tap3 = *__SIMD32(t_p)++;
const auto sample3 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample2, tap2, t_real);
t_imag = __SMLALDX(sample2, tap2, t_imag);
t_real = __SMLSLD(sample3, tap3, t_real);
t_imag = __SMLALDX(sample3, tap3, t_imag);
const auto tap4 = *__SIMD32(t_p)++;
const auto sample4 = *__SIMD32(z_p)++;
const auto tap5 = *__SIMD32(t_p)++;
const auto sample5 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample4, tap4, t_real);
t_imag = __SMLALDX(sample4, tap4, t_imag);
t_real = __SMLSLD(sample5, tap5, t_real);
t_imag = __SMLALDX(sample5, tap5, t_imag);
const auto tap6 = *__SIMD32(t_p)++;
const auto sample6 = *__SIMD32(z_p)++;
const auto tap7 = *__SIMD32(t_p)++;
const auto sample7 = *__SIMD32(z_p)++;
t_real = __SMLSLD(sample6, tap6, t_real);
t_imag = __SMLALDX(sample6, tap6, t_imag);
t_real = __SMLSLD(sample7, tap7, t_real);
t_imag = __SMLALDX(sample7, tap7, t_imag);
}
/* TODO: Re-evaluate whether saturation is performed, normalization,
* all that jazz.
*/
const int32_t r = t_real >> 16;
const int32_t i = t_imag >> 16;
const int32_t r_sat = __SSAT(r, 16);
const int32_t i_sat = __SSAT(i, 16);
*__SIMD32(dst_p)++ = __PKHBT(
r_sat,
i_sat,
16
);
}
return src_count / 2;
}
buffer_s16_t DecimateBy2CIC4Real::execute(
buffer_s16_t src,
buffer_s16_t dst
) {
auto src_p = src.p;
auto dst_p = dst.p;
int32_t n = src.count;
for(; n>0; n-=2) {
/* TODO: Probably a lot of room to optimize... */
z[0] = z[2];
z[1] = z[3];
z[2] = z[4];
z[3] = *(src_p++);
z[4] = *(src_p++);
int32_t t = z[0] + z[1] * 4 + z[2] * 6 + z[3] * 4 + z[4];
*(dst_p++) = t / 16;
}
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
#if 0
buffer_c16_t DecimateBy2HBF5Complex::execute(
buffer_c16_t const src,
buffer_c16_t const dst
) {
auto src_p = src.p;
auto dst_p = dst.p;
int32_t n = src.count;
for(; n>0; n-=2) {
/* TODO: Probably a lot of room to optimize... */
z[0] = z[2];
//z[1] = z[3];
z[2] = z[4];
//z[3] = z[5];
z[4] = z[6];
z[5] = z[7];
z[6] = z[8];
z[7] = z[9];
z[8] = z[10];
z[9] = *(src_p++);
z[10] = *(src_p++);
int32_t t_real { z[5].real * 256 };
int32_t t_imag { z[5].imag * 256 };
t_real += (z[ 0].real + z[10].real) * 3;
t_imag += (z[ 0].imag + z[10].imag) * 3;
t_real -= (z[ 2].real + z[ 8].real) * 25;
t_imag -= (z[ 2].imag + z[ 8].imag) * 25;
t_real += (z[ 4].real + z[ 6].real) * 150;
t_imag += (z[ 4].imag + z[ 6].imag) * 150;
*(dst_p++) = { t_real / 256, t_imag / 256 };
}
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
buffer_c16_t DecimateBy2HBF7Complex::execute(
buffer_c16_t const src,
buffer_c16_t const dst
) {
auto src_p = src.p;
auto dst_p = dst.p;
int32_t n = src.count;
for(; n>0; n-=2) {
/* TODO: Probably a lot of room to optimize... */
z[0] = z[2];
//z[1] = z[3];
z[2] = z[4];
//z[3] = z[5];
z[4] = z[6];
z[5] = z[7];
z[6] = z[8];
z[7] = z[9];
z[8] = z[10];
z[9] = *(src_p++);
z[10] = *(src_p++);
int32_t t_real { z[5].real * 512 };
int32_t t_imag { z[5].imag * 512 };
t_real += (z[ 0].real + z[10].real) * 7;
t_imag += (z[ 0].imag + z[10].imag) * 7;
t_real -= (z[ 2].real + z[ 8].real) * 53;
t_imag -= (z[ 2].imag + z[ 8].imag) * 53;
t_real += (z[ 4].real + z[ 6].real) * 302;
t_imag += (z[ 4].imag + z[ 6].imag) * 302;
*(dst_p++) = { t_real / 512, t_imag / 512 };
}
return { dst.p, src.count / 2, src.sampling_rate / 2 };
}
#endif
} /* namespace decimate */
} /* namespace dsp */