mirror of
https://github.com/oxen-io/session-android.git
synced 2024-12-18 14:07:30 +00:00
216 lines
7.2 KiB
ArmAsm
216 lines
7.2 KiB
ArmAsm
@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ This file contains the function WebRtcSpl_DownsampleFastNeon(), optimized for
@ ARM Neon platform. The description header can be found in
@ signal_processing_library.h
@
@ The reference C code is in file downsample_fast.c. Bit-exact.

#include "webrtc/system_wrappers/interface/asm_defines.h"
|
||
|
|
||
|
@ -----------------------------------------------------------------------------
@ WebRtcSpl_DownsampleFastNeon
@
@ FIR-filters data_in with coefficients[] and keeps one result every `factor`
@ input samples, starting at input index `delay`. Each output sample is
@
@   sat16((2048 + sum_j coefficients[j] * data_in[i - j]) >> 12)
@
@ for i = delay, delay + factor, ... while i < endpos, where
@
@   endpos = delay + factor * (data_out_length - 1) + 1.
@
@ Arguments on entry (stack offsets are relative to sp *after* the 32-byte
@ push {r4-r11} at the top of the function):
@   r0          data_in
@   r1          data_in_length
@   r2          output pointer (16-bit samples are stored sequentially)
@   r3          data_out_length
@   [sp, #32]   coefficients
@   [sp, #36]   coefficients_length
@   [sp, #40]   factor
@   [sp, #44]   delay
@
@ Returns in r0:
@    0  on success.
@   -1  if data_out_length <= 0, coefficients_length <= 0, or
@       endpos > data_in_length.
@
@ Register roles once set-up is complete:
@   r0   coefficients_length
@   r2   output write pointer
@   r3   loop bound (endpos - factor * 7 in part 1, endpos in part 2)
@   r5   factor (temporarily factor * 2 inside the general unrolled loop)
@   r8   &coefficients[coefficients_length]
@   r9   i, input index of the next output sample
@   r11  &data_in[i]
@ Scratch: r1, r4, r6, r7, r10, r12, q0-q3 and d16-d21.
@
@ NOTE(review): delay is loaded with ldrsh and the product
@ factor * (data_out_length - 1) is formed with smulbb, so only the low
@ halfwords of those values take part. Confirm that callers never pass values
@ outside the signed 16-bit range.
@
@ NOTE(review): in the factor == 2 / factor == 4 loops the pair of vld2/vld4
@ loads fetches 16 / 32 consecutive halfwords starting at the current tap
@ address. When data_out_length is a multiple of 8 the last unrolled
@ iteration therefore appears to read up to 1 (factor 2) or 3 (factor 4)
@ samples beyond data_in[endpos - 1]. The extra lanes are never used in the
@ arithmetic, but confirm that input buffers tolerate the over-read.
@ -----------------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_DownsampleFastNeon
.align 2
DEFINE_FUNCTION WebRtcSpl_DownsampleFastNeon
  push {r4-r11}                             @ 32 bytes; stack args start at sp + 32.

  cmp r3, #0                                @ data_out_length <= 0?
  movle r0, #-1
  ble END

  ldrsh r12, [sp, #44]                      @ r12: delay (signed halfword load)
  ldr r5, [sp, #40]                         @ r5: factor
  add r4, r12, #1                           @ r4: delay + 1
  sub r3, r3, #1                            @ r3: data_out_length - 1
  smulbb r3, r5, r3                         @ r3: factor * (data_out_length - 1)
  ldr r8, [sp, #32]                         @ r8: &coefficients[0]
  mov r9, r12                               @ r9: outer-loop index i, starts at delay.
  add r3, r4                                @ r3: endpos

  cmp r3, r1                                @ endpos > data_in_length?
  movgt r0, #-1
  bgt END

  @ Initializations.
  sub r3, r5, asl #3                        @ r3: endpos - factor * 8
  add r11, r0, r12, asl #1                  @ r11: &data_in[delay]
  ldr r0, [sp, #36]                         @ r0: coefficients_length
  add r3, r5                                @ r3: endpos - factor * 7

  cmp r0, #0                                @ coefficients_length <= 0 ?
  movle r0, #-1
  ble END

  add r8, r0, asl #1                        @ r8: &coefficients[coefficients_length]
  cmp r9, r3
  bge POST_LOOP_ENDPOS                      @ Fewer than 8 outputs: skip part 1.

@
@ First part, unroll the loop 8 times, with 3 subcases (factor == 2, 4, others)
@
  mov r4, #-2                               @ Post-index step: walk coefficients[]
                                            @ backwards, one halfword at a time.

  @ Direct program flow to the right channel.

  @ r10 is an offset to &data_in[] in the loop. The first vld2 advances the
  @ pointer by 16 bytes; the second one then moves it back by 14 bytes, so the
  @ net step per coefficient is 2 bytes, i.e. one sample forward.
  cmp r5, #2
  moveq r10, #-14
  beq LOOP_ENDPOS_FACTOR2                   @ Branch when factor == 2

  @ Similar here: the first vld4 advances the pointer by 32 bytes and the
  @ second one moves it back by 30 bytes, for a net step of one sample.
  cmp r5, #4
  moveq r10, #-30
  beq LOOP_ENDPOS_FACTOR4                   @ Branch when factor == 4

  @ For r10, we need to move the pointer back to original after advancing
  @ (factor * 7 * 2) bytes, then move 2 bytes forward to increment one sample.
  mov r10, r5, asl #4                       @ r10 = factor * 16
  rsb r10, #2                               @ r10 = 2 - factor * 16
  add r10, r5, asl #1                       @ r10 = 2 - factor * 14
  lsl r5, #1                                @ r5 = factor * sizeof(data_in[0])

@ The general case (factor != 2 && factor != 4)
LOOP_ENDPOS_GENERAL:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Accumulators for outputs 0-3 ...
  vmov.i32 q3, #2048                        @ ... and 4-7, preset to the rounding term.
  sub r7, r8, #2                            @ r7: &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_GENERAL:
  vld1.16 {d2[], d3[]}, [r7], r4            @ coefficients[j], replicated to all lanes
  vld1.16 d0[0], [r1], r5                   @ data_in[i - j]
  vld1.16 d0[1], [r1], r5                   @ data_in[i + factor - j]
  vld1.16 d0[2], [r1], r5                   @ data_in[i + factor * 2 - j]
  vld1.16 d0[3], [r1], r5                   @ data_in[i + factor * 3 - j]
  vld1.16 d1[0], [r1], r5                   @ data_in[i + factor * 4 - j]
  vld1.16 d1[1], [r1], r5                   @ data_in[i + factor * 5 - j]
  vld1.16 d1[2], [r1], r5                   @ data_in[i + factor * 6 - j]
  vld1.16 d1[3], [r1], r10                  @ data_in[i + factor * 7 - j]; r10 rewinds
                                            @ r1 to one sample past its start value.
  subs r12, #1                              @ Flags feed the bge below.
  vmlal.s16 q2, d0, d2
  vmlal.s16 q3, d1, d3
  bge LOOP_COEFF_LENGTH_GENERAL

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #3                       @ r11 -> &data_in[i + factor * 8]
                                            @ (r5 holds factor * 2 bytes here).
  add r9, r5, asl #2                        @ i += factor * 8.
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_GENERAL
  asr r5, #1                                @ Restore r5 to the value of factor.
  b POST_LOOP_ENDPOS

@ The case for factor == 2.
LOOP_ENDPOS_FACTOR2:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Accumulators preset to the rounding term.
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ r7: &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR2:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j], replicated to all lanes
  vld2.16 {d0, d1}, [r1]!                   @ De-interleave 8 samples: d0 = even ones.
  vld2.16 {d2, d3}, [r1], r10               @ Next 8 samples: d2 = even ones; r10
                                            @ rewinds r1 to one sample past its start.
  subs r12, #1                              @ Flags feed the bge below.
  vmlal.s16 q2, d0, d16                     @ Only d0/d2 are used; d1/d3 are discarded.
  vmlal.s16 q3, d2, d17
  bge LOOP_COEFF_LENGTH_FACTOR2

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  vst1.16 {d0, d1}, [r2]!

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ i += factor * 8.
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  blt LOOP_ENDPOS_FACTOR2
  b POST_LOOP_ENDPOS

@ The case for factor == 4.
LOOP_ENDPOS_FACTOR4:
  @ Initializations.
  vmov.i32 q2, #2048                        @ Accumulators preset to the rounding term.
  vmov.i32 q3, #2048
  sub r7, r8, #2                            @ r7: &coefficients[coefficients_length - 1]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r1, r11, r12, asl #1                  @ &data_in[i - j]

LOOP_COEFF_LENGTH_FACTOR4:
  vld1.16 {d16[], d17[]}, [r7], r4          @ coefficients[j], replicated to all lanes
  vld4.16 {d0, d1, d2, d3}, [r1]!           @ De-interleave 16 samples: d0 = every 4th.
  vld4.16 {d18, d19, d20, d21}, [r1], r10   @ Next 16 samples: d18 = every 4th; r10
                                            @ rewinds r1 to one sample past its start.
  subs r12, #1                              @ Flags feed the bge below.
  vmlal.s16 q2, d0, d16                     @ Only d0/d18 are used.
  vmlal.s16 q3, d18, d17
  bge LOOP_COEFF_LENGTH_FACTOR4

  add r11, r5, asl #4                       @ r11 -> &data_in[i + factor * 8]
  add r9, r5, asl #3                        @ i += factor * 8.

  @ Shift, saturate, and store the result.
  vqshrn.s32 d0, q2, #12
  vqshrn.s32 d1, q3, #12
  cmp r9, r3                                @ i < endpos - factor * 7 ?
  vst1.16 {d0, d1}, [r2]!                   @ NEON store leaves the cmp flags intact.

  blt LOOP_ENDPOS_FACTOR4
  @ Falls through to POST_LOOP_ENDPOS with r5 still equal to factor.

@
@ Second part, do the rest iterations (if any).
@

POST_LOOP_ENDPOS:
  add r3, r5, asl #3
  sub r3, r5                                @ Restore r3 to endpos.
  cmp r9, r3
  movge r0, #0                              @ Nothing left: return 0.
  bge END

LOOP2_ENDPOS:
  @ Initializations.
  mov r7, r8                                @ r7: &coefficients[coefficients_length]
  sub r12, r0, #1                           @ coefficients_length - 1
  sub r6, r11, r12, asl #1                  @ &data_in[i - j]

  mov r1, #2048                             @ Accumulator preset to the rounding term.

LOOP2_COEFF_LENGTH:
  ldrsh r4, [r7, #-2]!                      @ coefficients[j] (pre-decrement)
  ldrsh r10, [r6], #2                       @ data_in[i - j]
  smlabb r1, r4, r10, r1
  subs r12, #1
  bge LOOP2_COEFF_LENGTH

  @ Shift, saturate, and store the result.
  ssat r1, #16, r1, asr #12
  strh r1, [r2], #2

  add r11, r5, asl #1                       @ r11 -> &data_in[i + factor]
  add r9, r5                                @ i += factor.
  cmp r9, r3                                @ i < endpos?
  blt LOOP2_ENDPOS

  mov r0, #0                                @ Success.

END:
  pop {r4-r11}
  bx lr
|