/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

/*
 * If dst and src are 4 byte aligned, copy 8 bytes at a time.
 * If the src is 4, but not 8 byte aligned, we first read 4 bytes to get
 * it 8 byte aligned.  Thus, we can do a little read-ahead, without
 * dereferencing a cache line that we should not touch.
 * Note that short and long instructions have been scheduled to avoid
 * branch stalls.
 * The beq_s to r3z could be made unaligned & long to avoid a stall
 * there, but it is not likely to be taken often, and it would also be
 * likely to cost an unaligned mispredict at the next call.
 */
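/*
 * For reference, the word-at-a-time NUL scan below relies on the classic
 * zero-byte test; in C it would read roughly as follows (an illustrative
 * sketch only, with made-up names):
 *
 *     unsigned mask1 = 0x01010101;   held in r8
 *     unsigned mask2 = 0x80808080;   r12, i.e. r8 rotated right by one bit
 *     int has_nul = ((w - mask1) & ~w & mask2) != 0;
 *
 * Subtracting mask1 borrows into bit 7 of every byte of w that was zero;
 * ANDing with ~w drops bytes whose top bit was already set, and mask2
 * keeps only the resulting marker bits, so a nonzero result means w
 * contains a NUL byte somewhere.
 */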
.global strcpy
.align 4
strcpy:
	/* r0 = dst, r1 = src; r0 is left untouched as the return value */
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1		/* nonzero if dst or src is not 4-byte aligned */
	brne.d	%r2, 0, charloop	/* unaligned: copy one byte at a time */
	mov_s	%r10, %r0		/* (delay slot) r10 = running dst pointer */
	ld_s	%r3, [%r1, 0]		/* first source word */
	mov	%r8, 0x01010101
	bbit0.d	%r1, 2, loop_start	/* src already 8-byte aligned: enter the loop */
	ror	%r12, %r8		/* (delay slot) r12 = 0x80808080 */
	sub	%r2, %r3, %r8
	bic_s	%r2, %r2, %r3
	tst_s	%r2, %r12		/* NUL byte in the first word? */
	bne	r3z
	mov_s	%r4, %r3

	.balign 4
loop:
	ld.a	%r3, [%r1, 4]		/* read ahead the next word */
	st.ab	%r4, [%r10, 4]		/* store the previous one */
loop_start:
	ld.a	%r4, [%r1, 4]
	sub	%r2, %r3, %r8
	bic_s	%r2, %r2, %r3
	tst_s	%r2, %r12		/* NUL byte in r3? */
	bne_s	r3z
	st.ab	%r3, [%r10, 4]
	sub	%r2, %r4, %r8
	bic	%r2, %r2, %r4
	tst	%r2, %r12		/* NUL byte in r4? */
	beq	loop
	mov_s	%r3, %r4		/* word holding the NUL goes to r3 for the tail */
#ifdef __LITTLE_ENDIAN__
r3z:	bmsk.f	%r1, %r3, 7		/* first byte in memory is the lowest byte */
	lsr_s	%r3, %r3, 8
#else /* __BIG_ENDIAN__ */
r3z:	lsr.f	%r1, %r3, 24		/* first byte in memory is the highest byte */
	asl_s	%r3, %r3, 8
#endif /* _ENDIAN__ */
	bne.d	r3z
	stb.ab	%r1, [%r10, 1]		/* (delay slot) store byte, including the final NUL */
	j_s	[%blink]

	.balign 4
charloop:
	ldb.ab	%r3, [%r1, 1]		/* byte-by-byte fallback for unaligned pointers */
	brne.d	%r3, 0, charloop
	stb.ab	%r3, [%r10, 1]		/* (delay slot) the terminating NUL is copied too */
	j	[%blink]