/*
 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
 *
 * SPDX-License-Identifier: GPL-2.0+
 */

/*
 * This is optimized primarily for the ARC700.
 * It would be possible to speed up the loops by one cycle per word
 * (respectively one cycle per byte) by forcing double source 1 alignment,
 * unrolling by a factor of two, and speculatively loading the second
 * word / byte of source 1; however, that would increase the overhead
 * for loop setup / finish, and strcmp might often terminate early.
 */

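/*
 * int strcmp(const char *s1, const char *s2)
 * Arguments arrive in %r0 / %r1 per the ARC calling convention; the
 * result in %r0 is negative, zero, or positive, as usual for strcmp.
 * If both sources are word aligned, compare a word at a time,
 * otherwise fall back to a byte-at-a-time loop.
 */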
	.global	strcmp
	.align	4
strcmp:
	or	%r2, %r0, %r1
	bmsk_s	%r2, %r2, 1	/* low two bits of either source pointer set? */
	brne	%r2, 0, .Lcharloop	/* unaligned: use the byte loop */
	mov_s	%r12, 0x01010101
	ror	%r5, %r12	/* %r5 = 0x80808080 */
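
/*
 * %r12 = 0x01010101 and %r5 = 0x01010101 ror 1 = 0x80808080 are the
 * two constants of the classic zero-byte test, which in C reads
 * (sketch for illustration, not part of the original source):
 *
 *	uint32_t has_zero_byte(uint32_t x)
 *	{
 *		return (x - 0x01010101) & ~x & 0x80808080;
 *	}
 *
 * The result is nonzero iff x contains a 0x00 byte; the individual
 * flag bits can over-report, see the big-endian notes further down.
 */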
.Lwordloop:
	ld.ab	%r2, [%r0, 4]
	ld.ab	%r3, [%r1, 4]
	nop_s
	sub	%r4, %r2, %r12
	bic	%r4, %r4, %r2
	and	%r4, %r4, %r5	/* nonzero iff %r2 contains a zero byte */
	brne	%r4, 0, .Lfound0
	breq	%r2, %r3, .Lwordloop
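
/*
 * The words differ but contain no zero byte in source 1.  On little
 * endian the first differing string byte is the least significant
 * differing byte of the words, so everything else is masked off below
 * before the compare; roughly, in C (sketch for illustration):
 *
 *	uint32_t diff = r2 ^ r3;
 *	diff &= -diff;			// least significant difference bit
 *	uint32_t mask = 0x80808080 ^ (0x80808080 - diff);
 *	r2 &= mask;			// keep only the deciding byte
 *	r3 &= mask;
 *
 * On big endian the plain word compare already ranks the words by
 * their first differing byte, so no masking is needed.
 */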
#ifdef __LITTLE_ENDIAN__
	xor	%r0, %r2, %r3	/* mask for difference */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
#endif /* __LITTLE_ENDIAN__ */
	cmp_s	%r2, %r3
	mov_s	%r0, 1	/* mov_s leaves the flags intact */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: negative result if r2 < r3 */

	.balign	4
#ifdef __LITTLE_ENDIAN__
.Lfound0:
	xor	%r0, %r2, %r3	/* mask for difference */
	or	%r0, %r0, %r4	/* or in zero indicator */
	sub_s	%r1, %r0, 1
	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
	sub	%r1, %r5, %r0
	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
	and_s	%r2, %r2, %r0
	and_s	%r3, %r3, %r0
	sub.f	%r0, %r2, %r3	/* zero if the strings compare equal */
	mov.hi	%r0, 1	/* +1 if r2 > r3 (unsigned) */
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31	/* delay slot: negative if r2 < r3 */
#else /* __BIG_ENDIAN__ */
/*
 * The zero-detection above can mis-detect 0x01 bytes as zeroes
 * because of carry propagation from a less significant zero byte.
 * We can compensate for this by checking that bit 0 is zero.
 * This compensation is not necessary in the step where we
 * get a low estimate for r2, because in any affected bytes
 * we already have 0x00 or 0x01, which will remain unchanged
 * when bit 7 is cleared.
 */
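
/*
 * Worked example of the mis-detection (for illustration): with byte 0
 * zero and byte 1 equal to 0x01, the borrow out of byte 0 also flags
 * byte 1:
 *
 *	uint32_t v = 0x41410100;
 *	uint32_t m = (v - 0x01010101) & ~v & 0x80808080;
 *	// m == 0x00008080: byte 1 (0x01) is flagged along with byte 0
 */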
	.balign	4
.Lfound0:
	lsr	%r0, %r4, 8
	lsr_s	%r1, %r2
	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
	bic_s	%r3, %r3, %r0
	rlc	%r0, 0	/* r0 := r2 > r3 ? 1 : 0 */
	cmp_s	%r2, %r3
	j_s.d	[%blink]
	bset.lo	%r0, %r0, 31
#endif /* __LITTLE_ENDIAN__ */
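
/*
 * Byte-at-a-time fallback for unaligned sources; in C terms the loop
 * below is roughly (sketch for illustration):
 *
 *	unsigned char c1, c2;
 *	do {
 *		c1 = *s1++;
 *		c2 = *s2++;
 *	} while (c1 != 0 && c1 == c2);
 *	return c1 - c2;
 */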
	.balign	4
.Lcharloop:
	ldb.ab	%r2, [%r0, 1]
	ldb.ab	%r3, [%r1, 1]
	nop_s
	breq	%r2, 0, .Lcmpend
	breq	%r2, %r3, .Lcharloop
.Lcmpend:
	j_s.d	[%blink]
	sub	%r0, %r2, %r3	/* delay slot: result is byte difference */