Add x86_64-optimized __memcmp16 implementation;
fix tabs in the 32-bit x86 implementation.
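
For reference, the semantics being optimized are those of an element-wise
memcmp over uint16_t data, sketched below in C. This is an illustration
only (memcmp16_reference is not a name introduced by this patch); the
assembly below is the authoritative implementation.

    #include <stddef.h>
    #include <stdint.h>

    /* Compare count uint16_t elements; return the difference of the
     * first mismatching pair, or 0 if the ranges are equal. */
    int32_t memcmp16_reference(const uint16_t* s0, const uint16_t* s1,
                               size_t count) {
      for (size_t i = 0; i != count; ++i) {
        if (s0[i] != s1[i]) {
          return (int32_t)s0[i] - (int32_t)s1[i];
        }
      }
      return 0;
    }
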
Change-Id: I7bbfb344074aed66511c1a845998dc38798116ea
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 17662fa..a315a37 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -21,1018 +21,1018 @@
/* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
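+/*
+ * Overview: the element count is doubled into a byte count up front.
+ * Totals under 48 bytes take an unrolled scalar path that compares
+ * uint16_t elements through end-relative addressing.  Larger totals
+ * compare the first 16 bytes unaligned, align s0 down to 16 bytes,
+ * and dispatch on the residual offset of s1 (even offsets only, since
+ * uint16_t inputs are assumed 2-byte aligned) to one of the L(shr_N)
+ * blocks, which realign with SSSE3 palignr and test 32 bytes per
+ * round via pcmpeqb/pmovmskb.
+ */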
#ifndef L
-# define L(label) .L##label
+# define L(label) .L##label
#endif
-#define CFI_PUSH(REG) \
- CFI_ADJUST_CFA_OFFSET(4); \
- CFI_REL_OFFSET(REG, 0)
+#define CFI_PUSH(REG) \
+ CFI_ADJUST_CFA_OFFSET(4); \
+ CFI_REL_OFFSET(REG, 0)
-#define CFI_POP(REG) \
- CFI_ADJUST_CFA_OFFSET(-4); \
- CFI_RESTORE(REG)
+#define CFI_POP(REG) \
+ CFI_ADJUST_CFA_OFFSET(-4); \
+ CFI_RESTORE(REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1+4
+#define LEN BLK2+4
+#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
DEFINE_FUNCTION MEMCMP
- movl LEN(%esp), %ecx
+ movl LEN(%esp), %ecx
- shl $1, %ecx
- jz L(zero)
+ shl $1, %ecx
+ jz L(zero)
- movl BLK1(%esp), %eax
- cmp $48, %ecx
- movl BLK2(%esp), %edx
- jae L(48bytesormore)
+ movl BLK1(%esp), %eax
+ cmp $48, %ecx
+ movl BLK2(%esp), %edx
+ jae L(48bytesormore)
- PUSH (%ebx)
- add %ecx, %edx
- add %ecx, %eax
- jmp L(less48bytes)
+ PUSH (%ebx)
+ add %ecx, %edx
+ add %ecx, %eax
+ jmp L(less48bytes)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
- .p2align 4
+ .p2align 4
L(zero):
- xor %eax, %eax
- ret
+ xor %eax, %eax
+ ret
- .p2align 4
+ .p2align 4
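+/* At least 48 bytes: compare the first 16 unaligned, then align
+ * s0 (%edi) down to 16 bytes and dispatch on the residual offset
+ * of s1 (%esi). */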
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
- CFI_REMEMBER_STATE
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
- movl %eax, %edi
- movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%edi), %edi
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
+ CFI_REMEMBER_STATE
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
+ movl %eax, %edi
+ movl %edx, %esi
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%edi), %edi
- sub $0xffff, %edx
- lea 16(%esi), %esi
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %edx, %edi
- sub %edx, %esi
- add %edx, %ecx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %edx, %esi
+ sub $0xffff, %edx
+ lea 16(%esi), %esi
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %edx, %edi
+ sub %edx, %esi
+ add %edx, %ecx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %edx, %esi
- cmp $0, %edx
- je L(shr_0)
- cmp $2, %edx
- je L(shr_2)
- cmp $4, %edx
- je L(shr_4)
- cmp $6, %edx
- je L(shr_6)
- cmp $8, %edx
- je L(shr_8)
- cmp $10, %edx
- je L(shr_10)
- cmp $12, %edx
- je L(shr_12)
- jmp L(shr_14)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $6, %edx
+ je L(shr_6)
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $12, %edx
+ je L(shr_12)
+ jmp L(shr_14)
- .p2align 4
+ .p2align 4
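+/* Both pointers now 16-byte aligned, total under 80 bytes: one
+ * aligned 32-byte compare round, then the scalar tail finishes. */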
L(shr_0):
- cmp $80, %ecx
- jae L(shr_0_gobble)
- lea -48(%ecx), %ecx
- xor %eax, %eax
- movaps (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
- movaps 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- add $32, %edi
- add $32, %esi
- sub $0xffff, %edx
- jnz L(exit)
+ cmp $80, %ecx
+ jae L(shr_0_gobble)
+ lea -48(%ecx), %ecx
+ xor %eax, %eax
+ movaps (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
+ movaps 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ add $32, %edi
+ add $32, %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
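+/* Aligned loop for totals of 80+ bytes.  The sbb folds the borrow
+ * from the count update into the mask test, so the loop also exits
+ * when the count runs out; the compensation after the loop undoes
+ * that borrow.  Every L(shr_N_gobble) loop below uses the same trick. */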
L(shr_0_gobble):
- lea -48(%ecx), %ecx
- movdqa (%esi), %xmm0
- xor %eax, %eax
- pcmpeqb (%edi), %xmm0
- sub $32, %ecx
- movdqa 16(%esi), %xmm2
- pcmpeqb 16(%edi), %xmm2
+ lea -48(%ecx), %ecx
+ movdqa (%esi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm2
+ pcmpeqb 16(%edi), %xmm2
L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %ecx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%esi), %xmm0
- movdqa 48(%esi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%edi), %xmm0
- pcmpeqb 48(%edi), %xmm2
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- jz L(shr_0_gobble_loop)
+ pand %xmm0, %xmm2
+ sub $32, %ecx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%esi), %xmm0
+ movdqa 48(%esi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%edi), %xmm0
+ pcmpeqb 48(%edi), %xmm2
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ jz L(shr_0_gobble_loop)
- pand %xmm0, %xmm2
- cmp $0, %ecx
- jge L(shr_0_gobble_loop_next)
- inc %edx
- add $32, %ecx
+ pand %xmm0, %xmm2
+ cmp $0, %ecx
+ jge L(shr_0_gobble_loop_next)
+ inc %edx
+ add $32, %ecx
L(shr_0_gobble_loop_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea (%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
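+/* s1 sits 2 bytes past 16-byte alignment: load aligned and stitch
+ * the vectors back together with palignr $2.  L(shr_4) through
+ * L(shr_14) repeat this pattern with the matching shift amount. */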
L(shr_2):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_2_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $2,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_2_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $2,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $2, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $2,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $2, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $2,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $2,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $2, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $2, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 2(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_4_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $4,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_4_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $4,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $4, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $4,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $4, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $4,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $4,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $4, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $4, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 4(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_6_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $6,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_6_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $6,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $6, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $6,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $6, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $6,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $6,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $6, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $6, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 6(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_8_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8,(%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $8,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_8_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $8,(%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $8, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $8,16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $8, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $8,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $8,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $8, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $8, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 8(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_10_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $10,%xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_10_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $10, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $10, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $10, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $10, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $10,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $10,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $10, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $10, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 10(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_12_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_12_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $12, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $12, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $12, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $12, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $12,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $12,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $12, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $12, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 12(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14):
- cmp $80, %ecx
- lea -48(%ecx), %ecx
- mov %edx, %eax
- jae L(shr_14_gobble)
+ cmp $80, %ecx
+ lea -48(%ecx), %ecx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
- movdqa 16(%esi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%esi), %xmm1
- pcmpeqb (%edi), %xmm1
+ movdqa 16(%esi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%esi), %xmm1
+ pcmpeqb (%edi), %xmm1
- movdqa 32(%esi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%edi), %xmm3
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
L(shr_14_gobble):
- sub $32, %ecx
- movdqa 16(%esi), %xmm0
- palignr $14, (%esi), %xmm0
- pcmpeqb (%edi), %xmm0
+ sub $32, %ecx
+ movdqa 16(%esi), %xmm0
+ palignr $14, (%esi), %xmm0
+ pcmpeqb (%edi), %xmm0
- movdqa 32(%esi), %xmm3
- palignr $14, 16(%esi), %xmm3
- pcmpeqb 16(%edi), %xmm3
+ movdqa 32(%esi), %xmm3
+ palignr $14, 16(%esi), %xmm3
+ pcmpeqb 16(%edi), %xmm3
L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %ecx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
+ pand %xmm0, %xmm3
+ sub $32, %ecx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
- movdqa 64(%esi), %xmm3
- palignr $14,48(%esi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%esi), %xmm0
- palignr $14,32(%esi), %xmm0
- pcmpeqb 32(%edi), %xmm0
- lea 32(%esi), %esi
- pcmpeqb 48(%edi), %xmm3
+ movdqa 64(%esi), %xmm3
+ palignr $14, 48(%esi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%esi), %xmm0
+ palignr $14, 32(%esi), %xmm0
+ pcmpeqb 32(%edi), %xmm0
+ lea 32(%esi), %esi
+ pcmpeqb 48(%edi), %xmm3
- lea 32(%edi), %edi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
+ lea 32(%edi), %edi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
- cmp $0, %ecx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %ecx
+ cmp $0, %ecx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %ecx
L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
+ test %edx, %edx
+ jnz L(exit)
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%edi), %edi
- lea 32(%esi), %esi
- sub $0xffff, %edx
- jnz L(exit)
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%edi), %edi
+ lea 32(%esi), %esi
+ sub $0xffff, %edx
+ jnz L(exit)
- lea (%ecx, %edi,1), %eax
- lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
- jmp L(less48bytes)
+ lea (%ecx, %edi,1), %eax
+ lea 14(%ecx, %esi,1), %edx
+ POP (%edi)
+ POP (%esi)
+ jmp L(less48bytes)
- CFI_RESTORE_STATE
- CFI_REMEMBER_STATE
- .p2align 4
+ CFI_RESTORE_STATE
+ CFI_REMEMBER_STATE
+ .p2align 4
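+/* A 32-byte block mismatched.  xmm1 holds the first half's compare
+ * mask: if that half was all-equal, the mismatch is in the second
+ * 16 bytes and %edx already reflects it; otherwise back up 16 bytes
+ * and use xmm1's mask instead.  %eax re-applies the s1 alignment
+ * offset saved by the L(shr_N) paths (zero on the aligned path). */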
L(exit):
- pmovmskb %xmm1, %ebx
- sub $0xffff, %ebx
- jz L(first16bytes)
- lea -16(%esi), %esi
- lea -16(%edi), %edi
- mov %ebx, %edx
+ pmovmskb %xmm1, %ebx
+ sub $0xffff, %ebx
+ jz L(first16bytes)
+ lea -16(%esi), %esi
+ lea -16(%edi), %edi
+ mov %ebx, %edx
L(first16bytes):
- add %eax, %esi
+ add %eax, %esi
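+/* %edx is (compare mask - 0xffff); its lowest set bit marks the first
+ * mismatching byte of the 16-byte chunk ending at %edi/%esi.  Narrow
+ * it to a single uint16_t and return the difference. */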
L(less16bytes):
- test %dl, %dl
- jz L(next_four_words)
- test $15, %dl
- jz L(second_two_words)
- test $3, %dl
- jz L(second_word)
- movzwl -16(%edi), %eax
- movzwl -16(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test %dl, %dl
+ jz L(next_four_words)
+ test $15, %dl
+ jz L(second_two_words)
+ test $3, %dl
+ jz L(second_word)
+ movzwl -16(%edi), %eax
+ movzwl -16(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_word):
- movzwl -14(%edi), %eax
- movzwl -14(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -14(%edi), %eax
+ movzwl -14(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(second_two_words):
- test $63, %dl
- jz L(fourth_word)
- movzwl -12(%edi), %eax
- movzwl -12(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $63, %dl
+ jz L(fourth_word)
+ movzwl -12(%edi), %eax
+ movzwl -12(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_word):
- movzwl -10(%edi), %eax
- movzwl -10(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -10(%edi), %eax
+ movzwl -10(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(next_four_words):
- test $15, %dh
- jz L(fourth_two_words)
- test $3, %dh
- jz L(sixth_word)
- movzwl -8(%edi), %eax
- movzwl -8(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $15, %dh
+ jz L(fourth_two_words)
+ test $3, %dh
+ jz L(sixth_word)
+ movzwl -8(%edi), %eax
+ movzwl -8(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(sixth_word):
- movzwl -6(%edi), %eax
- movzwl -6(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -6(%edi), %eax
+ movzwl -6(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(fourth_two_words):
- test $63, %dh
- jz L(eighth_word)
- movzwl -4(%edi), %eax
- movzwl -4(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ test $63, %dh
+ jz L(eighth_word)
+ movzwl -4(%edi), %eax
+ movzwl -4(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- .p2align 4
+ .p2align 4
L(eighth_word):
- movzwl -2(%edi), %eax
- movzwl -2(%esi), %ebx
- subl %ebx, %eax
- RETURN
+ movzwl -2(%edi), %eax
+ movzwl -2(%esi), %ebx
+ subl %ebx, %eax
+ RETURN
- CFI_PUSH (%ebx)
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $12, %ecx
- je L(12bytes)
- jmp L(14bytes)
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ jmp L(14bytes)
- .p2align 4
+ .p2align 4
L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $20, %ecx
- je L(20bytes)
- jmp L(22bytes)
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ jmp L(22bytes)
- .p2align 4
+ .p2align 4
L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $28, %ecx
- je L(28bytes)
- jmp L(30bytes)
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ jmp L(30bytes)
- .p2align 4
+ .p2align 4
L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $36, %ecx
- je L(36bytes)
- jmp L(38bytes)
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ jmp L(38bytes)
- .p2align 4
+ .p2align 4
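+/* Scalar path: %eax and %edx point one past the end of each buffer.
+ * The ladder above dispatches on the remaining even byte count, and
+ * each L(Nbytes) rung compares the uint16_t at offset -N, falling
+ * through to the next rung. */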
L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $4, %ecx
- je L(4bytes)
- jmp L(6bytes)
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ jmp L(6bytes)
- .p2align 4
+ .p2align 4
L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $44, %ecx
- je L(44bytes)
- jmp L(46bytes)
+ cmp $40, %ecx
+ je L(40bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ jmp L(46bytes)
- .p2align 4
+ .p2align 4
L(46bytes):
- movzwl -46(%eax), %ecx
- movzwl -46(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -46(%eax), %ecx
+ movzwl -46(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(44bytes):
- movzwl -44(%eax), %ecx
- movzwl -44(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -44(%eax), %ecx
+ movzwl -44(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(42bytes):
- movzwl -42(%eax), %ecx
- movzwl -42(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -42(%eax), %ecx
+ movzwl -42(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(40bytes):
- movzwl -40(%eax), %ecx
- movzwl -40(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -40(%eax), %ecx
+ movzwl -40(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(38bytes):
- movzwl -38(%eax), %ecx
- movzwl -38(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -38(%eax), %ecx
+ movzwl -38(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(36bytes):
- movzwl -36(%eax), %ecx
- movzwl -36(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -36(%eax), %ecx
+ movzwl -36(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(34bytes):
- movzwl -34(%eax), %ecx
- movzwl -34(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -34(%eax), %ecx
+ movzwl -34(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(32bytes):
- movzwl -32(%eax), %ecx
- movzwl -32(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -32(%eax), %ecx
+ movzwl -32(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(30bytes):
- movzwl -30(%eax), %ecx
- movzwl -30(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -30(%eax), %ecx
+ movzwl -30(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(28bytes):
- movzwl -28(%eax), %ecx
- movzwl -28(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -28(%eax), %ecx
+ movzwl -28(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(26bytes):
- movzwl -26(%eax), %ecx
- movzwl -26(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -26(%eax), %ecx
+ movzwl -26(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(24bytes):
- movzwl -24(%eax), %ecx
- movzwl -24(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -24(%eax), %ecx
+ movzwl -24(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(22bytes):
- movzwl -22(%eax), %ecx
- movzwl -22(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -22(%eax), %ecx
+ movzwl -22(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(20bytes):
- movzwl -20(%eax), %ecx
- movzwl -20(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -20(%eax), %ecx
+ movzwl -20(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(18bytes):
- movzwl -18(%eax), %ecx
- movzwl -18(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -18(%eax), %ecx
+ movzwl -18(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(16bytes):
- movzwl -16(%eax), %ecx
- movzwl -16(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -16(%eax), %ecx
+ movzwl -16(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(14bytes):
- movzwl -14(%eax), %ecx
- movzwl -14(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -14(%eax), %ecx
+ movzwl -14(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(12bytes):
- movzwl -12(%eax), %ecx
- movzwl -12(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -12(%eax), %ecx
+ movzwl -12(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(10bytes):
- movzwl -10(%eax), %ecx
- movzwl -10(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -10(%eax), %ecx
+ movzwl -10(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(8bytes):
- movzwl -8(%eax), %ecx
- movzwl -8(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -8(%eax), %ecx
+ movzwl -8(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(6bytes):
- movzwl -6(%eax), %ecx
- movzwl -6(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -6(%eax), %ecx
+ movzwl -6(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(4bytes):
- movzwl -4(%eax), %ecx
- movzwl -4(%edx), %ebx
- subl %ebx, %ecx
- jne L(memcmp16_exit)
+ movzwl -4(%eax), %ecx
+ movzwl -4(%edx), %ebx
+ subl %ebx, %ecx
+ jne L(memcmp16_exit)
L(2bytes):
- movzwl -2(%eax), %eax
- movzwl -2(%edx), %ebx
- subl %ebx, %eax
- POP (%ebx)
- ret
- CFI_PUSH (%ebx)
+ movzwl -2(%eax), %eax
+ movzwl -2(%edx), %ebx
+ subl %ebx, %eax
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
- .p2align 4
+ .p2align 4
L(memcmp16_exit):
- POP (%ebx)
- mov %ecx, %eax
- ret
+ POP (%ebx)
+ mov %ecx, %eax
+ ret
END_FUNCTION MEMCMP