Add x86_64 optimized __memcmp16 implementation

Also replace tabs with spaces in the 32-bit x86 implementation.

Change-Id: I7bbfb344074aed66511c1a845998dc38798116ea
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S
index 17662fa..a315a37 100644
--- a/runtime/arch/x86/memcmp16_x86.S
+++ b/runtime/arch/x86/memcmp16_x86.S
@@ -21,1018 +21,1018 @@
 /* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */
 
 #ifndef L
-# define L(label)	.L##label
+# define L(label)    .L##label
 #endif
 
-#define CFI_PUSH(REG)	\
-	CFI_ADJUST_CFA_OFFSET(4);	\
-	CFI_REL_OFFSET(REG, 0)
+#define CFI_PUSH(REG)    \
+    CFI_ADJUST_CFA_OFFSET(4);    \
+    CFI_REL_OFFSET(REG, 0)
 
-#define CFI_POP(REG)	\
-	CFI_ADJUST_CFA_OFFSET(-4);	\
-	CFI_RESTORE(REG)
+#define CFI_POP(REG)    \
+    CFI_ADJUST_CFA_OFFSET(-4);    \
+    CFI_RESTORE(REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+#define PUSH(REG)    pushl REG; CFI_PUSH (REG)
+#define POP(REG)    popl REG; CFI_POP (REG)
 
-#define PARMS		4
-#define BLK1		PARMS
-#define BLK2		BLK1+4
-#define LEN		BLK2+4
-#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN		RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
+#define PARMS        4
+#define BLK1        PARMS
+#define BLK2        BLK1+4
+#define LEN        BLK2+4
+#define RETURN_END    POP (%edi); POP (%esi); POP (%ebx); ret
+#define RETURN        RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE
 
 DEFINE_FUNCTION MEMCMP
-	movl	LEN(%esp), %ecx
+    movl       LEN(%esp), %ecx
 
-	shl	$1, %ecx
-	jz	L(zero)
+    shl        $1, %ecx
+    jz         L(zero)
 
-	movl	BLK1(%esp), %eax
-	cmp	$48, %ecx
-	movl	BLK2(%esp), %edx
-	jae	L(48bytesormore)
+    movl       BLK1(%esp), %eax
+    cmp        $48, %ecx
+    movl       BLK2(%esp), %edx
+    jae        L(48bytesormore)
 
-	PUSH	(%ebx)
-	add	%ecx, %edx
-	add	%ecx, %eax
-	jmp	L(less48bytes)
+    PUSH       (%ebx)
+    add        %ecx, %edx
+    add        %ecx, %eax
+    jmp        L(less48bytes)
 
-	CFI_POP	(%ebx)
+    CFI_POP    (%ebx)
 
-	.p2align 4
+    .p2align 4
 L(zero):
-	xor	%eax, %eax
-	ret
+    xor        %eax, %eax
+    ret
 
-	.p2align 4
+    .p2align 4
 L(48bytesormore):
-	PUSH	(%ebx)
-	PUSH	(%esi)
-	PUSH	(%edi)
-	CFI_REMEMBER_STATE
-	movdqu	(%eax), %xmm3
-	movdqu	(%edx), %xmm0
-	movl	%eax, %edi
-	movl	%edx, %esi
-	pcmpeqb	%xmm0, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	16(%edi), %edi
+    PUSH       (%ebx)
+    PUSH       (%esi)
+    PUSH       (%edi)
+    CFI_REMEMBER_STATE
+    movdqu     (%eax), %xmm3
+    movdqu     (%edx), %xmm0
+    movl       %eax, %edi
+    movl       %edx, %esi
+    pcmpeqb    %xmm0, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        16(%edi), %edi
 
-	sub	$0xffff, %edx
-	lea	16(%esi), %esi
-	jnz	L(less16bytes)
-	mov	%edi, %edx
-	and	$0xf, %edx
-	xor	%edx, %edi
-	sub	%edx, %esi
-	add	%edx, %ecx
-	mov	%esi, %edx
-	and	$0xf, %edx
-	jz	L(shr_0)
-	xor	%edx, %esi
+    sub        $0xffff, %edx
+    lea        16(%esi), %esi
+    jnz        L(less16bytes)
+    mov        %edi, %edx
+    and        $0xf, %edx
+    xor        %edx, %edi
+    sub        %edx, %esi
+    add        %edx, %ecx
+    mov        %esi, %edx
+    and        $0xf, %edx
+    jz         L(shr_0)
+    xor        %edx, %esi
 
-	cmp	$0, %edx
-	je	L(shr_0)
-	cmp	$2, %edx
-	je	L(shr_2)
-	cmp	$4, %edx
-	je	L(shr_4)
-	cmp	$6, %edx
-	je	L(shr_6)
-	cmp	$8, %edx
-	je	L(shr_8)
-	cmp	$10, %edx
-	je	L(shr_10)
-	cmp	$12, %edx
-	je	L(shr_12)
-	jmp	L(shr_14)
+    cmp        $0, %edx
+    je         L(shr_0)
+    cmp        $2, %edx
+    je         L(shr_2)
+    cmp        $4, %edx
+    je         L(shr_4)
+    cmp        $6, %edx
+    je         L(shr_6)
+    cmp        $8, %edx
+    je         L(shr_8)
+    cmp        $10, %edx
+    je         L(shr_10)
+    cmp        $12, %edx
+    je         L(shr_12)
+    jmp        L(shr_14)
 
-	.p2align 4
+    .p2align 4
 L(shr_0):
-	cmp	$80, %ecx
-	jae	L(shr_0_gobble)
-	lea	-48(%ecx), %ecx
-	xor	%eax, %eax
-	movaps	(%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
-	movaps	16(%esi), %xmm2
-	pcmpeqb	16(%edi), %xmm2
-	pand	%xmm1, %xmm2
-	pmovmskb %xmm2, %edx
-	add	$32, %edi
-	add	$32, %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    cmp        $80, %ecx
+    jae        L(shr_0_gobble)
+    lea        -48(%ecx), %ecx
+    xor        %eax, %eax
+    movaps     (%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
+    movaps     16(%esi), %xmm2
+    pcmpeqb    16(%edi), %xmm2
+    pand       %xmm1, %xmm2
+    pmovmskb   %xmm2, %edx
+    add        $32, %edi
+    add        $32, %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        (%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_0_gobble):
-	lea	-48(%ecx), %ecx
-	movdqa	(%esi), %xmm0
-	xor	%eax, %eax
-	pcmpeqb	(%edi), %xmm0
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm2
-	pcmpeqb	16(%edi), %xmm2
+    lea        -48(%ecx), %ecx
+    movdqa     (%esi), %xmm0
+    xor        %eax, %eax
+    pcmpeqb    (%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm2
+    pcmpeqb    16(%edi), %xmm2
 L(shr_0_gobble_loop):
-	pand	%xmm0, %xmm2
-	sub	$32, %ecx
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	movdqa	32(%esi), %xmm0
-	movdqa	48(%esi), %xmm2
-	sbb	$0xffff, %edx
-	pcmpeqb	32(%edi), %xmm0
-	pcmpeqb	48(%edi), %xmm2
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	jz	L(shr_0_gobble_loop)
+    pand       %xmm0, %xmm2
+    sub        $32, %ecx
+    pmovmskb   %xmm2, %edx
+    movdqa     %xmm0, %xmm1
+    movdqa     32(%esi), %xmm0
+    movdqa     48(%esi), %xmm2
+    sbb        $0xffff, %edx
+    pcmpeqb    32(%edi), %xmm0
+    pcmpeqb    48(%edi), %xmm2
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    jz         L(shr_0_gobble_loop)
 
-	pand	%xmm0, %xmm2
-	cmp	$0, %ecx
-	jge	L(shr_0_gobble_loop_next)
-	inc	%edx
-	add	$32, %ecx
+    pand       %xmm0, %xmm2
+    cmp        $0, %ecx
+    jge        L(shr_0_gobble_loop_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_0_gobble_loop_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pmovmskb   %xmm2, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        (%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_2):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_2_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_2_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$2,(%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $2,(%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$2,%xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $2,%xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	2(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        2(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_2_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$2,(%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $2,(%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$2,16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $2,16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_2_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$2,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$2,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $2,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $2,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_2_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_2_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_2_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_2_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_2_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	2(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        2(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_4):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_4_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_4_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$4,(%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $4,(%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$4,%xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $4,%xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	4(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        4(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_4_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$4,(%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $4,(%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$4,16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $4,16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_4_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$4,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$4,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $4,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $4,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_4_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_4_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_4_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_4_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_4_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	4(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        4(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_6):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_6_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_6_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$6,(%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $6,(%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$6,%xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $6,%xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	6(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        6(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_6_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$6,(%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $6,(%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$6,16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $6,16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_6_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$6,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$6,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $6,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $6,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_6_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_6_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_6_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_6_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_6_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	6(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        6(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_8):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_8_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_8_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$8,(%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $8,(%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$8,%xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $8,%xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	8(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        8(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_8_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$8,(%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $8,(%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$8,16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $8,16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_8_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$8,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$8,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $8,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $8,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_8_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_8_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_8_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_8_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_8_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	8(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        8(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_10):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_10_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_10_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$10, (%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $10, (%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$10,%xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $10,%xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	10(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        10(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_10_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$10, (%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $10, (%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$10, 16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $10, 16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_10_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$10,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$10,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $10,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $10,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_10_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_10_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_10_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_10_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_10_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	10(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        10(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_12):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_12_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_12_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$12, (%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $12, (%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$12, %xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $12, %xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	12(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        12(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_12_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$12, (%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $12, (%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$12, 16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $12, 16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_12_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$12,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$12,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $12,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $12,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_12_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_12_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_12_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_12_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_12_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	12(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        12(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_14):
-	cmp	$80, %ecx
-	lea	-48(%ecx), %ecx
-	mov	%edx, %eax
-	jae	L(shr_14_gobble)
+    cmp        $80, %ecx
+    lea        -48(%ecx), %ecx
+    mov        %edx, %eax
+    jae        L(shr_14_gobble)
 
-	movdqa	16(%esi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$14, (%esi), %xmm1
-	pcmpeqb	(%edi), %xmm1
+    movdqa     16(%esi), %xmm1
+    movdqa     %xmm1, %xmm2
+    palignr    $14, (%esi), %xmm1
+    pcmpeqb    (%edi), %xmm1
 
-	movdqa	32(%esi), %xmm3
-	palignr	$14, %xmm2, %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $14, %xmm2, %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	lea	(%ecx, %edi,1), %eax
-	lea	14(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    pand       %xmm1, %xmm3
+    pmovmskb   %xmm3, %edx
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
+    lea        (%ecx, %edi,1), %eax
+    lea        14(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(shr_14_gobble):
-	sub	$32, %ecx
-	movdqa	16(%esi), %xmm0
-	palignr	$14, (%esi), %xmm0
-	pcmpeqb	(%edi), %xmm0
+    sub        $32, %ecx
+    movdqa     16(%esi), %xmm0
+    palignr    $14, (%esi), %xmm0
+    pcmpeqb    (%edi), %xmm0
 
-	movdqa	32(%esi), %xmm3
-	palignr	$14, 16(%esi), %xmm3
-	pcmpeqb	16(%edi), %xmm3
+    movdqa     32(%esi), %xmm3
+    palignr    $14, 16(%esi), %xmm3
+    pcmpeqb    16(%edi), %xmm3
 
 L(shr_14_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %ecx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
+    pand       %xmm0, %xmm3
+    sub        $32, %ecx
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
 
-	movdqa	64(%esi), %xmm3
-	palignr	$14,48(%esi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%esi), %xmm0
-	palignr	$14,32(%esi), %xmm0
-	pcmpeqb	32(%edi), %xmm0
-	lea	32(%esi), %esi
-	pcmpeqb	48(%edi), %xmm3
+    movdqa     64(%esi), %xmm3
+    palignr    $14,48(%esi), %xmm3
+    sbb        $0xffff, %edx
+    movdqa     48(%esi), %xmm0
+    palignr    $14,32(%esi), %xmm0
+    pcmpeqb    32(%edi), %xmm0
+    lea        32(%esi), %esi
+    pcmpeqb    48(%edi), %xmm3
 
-	lea	32(%edi), %edi
-	jz	L(shr_14_gobble_loop)
-	pand	%xmm0, %xmm3
+    lea        32(%edi), %edi
+    jz         L(shr_14_gobble_loop)
+    pand       %xmm0, %xmm3
 
-	cmp	$0, %ecx
-	jge	L(shr_14_gobble_next)
-	inc	%edx
-	add	$32, %ecx
+    cmp        $0, %ecx
+    jge        L(shr_14_gobble_next)
+    inc        %edx
+    add        $32, %ecx
 L(shr_14_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
+    test       %edx, %edx
+    jnz        L(exit)
 
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%edi), %edi
-	lea	32(%esi), %esi
-	sub	$0xffff, %edx
-	jnz	L(exit)
+    pmovmskb   %xmm3, %edx
+    movdqa     %xmm0, %xmm1
+    lea        32(%edi), %edi
+    lea        32(%esi), %esi
+    sub        $0xffff, %edx
+    jnz        L(exit)
 
-	lea	(%ecx, %edi,1), %eax
-	lea	14(%ecx, %esi,1), %edx
-	POP	(%edi)
-	POP	(%esi)
-	jmp	L(less48bytes)
+    lea        (%ecx, %edi,1), %eax
+    lea        14(%ecx, %esi,1), %edx
+    POP        (%edi)
+    POP        (%esi)
+    jmp        L(less48bytes)
 
-	CFI_RESTORE_STATE
-	CFI_REMEMBER_STATE
-	.p2align 4
+    CFI_RESTORE_STATE
+    CFI_REMEMBER_STATE
+    .p2align 4
 L(exit):
-	pmovmskb %xmm1, %ebx
-	sub	$0xffff, %ebx
-	jz	L(first16bytes)
-	lea	-16(%esi), %esi
-	lea	-16(%edi), %edi
-	mov	%ebx, %edx
+    pmovmskb   %xmm1, %ebx
+    sub        $0xffff, %ebx
+    jz         L(first16bytes)
+    lea        -16(%esi), %esi
+    lea        -16(%edi), %edi
+    mov        %ebx, %edx
 
 L(first16bytes):
-	add	%eax, %esi
+    add        %eax, %esi
 L(less16bytes):
-	test	%dl, %dl
-	jz	L(next_four_words)
-	test	$15, %dl
-	jz	L(second_two_words)
-	test	$3, %dl
-	jz	L(second_word)
-	movzwl	-16(%edi), %eax
-	movzwl	-16(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    test       %dl, %dl
+    jz         L(next_four_words)
+    test       $15, %dl
+    jz         L(second_two_words)
+    test       $3, %dl
+    jz         L(second_word)
+    movzwl     -16(%edi), %eax
+    movzwl     -16(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(second_word):
-	movzwl	-14(%edi), %eax
-	movzwl	-14(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    movzwl     -14(%edi), %eax
+    movzwl     -14(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(second_two_words):
-	test	$63, %dl
-	jz	L(fourth_word)
-	movzwl	-12(%edi), %eax
-	movzwl	-12(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    test       $63, %dl
+    jz         L(fourth_word)
+    movzwl     -12(%edi), %eax
+    movzwl     -12(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(fourth_word):
-	movzwl	-10(%edi), %eax
-	movzwl	-10(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    movzwl     -10(%edi), %eax
+    movzwl     -10(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(next_four_words):
-	test	$15, %dh
-	jz	L(fourth_two_words)
-	test	$3, %dh
-	jz	L(sixth_word)
-	movzwl	-8(%edi), %eax
-	movzwl	-8(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    test       $15, %dh
+    jz         L(fourth_two_words)
+    test       $3, %dh
+    jz         L(sixth_word)
+    movzwl     -8(%edi), %eax
+    movzwl     -8(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(sixth_word):
-	movzwl	-6(%edi), %eax
-	movzwl	-6(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    movzwl     -6(%edi), %eax
+    movzwl     -6(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(fourth_two_words):
-	test	$63, %dh
-	jz	L(eighth_word)
-	movzwl	-4(%edi), %eax
-	movzwl	-4(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    test       $63, %dh
+    jz         L(eighth_word)
+    movzwl     -4(%edi), %eax
+    movzwl     -4(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
-	.p2align 4
+    .p2align 4
 L(eighth_word):
-	movzwl	-2(%edi), %eax
-	movzwl	-2(%esi), %ebx
-	subl	%ebx, %eax
-	RETURN
+    movzwl     -2(%edi), %eax
+    movzwl     -2(%esi), %ebx
+    subl       %ebx, %eax
+    RETURN
 
 
-	CFI_PUSH (%ebx)
+    CFI_PUSH (%ebx)
 
-	.p2align 4
+    .p2align 4
 L(more8bytes):
-	cmp	$16, %ecx
-	jae	L(more16bytes)
-	cmp	$8, %ecx
-	je	L(8bytes)
-	cmp	$10, %ecx
-	je	L(10bytes)
-	cmp	$12, %ecx
-	je	L(12bytes)
-	jmp	L(14bytes)
+    cmp        $16, %ecx
+    jae        L(more16bytes)
+    cmp        $8, %ecx
+    je         L(8bytes)
+    cmp        $10, %ecx
+    je         L(10bytes)
+    cmp        $12, %ecx
+    je         L(12bytes)
+    jmp        L(14bytes)
 
-	.p2align 4
+    .p2align 4
 L(more16bytes):
-	cmp	$24, %ecx
-	jae	L(more24bytes)
-	cmp	$16, %ecx
-	je	L(16bytes)
-	cmp	$18, %ecx
-	je	L(18bytes)
-	cmp	$20, %ecx
-	je	L(20bytes)
-	jmp	L(22bytes)
+    cmp        $24, %ecx
+    jae        L(more24bytes)
+    cmp        $16, %ecx
+    je         L(16bytes)
+    cmp        $18, %ecx
+    je         L(18bytes)
+    cmp        $20, %ecx
+    je         L(20bytes)
+    jmp        L(22bytes)
 
-	.p2align 4
+    .p2align 4
 L(more24bytes):
-	cmp	$32, %ecx
-	jae	L(more32bytes)
-	cmp	$24, %ecx
-	je	L(24bytes)
-	cmp	$26, %ecx
-	je	L(26bytes)
-	cmp	$28, %ecx
-	je	L(28bytes)
-	jmp	L(30bytes)
+    cmp        $32, %ecx
+    jae        L(more32bytes)
+    cmp        $24, %ecx
+    je         L(24bytes)
+    cmp        $26, %ecx
+    je         L(26bytes)
+    cmp        $28, %ecx
+    je         L(28bytes)
+    jmp        L(30bytes)
 
-	.p2align 4
+    .p2align 4
 L(more32bytes):
-	cmp	$40, %ecx
-	jae	L(more40bytes)
-	cmp	$32, %ecx
-	je	L(32bytes)
-	cmp	$34, %ecx
-	je	L(34bytes)
-	cmp	$36, %ecx
-	je	L(36bytes)
-	jmp	L(38bytes)
+    cmp        $40, %ecx
+    jae        L(more40bytes)
+    cmp        $32, %ecx
+    je         L(32bytes)
+    cmp        $34, %ecx
+    je         L(34bytes)
+    cmp        $36, %ecx
+    je         L(36bytes)
+    jmp        L(38bytes)
 
-	.p2align 4
+    .p2align 4
 L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	jmp	L(6bytes)
+    cmp        $8, %ecx
+    jae        L(more8bytes)
+    cmp        $2, %ecx
+    je         L(2bytes)
+    cmp        $4, %ecx
+    je         L(4bytes)
+    jmp        L(6bytes)
 
-	.p2align 4
+    .p2align 4
 L(more40bytes):
-	cmp	$40, %ecx
-	je	L(40bytes)
-	cmp	$42, %ecx
-	je	L(42bytes)
-	cmp	$44, %ecx
-	je	L(44bytes)
-	jmp	L(46bytes)
+    cmp        $40, %ecx
+    je         L(40bytes)
+    cmp        $42, %ecx
+    je         L(42bytes)
+    cmp        $44, %ecx
+    je         L(44bytes)
+    jmp        L(46bytes)
 
-	.p2align 4
+    .p2align 4
 L(46bytes):
-	movzwl	-46(%eax), %ecx
-	movzwl	-46(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -46(%eax), %ecx
+    movzwl     -46(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(44bytes):
-	movzwl	-44(%eax), %ecx
-	movzwl	-44(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -44(%eax), %ecx
+    movzwl     -44(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(42bytes):
-	movzwl	-42(%eax), %ecx
-	movzwl	-42(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -42(%eax), %ecx
+    movzwl     -42(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(40bytes):
-	movzwl	-40(%eax), %ecx
-	movzwl	-40(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -40(%eax), %ecx
+    movzwl     -40(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(38bytes):
-	movzwl	-38(%eax), %ecx
-	movzwl	-38(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -38(%eax), %ecx
+    movzwl     -38(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(36bytes):
-	movzwl	-36(%eax), %ecx
-	movzwl	-36(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -36(%eax), %ecx
+    movzwl     -36(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(34bytes):
-	movzwl	-34(%eax), %ecx
-	movzwl	-34(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -34(%eax), %ecx
+    movzwl     -34(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(32bytes):
-	movzwl	-32(%eax), %ecx
-	movzwl	-32(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -32(%eax), %ecx
+    movzwl     -32(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(30bytes):
-	movzwl	-30(%eax), %ecx
-	movzwl	-30(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -30(%eax), %ecx
+    movzwl     -30(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(28bytes):
-	movzwl	-28(%eax), %ecx
-	movzwl	-28(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -28(%eax), %ecx
+    movzwl     -28(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(26bytes):
-	movzwl	-26(%eax), %ecx
-	movzwl	-26(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -26(%eax), %ecx
+    movzwl     -26(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(24bytes):
-	movzwl	-24(%eax), %ecx
-	movzwl	-24(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -24(%eax), %ecx
+    movzwl     -24(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(22bytes):
-	movzwl	-22(%eax), %ecx
-	movzwl	-22(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -22(%eax), %ecx
+    movzwl     -22(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(20bytes):
-	movzwl	-20(%eax), %ecx
-	movzwl	-20(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -20(%eax), %ecx
+    movzwl     -20(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(18bytes):
-	movzwl	-18(%eax), %ecx
-	movzwl	-18(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -18(%eax), %ecx
+    movzwl     -18(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(16bytes):
-	movzwl	-16(%eax), %ecx
-	movzwl	-16(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -16(%eax), %ecx
+    movzwl     -16(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(14bytes):
-	movzwl	-14(%eax), %ecx
-	movzwl	-14(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -14(%eax), %ecx
+    movzwl     -14(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(12bytes):
-	movzwl	-12(%eax), %ecx
-	movzwl	-12(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -12(%eax), %ecx
+    movzwl     -12(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(10bytes):
-	movzwl	-10(%eax), %ecx
-	movzwl	-10(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -10(%eax), %ecx
+    movzwl     -10(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(8bytes):
-	movzwl	-8(%eax), %ecx
-	movzwl	-8(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -8(%eax), %ecx
+    movzwl     -8(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(6bytes):
-	movzwl	-6(%eax), %ecx
-	movzwl	-6(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -6(%eax), %ecx
+    movzwl     -6(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(4bytes):
-	movzwl	-4(%eax), %ecx
-	movzwl	-4(%edx), %ebx
-	subl	%ebx, %ecx
-	jne	L(memcmp16_exit)
+    movzwl     -4(%eax), %ecx
+    movzwl     -4(%edx), %ebx
+    subl       %ebx, %ecx
+    jne        L(memcmp16_exit)
 L(2bytes):
-	movzwl	-2(%eax), %eax
-	movzwl	-2(%edx), %ebx
-	subl	%ebx, %eax
-	POP	(%ebx)
-	ret
-	CFI_PUSH (%ebx)
+    movzwl     -2(%eax), %eax
+    movzwl     -2(%edx), %ebx
+    subl       %ebx, %eax
+    POP        (%ebx)
+    ret
+    CFI_PUSH   (%ebx)
 
-	.p2align 4
+    .p2align 4
 L(memcmp16_exit):
-	POP	(%ebx)
-	mov	%ecx, %eax
-	ret
+    POP        (%ebx)
+    mov        %ecx, %eax
+    ret
 END_FUNCTION MEMCMP