exynos: multimedia: speed up color conversion from ARGB8888 to YUV420SP

Use NEON instructions for color conversion from ARGB8888 to YUV420SP. This
greatly improves performance and can help achieve 30fps+ framerates when
dealing with OMX_COLOR_FormatAndroidOpaque during Screen Recording and
Screen Casting.

Change-Id: Ifdaaf03e1ce6909822df3f046ef35dd977b84d17
diff --git a/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c b/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
index d3f16b9..e738507 100644
--- a/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
+++ b/exynos/multimedia/openmax/component/video/enc/SEC_OMX_Venc.c
@@ -805,7 +805,7 @@
                             SEC_OSAL_GetInfoFromMetaData(inputData, ppBuf);
                             SEC_OSAL_LockANBHandle((OMX_U32)ppBuf[0], width, height, OMX_COLOR_FormatAndroidOpaque, &pOutBuffer);
 
-                            csc_ARGB8888_to_YUV420SP(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
+                            csc_ARGB8888_to_YUV420SP_NEON(pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].YVirAddr,
                                                     pVideoEnc->MFCEncInputBuffer[pVideoEnc->indexInputBuffer].CVirAddr,
                                                     pOutBuffer, width, height);
 
diff --git a/exynos/multimedia/utils/csc/exynos4/Android.mk b/exynos/multimedia/utils/csc/exynos4/Android.mk
index e7ed4e2..8609819 100644
--- a/exynos/multimedia/utils/csc/exynos4/Android.mk
+++ b/exynos/multimedia/utils/csc/exynos4/Android.mk
@@ -15,6 +15,7 @@
 	csc_linear_to_tiled_interleave_crop_neon.s \
 	csc_tiled_to_linear_crop_neon.s \
 	csc_tiled_to_linear_deinterleave_crop_neon.s \
+	csc_ARGB8888_to_YUV420SP_NEON.s \
 	csc_interleave_memcpy_neon.s \
 	csc_fimc.cpp
 
diff --git a/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h b/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
index 92c0a6d..1967f48 100644
--- a/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
+++ b/exynos/multimedia/utils/csc/exynos4/color_space_convertor.h
@@ -411,4 +411,11 @@
     unsigned int width,
     unsigned int height);
 
+void csc_ARGB8888_to_YUV420SP_NEON(
+    unsigned char *y_dst,      /* receives one Y byte per source pixel */
+    unsigned char *uv_dst,     /* receives one interleaved U,V byte pair per 2x2 pixel block */
+    unsigned char *rgb_src,    /* source pixels, 4 bytes each */
+    unsigned int width,        /* pixels per row */
+    unsigned int height);      /* number of rows; rows are consumed two at a time */
+
 #endif /*COLOR_SPACE_CONVERTOR_H_*/
diff --git a/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s b/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s
new file mode 100644
index 0000000..62ccf97
--- /dev/null
+++ b/exynos/multimedia/utils/csc/exynos4/csc_ARGB8888_to_YUV420SP_NEON.s
@@ -0,0 +1,365 @@
+
+    .arch armv7-a
+    .text
+    .global csc_ARGB8888_to_YUV420SP_NEON
+    .type   csc_ARGB8888_to_YUV420SP_NEON, %function
+csc_ARGB8888_to_YUV420SP_NEON:
+    .fnstart
+    @ Converts 4-byte pixels (byte 0/1/2 used as B/G/R, byte 3 ignored) to a Y plane plus interleaved UV, two rows per pass.
+    @r0     pDstY
+    @r1     pDstUV
+    @r2     pSrcRGB
+    @r3     nWidth
+    @r4     pDstY2 = pDstY + nWidth
+    @r5     pSrcRGB2 = pSrcRGB + nWidth*4
+    @r6     temp7, byte mask / rounding constant (scalar tail)
+    @r7     temp6, accumulator
+    @r8     temp5, nWidthTemp (pixels left in the current row pair)
+    @r9     temp4, raw pixel
+    @r10    temp3, r,g,b
+    @r11    temp2, immediate operand
+    @r12    temp1, nHeight (parked on the stack while a row pair is processed)
+    @r14    temp0, channel of the second pixel (scalar tail)
+    @ nWidth and nHeight are expected to be even: rows and tail pixels are consumed in pairs.
+    .equ CACHE_LINE_SIZE, 32
+    .equ PRE_LOAD_OFFSET, 6
+
+    stmfd       sp!, {r4-r12,r14}       @ backup registers
+    ldr         r12, [sp, #40]           @ load nHeight (5th argument, above the 10 saved words)
+    vpush       {d8-d15}                @ q4-q7 are callee-saved (AAPCS) and are clobbered below
+    add         r4, r0, r3             @r4: pDstY2 = pDstY + nWidth
+    add         r5, r2, r3, lsl #2     @r5: pSrcRGB2 = tmpSrcRGB + nWidthx4
+    sub         r8, r3, #16                @r8: nWidthTmp = nWidth -16
+
+    @q0-q3: deinterleaved source bytes (d0/d4: B, d1/d5: G, d2/d6: R); q0 is reused for odd-pixel Y
+    @q4: R
+    @q5: G
+    @q6: B
+    @q7: temp, V / even-pixel Y
+    @q8: temp, U / coefficient 25
+
+
+    @ Loop-invariant constants. q6-q8, q14 and q15 are rewritten on every pass of
+    @ LOOP_NWIDTH16 before they are read, so they are not initialised here.
+
+    vmov.u16 q9,  #0x8080  @ 128<<8 + 128
+
+    vmov.u16 q10, #0x1000  @ 16<<8 + 128
+    vorr.u16 q10, #0x0080
+
+    vmov.u16 q11, #38 @#-38
+    vmov.u16 q12, #74 @#-74
+    vmov.u16 q13, #112
+
+
+LOOP_NHEIGHT2:
+    stmfd       sp!, {r12}       @ park nHeight; r12 is reused as scratch in the scalar tail
+
+    @ The 16-pixel body runs before its loop test, so rows narrower than 16 must bypass it.
+    cmp         r3, #16
+    blo         CHECK_UNALIGNED
+
+LOOP_NWIDTH16:
+    pld         [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
+   @-------------------------------------------UV (from the even pixels of row 1) -----------
+    vmov.u16 q14, #94 @#94
+    vmov.u16 q15, #18 @#18
+    vld4.8   {d0,d1,d2,d3}, [r2]! @load 8 pixels, deinterleaved per byte
+    vld4.8   {d4,d5,d6,d7}, [r2]! @load 8 more pixels, deinterleaved per byte
+
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vand.u16 q4,#0x00FF  @R of even pixels
+    vand.u16 q5,#0x00FF  @G of even pixels
+    vand.u16 q6,#0x00FF  @B of even pixels
+
+    vmov.u16 q8,q9   @ CalcU()
+    vmla.u16 q8,q6,q13  @112 * B[k]
+    vmls.u16 q8,q4,q11  @-(38 * R[k])
+    vmls.u16 q8,q5,q12  @-(74 * G[k])
+    vshr.u16 q8,q8, #8  @(128<<8+ 128 + u)>>8
+
+    vmov.u16 q7,q9      @CalcV()
+    vmla.u16 q7,q4,q13  @112 * R[k]
+    vmls.u16 q7,q5,q14  @-(94 * G[k])
+    vmls.u16 q7,q6,q15  @-(18 * B[k])
+    vshr.u16 q7,q7, #8  @(128<<8+ 128 + v)>>8
+
+
+    vtrn.8 q8,q7           @q8 = U0,V0,U1,V1,...
+    vst1.8  {q8}, [r1]!    @write 16 interleaved UV bytes
+
+    @-------------------------------------------Y (row 1) ----------------------------------
+
+    vmov.u16 q14, #66 @#66
+    vmov.u16 q15, #129 @#129
+    vmov.u16 q8, #25 @#25
+
+    @CalcY_Y()
+
+    vmul.u16 q7,q4,q14  @q7 = 66 *R[k]
+    vmla.u16 q7,q5,q15  @q7 += 129 *G[k]
+    vmla.u16 q7,q6,q8  @q7 += 25 *B[k]
+
+    vadd.u16 q7,q7,q10
+    vshr.u16 q7,q7, #8
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vshr.u16 q4,q4,#8  @R of odd pixels
+    vshr.u16 q5,q5,#8  @G of odd pixels
+    vshr.u16 q6,q6,#8  @B of odd pixels
+
+    vmul.u16 q0,q4,q14  @q0 = 66 *R[k]
+    vmla.u16 q0,q5,q15  @q0 += 129 *G[k]
+    vmla.u16 q0,q6,q8  @q0 += 25 *B[k]
+    vadd.u16 q0,q0,q10
+    vshr.u16 q0,q0, #8
+
+    vtrn.8 q7,q0           @q7 = Y0,Y1,Y2,...
+    vst1.8  {q7}, [r0]!@write 16 Y bytes of row 1
+
+
+
+   @-------------------------------------------Y (row 2) ----------------------------------
+
+            @---------------------------------------------Y1-------------------------------------------
+
+    pld         [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
+    vld4.8   {d0,d1,d2,d3}, [r5]! @load 8 pixels, deinterleaved per byte
+    vld4.8   {d4,d5,d6,d7}, [r5]! @load 8 more pixels, deinterleaved per byte
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+
+    vand.u16 q4,#0x00FF  @R of even pixels
+    vand.u16 q5,#0x00FF  @G of even pixels
+    vand.u16 q6,#0x00FF  @B of even pixels
+
+
+
+    vmul.u16 q7,q4,q14  @q7 = 66 *R[k]
+    vmla.u16 q7,q5,q15  @q7 += 129 *G[k]
+    vmla.u16 q7,q6,q8  @q7 += 25 *B[k]
+    vadd.u16 q7,q7,q10
+    vshr.u16 q7,q7, #8
+
+    vmov.u16 d8,d2
+    vmov.u16 d9,d6
+    vmov.u16 d10,d1
+    vmov.u16 d11,d5
+    vmov.u16 d12,d0
+    vmov.u16 d13,d4
+
+    vshr.u16 q4,q4,#8  @R of odd pixels
+    vshr.u16 q5,q5,#8  @G of odd pixels
+    vshr.u16 q6,q6,#8  @B of odd pixels
+
+    vmul.u16 q0,q4,q14  @q0 = 66 *R[k]
+    vmla.u16 q0,q5,q15  @q0 += 129 *G[k]
+    vmla.u16 q0,q6,q8  @q0 += 25 *B[k]
+    vadd.u16 q0,q0,q10
+    vshr.u16 q0,q0, #8
+
+    vtrn.8 q7,q0           @q7 = Y0,Y1,Y2,...
+    vst1.8  {q7}, [r4]!@write 16 Y bytes of row 2
+
+    subs r8,r8,#16                       @nWidthTmp -= 16
+    BPL LOOP_NWIDTH16                @while at least 16 pixels remain
+    @-----------------------------------unaligned ---------------------------------------
+CHECK_UNALIGNED:
+    adds r8,r8,#16 @ undo the -16 bias: r8 = pixels left in this row pair
+    BEQ NO_UNALIGNED  @in case that nWidth is multiple of 16
+LOOP_NWIDTH2:
+    @----------------------------------row 1--Y (two pixels packed in one register)---------
+    @stmfd sp!, {r14} @backup r14
+
+
+    ldr r9,  [r2], #4 @load pixel 0
+    ldr r12,  [r2], #4 @load pixel 1
+
+    mov r10, r9,lsr #16    @pixel 0: R down to bits 0-7
+    mov r14, r12    @pixel 1: R already in bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @R of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @R of pixel 1
+    add r10,r10,r14
+
+    mov r11, #66 @accumulator = R*66
+    mul r7, r10, r11
+
+    mov r10, r9,lsr #8    @pixel 0: G down to bits 0-7
+    mov r14, r12,lsl #8    @pixel 1: G up to bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @G of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @G of pixel 1
+    add r10,r10,r14
+
+    mov r11, #129 @accumulator += G *129
+    mla r7, r10, r11, r7
+
+    mov r10, r9    @pixel 0: B already in bits 0-7
+    mov r14, r12,lsl #16    @pixel 1: B up to bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @B of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @B of pixel 1
+    add r10,r10,r14
+
+    mov r11, #25 @accumulator += B *25
+    mla r7, r10, r11, r7
+
+    ldr r6, =0x10801080
+    add  r7, r6
+
+    lsr r7, #8
+    strb r7, [r0],#1
+    lsr r7,#16
+    strb r7, [r0],#1
+    @ldmfd sp!, {r14} @load r14
+
+
+    @----------------------------------row 1--UV (from pixel 0 in r9)------------------------
+
+    mov r10, r9    @copy to r10
+    ldr  r7,=0x00008080
+    mov  r12,r7
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @B:
+
+    mov r11, #112 @U accumulator += B*112
+    mla r7, r10, r11, r7
+
+
+    mov r11, #18 @V accumulator -= B*18
+    mul r11, r10, r11
+    sub r12, r12, r11
+
+
+
+
+    mov r10, r9, lsr #16    @copy to r10
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @R:
+
+    mov r11, #38 @U accumulator -= R *38
+    mul r11, r10, r11
+    sub r7, r7, r11
+
+    mov r11, #112 @V accumulator += R *112
+    mla r12, r10, r11, r12
+
+    mov r10, r9,lsr #8    @copy to r10
+    ldr r6, =0x000000FF
+    and r10, r10, r6  @G:
+
+    mov r11, #74 @U accumulator -= G*74
+    mul r11, r10, r11
+    sub r7, r7, r11
+
+    mov r11, #94 @V accumulator -= G*94
+    mul r11, r10, r11
+    sub r12, r12, r11
+
+    lsr r7, #8 @ >>8
+    strb r7, [r1],#1
+    lsr r12, #8 @ >>8
+    strb r12, [r1],#1
+
+    @----------------------------------row 2--Y (two pixels packed in one register)---------
+    @stmfd sp!, {r14} @backup r14
+
+
+    ldr r9,  [r5], #4 @load pixel 0
+    ldr r12,  [r5], #4 @load pixel 1
+
+    mov r10, r9,lsr #16    @pixel 0: R down to bits 0-7
+    mov r14, r12    @pixel 1: R already in bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @R of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @R of pixel 1
+    add r10,r10,r14
+
+    mov r11, #66 @accumulator = R*66
+    mul r7, r10, r11
+
+    mov r10, r9,lsr #8    @pixel 0: G down to bits 0-7
+    mov r14, r12,lsl #8    @pixel 1: G up to bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @G of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @G of pixel 1
+    add r10,r10,r14
+
+    mov r11, #129 @accumulator += G *129
+    mla r7, r10, r11, r7
+
+    mov r10, r9    @pixel 0: B already in bits 0-7
+    mov r14, r12,lsl #16    @pixel 1: B up to bits 16-23
+
+    ldr r6, =0x000000FF
+    and r10, r10, r6 @B of pixel 0
+    ldr r6, =0x00FF0000
+    and r14, r14, r6 @B of pixel 1
+    add r10,r10,r14
+
+
+
+
+    mov r11, #25 @accumulator += B *25
+    mla r7, r10, r11, r7
+
+    ldr r6, =0x10801080
+    add  r7, r6
+    lsr r7, #8
+
+    strb r7, [r4],#1
+    lsr r7,#16
+    strb r7, [r4],#1
+    @ldmfd sp!, {r14} @load r14
+
+
+    subs r8,r8,#2                      @ nWidth2 -= 2
+    BGT LOOP_NWIDTH2                @ if nWidth2>0
+
+
+NO_UNALIGNED: @in case that nWidth is multiple of 16
+
+    @----------------------------------- advance to the next row pair --------------------
+    sub         r8, r3, #16                @r8: nWidthTmp = nWidth -16
+    add r0, r0,  r3   @pDstY +  nwidth
+    add r2, r2, r3, lsl #2    @pSrcRGB +  nwidthx4
+    add r4, r4,  r3   @pDstY2 +  nwidth
+    add r5, r5, r3, lsl #2   @pSrcRGB2 +  nwidthx4
+
+    ldmfd sp!, {r12}
+    subs r12,r12,#2                       @nHeight -=2
+    BGT LOOP_NHEIGHT2                @if nHeight2>0
+    vpop        {d8-d15}                @ restore callee-saved q4-q7
+    ldmfd       sp!, {r4-r12,pc}       @ restore registers and return
+    .fnend