/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


	// Export the blender entry point; the label itself is defined
	// further down in this file.
	.global scanline_t32cb16blend_arm

	// Everything below is code.  The bare .align leaves the amount of
	// alignment to the assembler's default for the target.
	.text
	.align


 | /* | 
 |  * .macro pixel | 
 |  * | 
 |  * \DREG is a 32-bit register containing *two* original destination RGB565  | 
 |  *       pixels, with the even one in the low-16 bits, and the odd one in the | 
 |  *       high 16 bits. | 
 |  * | 
 |  * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors. | 
 |  * | 
 |  * \FB is a target register that will contain the blended pixel values. | 
 |  * | 
 |  * \ODD is either 0 or 1 and indicates if we're blending the lower or  | 
 |  *      upper 16-bit pixels in DREG into FB | 
 |  * | 
 |  * | 
 |  * clobbered: r6, r7, lr | 
 |  * | 
 |  */ | 
 |  | 
 | .macro pixel,   DREG, SRC, FB, ODD | 
 |  | 
 |     // SRC = 0xAABBGGRR | 
 |     mov     r7, \SRC, lsr #24           // sA | 
 |     add     r7, r7, r7, lsr #7          // sA + (sA >> 7) | 
 |     rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7)) | 
 |  | 
 | 1: | 
 |  | 
 | .if \ODD | 
 |  | 
 |     // red | 
 |     mov     lr, \DREG, lsr #(16 + 11) | 
 |     smulbb  lr, r7, lr | 
 |     mov     r6, \SRC, lsr #3 | 
 |     and     r6, r6, #0x1F | 
 |     add     lr, r6, lr, lsr #8 | 
 |     cmp     lr, #0x1F | 
 |     orrhs   \FB, \FB, #(0x1F<<(16 + 11)) | 
 |     orrlo   \FB, \FB, lr, lsl #(16 + 11) | 
 |  | 
 |         // green | 
 |         and     r6, \DREG, #(0x3F<<(16 + 5)) | 
 |         smulbt  r6, r7, r6 | 
 |         mov     lr, \SRC, lsr #(8+2) | 
 |         and     lr, lr, #0x3F | 
 |         add     r6, lr, r6, lsr #(5+8) | 
 |         cmp     r6, #0x3F | 
 |         orrhs   \FB, \FB, #(0x3F<<(16 + 5)) | 
 |         orrlo   \FB, \FB, r6, lsl #(16 + 5) | 
 |  | 
 |             // blue | 
 |             and     lr, \DREG, #(0x1F << 16) | 
 |             smulbt  lr, r7, lr | 
 |             mov     r6, \SRC, lsr #(8+8+3) | 
 |             and     r6, r6, #0x1F | 
 |             add     lr, r6, lr, lsr #8 | 
 |             cmp     lr, #0x1F | 
 |             orrhs   \FB, \FB, #(0x1F << 16) | 
 |             orrlo   \FB, \FB, lr, lsl #16 | 
 |  | 
 | .else | 
 |  | 
 |     // red | 
 |     mov     lr, \DREG, lsr #11 | 
 |     and     lr, lr, #0x1F | 
 |     smulbb  lr, r7, lr | 
 |     mov     r6, \SRC, lsr #3 | 
 |     and     r6, r6, #0x1F | 
 |     add     lr, r6, lr, lsr #8 | 
 |     cmp     lr, #0x1F | 
 |     movhs   \FB, #(0x1F<<11) | 
 |     movlo   \FB, lr, lsl #11 | 
 |  | 
 |  | 
 |         // green | 
 |         and     r6, \DREG, #(0x3F<<5) | 
 |         smulbb  r6, r7, r6 | 
 |         mov     lr, \SRC, lsr #(8+2) | 
 |         and     lr, lr, #0x3F | 
 |         add     r6, lr, r6, lsr #(5+8) | 
 |         cmp     r6, #0x3F | 
 |         orrhs   \FB, \FB, #(0x3F<<5) | 
 |         orrlo   \FB, \FB, r6, lsl #5 | 
 |  | 
 |             // blue | 
 |             and     lr, \DREG, #0x1F | 
 |             smulbb  lr, r7, lr | 
 |             mov     r6, \SRC, lsr #(8+8+3) | 
 |             and     r6, r6, #0x1F | 
 |             add     lr, r6, lr, lsr #8 | 
 |             cmp     lr, #0x1F | 
 |             orrhs   \FB, \FB, #0x1F | 
 |             orrlo   \FB, \FB, lr | 
 |  | 
 | .endif | 
 |  | 
 |     .endm | 
 |      | 
 |  | 
 | // r0:  dst ptr | 
 | // r1:  src ptr | 
 | // r2:  count | 
 | // r3:  d | 
 | // r4:  s0 | 
 | // r5:  s1 | 
 | // r6:  pixel | 
 | // r7:  pixel | 
 | // r8:  free | 
 | // r9:  free | 
 | // r10: free | 
 | // r11: free | 
 | // r12: scratch | 
 | // r14: pixel | 
 |  | 
 | scanline_t32cb16blend_arm: | 
 |     stmfd	sp!, {r4-r7, lr} | 
 |  | 
 |     pld     [r0] | 
 |     pld     [r1] | 
 |  | 
 |     // align DST to 32 bits | 
 |     tst     r0, #0x3 | 
 |     beq     aligned | 
 |     subs    r2, r2, #1 | 
 |     ldmlofd	sp!, {r4-r7, lr}        // return | 
 |     bxlo    lr | 
 |  | 
 | last: | 
 |     ldr     r4, [r1], #4 | 
 |     ldrh    r3, [r0] | 
 |     pixel   r3, r4, r12, 0 | 
 |     strh    r12, [r0], #2 | 
 |  | 
 | aligned: | 
 |     subs    r2, r2, #2 | 
 |     blo     9f | 
 |  | 
 |     // The main loop is unrolled twice and processes 4 pixels | 
 | 8:  ldmia   r1!, {r4, r5} | 
 |     // stream the source | 
 |     pld     [r1, #32] | 
 |     add     r0, r0, #4 | 
 |     // it's all zero, skip this pixel | 
 |     orrs    r3, r4, r5 | 
 |     beq     7f | 
 |      | 
 |     // load the destination | 
 |     ldr     r3, [r0, #-4] | 
 |     // stream the destination | 
 |     pld     [r0, #32] | 
 |     pixel   r3, r4, r12, 0 | 
 |     pixel   r3, r5, r12, 1 | 
 |     // effectively, we're getting write-combining by virtue of the | 
 |     // cpu's write-back cache. | 
 |     str     r12, [r0, #-4] | 
 |  | 
 |     // 2nd iterration of the loop, don't stream anything | 
 |     subs    r2, r2, #2 | 
 |     movlt   r4, r5 | 
 |     blt     9f | 
 |     ldmia   r1!, {r4, r5} | 
 |     add     r0, r0, #4 | 
 |     orrs    r3, r4, r5 | 
 |     beq     7f | 
 |     ldr     r3, [r0, #-4] | 
 |     pixel   r3, r4, r12, 0 | 
 |     pixel   r3, r5, r12, 16 | 
 |     str     r12, [r0, #-4] | 
 |  | 
 |      | 
 | 7:  subs    r2, r2, #2 | 
 |     bhs     8b | 
 |     mov     r4, r5 | 
 |  | 
 | 9:  adds    r2, r2, #1 | 
 |     ldmlofd sp!, {r4-r7, lr}        // return | 
 |     bxlo    lr | 
 |     b       last |