[GPU] Add explicit byte order and PM vs. UPM 8888 configs

Review URL: http://codereview.appspot.com/5347042/



git-svn-id: http://skia.googlecode.com/svn/trunk@2618 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/gpu/GrAtlas.cpp b/src/gpu/GrAtlas.cpp
index 0838895..244f694 100644
--- a/src/gpu/GrAtlas.cpp
+++ b/src/gpu/GrAtlas.cpp
@@ -141,7 +141,7 @@
         case kA565_GrMaskFormat:
             return kRGB_565_GrPixelConfig;
         case kA888_GrMaskFormat:
-            return kRGBA_8888_GrPixelConfig;
+            return kSkia8888_PM_GrPixelConfig;
         default:
             GrAssert(!"unknown maskformat");
     }
diff --git a/src/gpu/GrContext.cpp b/src/gpu/GrContext.cpp
index f4e5e9b..8c06c21 100644
--- a/src/gpu/GrContext.cpp
+++ b/src/gpu/GrContext.cpp
@@ -718,7 +718,7 @@
                       kNoStencil_GrTextureFlagBit;
     }
 
-    desc.fConfig = kRGBA_8888_GrPixelConfig;
+    desc.fConfig = kRGBA_8888_PM_GrPixelConfig;
 
     if (PREFER_MSAA_OFFSCREEN_AA && fGpu->getCaps().fFSAASupport) {
         record->fDownsample = OffscreenRecord::kFSAA_Downsample;
@@ -1610,9 +1610,8 @@
     if (kDiscard_FlushBit & flagsBitfield) {
         fDrawBuffer->reset();
     } else {
-        flushDrawBuffer();
+        this->flushDrawBuffer();
     }
-
     if (kForceCurrentRenderTarget_FlushBit & flagsBitfield) {
         fGpu->forceRenderTargetFlush();
     }
@@ -1643,9 +1642,9 @@
     this->flush();
     GrRenderTarget* target = texture->asRenderTarget();
     if (NULL != target) {
-        return fGpu->readPixels(target,
-                                left, top, width, height, 
-                                config, buffer, 0);
+        return this->readRenderTargetPixels(target,
+                                            left, top, width, height, 
+                                            config, buffer, 0);
     } else {
         return false;
     }
@@ -1656,15 +1655,87 @@
                                        GrPixelConfig config, void* buffer,
                                        size_t rowBytes) {
     SK_TRACE_EVENT0("GrContext::readRenderTargetPixels");
-    uint32_t flushFlags = 0;
     if (NULL == target) { 
-        flushFlags |= GrContext::kForceCurrentRenderTarget_FlushBit;
+        target = fGpu->getRenderTarget();
+        if (NULL == target) {
+            return false;
+        }
+    }
+    
+    // PM <-> UPM conversion requires a draw. Currently we only support drawing
+    // into a UPM target, not reading from a UPM texture. Thus, UPM->PM is not
+    // not supported at this time.
+    if (GrPixelConfigIsUnpremultiplied(target->config()) && 
+        !GrPixelConfigIsUnpremultiplied(config)) {
+        return false;
     }
 
-    this->flush(flushFlags);
+    this->flush();
+
+    GrTexture* src = target->asTexture();
+
+    bool flipY = NULL != src &&
+                 fGpu->readPixelsWillPayForYFlip(target, left, top,
+                                                 width, height, config,
+                                                 rowBytes);
+
+    if (flipY || (!GrPixelConfigIsUnpremultiplied(target->config()) &&
+                  GrPixelConfigIsUnpremultiplied(config))) {
+        if (!src) {
+            // we should fallback to cpu conversion here. This could happen when
+            // we were given an external render target by the client that is not
+            // also a texture (e.g. FBO 0 in GL)
+            return false;
+        }
+        // Make the scratch a render target because we don't have a robust
+        // readTexturePixels as of yet (it calls this function).
+        const GrTextureDesc desc = {
+            kRenderTarget_GrTextureFlagBit,
+            kNone_GrAALevel,
+            width, height,
+            config
+        };
+        GrAutoScratchTexture ast(this, desc);
+        GrTexture* texture = ast.texture();
+        if (!texture) {
+            return false;
+        }
+        target = texture->asRenderTarget();
+        fGpu->setRenderTarget(target);
+        GrAssert(NULL != target);
+
+        GrDrawTarget::AutoStateRestore asr(fGpu);
+
+        fGpu->setViewMatrix(GrMatrix::I());
+        fGpu->setColorFilter(0, SkXfermode::kDst_Mode);
+        fGpu->disableState(GrDrawTarget::kClip_StateBit);
+        fGpu->setAlpha(0xFF);
+        fGpu->setBlendFunc(kOne_BlendCoeff,
+                           kZero_BlendCoeff);
+
+        GrSamplerState sampler;
+        sampler.setClampNoFilter();
+        GrMatrix matrix;
+        if (flipY) {
+            matrix.setTranslate(SK_Scalar1 * left,
+                                SK_Scalar1 * (top + height));
+            matrix.set(GrMatrix::kMScaleY, -GR_Scalar1);
+        } else {
+            matrix.setTranslate(SK_Scalar1 *left, SK_Scalar1 *top);
+        }
+        matrix.postIDiv(src->width(), src->height());
+        sampler.setMatrix(matrix);
+        fGpu->setSamplerState(0, sampler);
+        fGpu->setTexture(0, src);
+        GrRect rect;
+        rect.setXYWH(0, 0, SK_Scalar1 * width, SK_Scalar1 * height);
+        fGpu->drawSimpleRect(rect, NULL, 0x1);
+        left = 0;
+        top = 0;
+    }
     return fGpu->readPixels(target,
-                            left, top, width, height, 
-                            config, buffer, rowBytes);
+                            left, top, width, height,
+                            config, buffer, rowBytes, flipY);
 }
 
 void GrContext::writePixels(int left, int top, int width, int height,
@@ -1675,7 +1746,7 @@
     // TODO: when underlying api has a direct way to do this we should use it
     // (e.g. glDrawPixels on desktop GL).
 
-    this->flush(true);
+    this->flush(kForceCurrentRenderTarget_FlushBit);
 
     const GrTextureDesc desc = {
         kNone_GrTextureFlags, kNone_GrAALevel, width, height, config
@@ -1949,3 +2020,5 @@
     fGpu->setBlendFunc(kOne_BlendCoeff, kZero_BlendCoeff);
     fGpu->drawSimpleRect(rect, NULL, 1 << 0);
 }
+
+///////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/GrDrawTarget.cpp b/src/gpu/GrDrawTarget.cpp
index 8f4f3eb..6b51ca2 100644
--- a/src/gpu/GrDrawTarget.cpp
+++ b/src/gpu/GrDrawTarget.cpp
@@ -800,6 +800,12 @@
         }
     }
 #endif
+    for (int s = 0; s < GrDrawState::kNumStages; ++s) {
+        if (this->isStageEnabled(s) &&
+            GrPixelConfigIsUnpremultiplied(fCurrDrawState.fTextures[s]->config())) {
+            return false;
+        }
+    }
     return true;
 }
 
@@ -814,7 +820,6 @@
     }
 }
 
-
 void GrDrawTarget::drawNonIndexed(GrPrimitiveType type,
                                   int startVertex,
                                   int vertexCount) {
diff --git a/src/gpu/GrGLProgram.cpp b/src/gpu/GrGLProgram.cpp
index 554a274..89ced68 100644
--- a/src/gpu/GrGLProgram.cpp
+++ b/src/gpu/GrGLProgram.cpp
@@ -892,6 +892,13 @@
                         inColor.c_str(),
                         inCoverage.c_str(),
                         &segments.fFSCode);
+        if (ProgramDesc::kNo_OutputPM == fProgramDesc.fOutputPM) {
+            segments.fFSCode.appendf("\t%s = vec4(%s.rgb / %s.a, %s.a);\n",
+                                     fsColorOutput,
+                                     fsColorOutput,
+                                     fsColorOutput,
+                                     fsColorOutput);
+        }
     }
 
     segments.fVSCode.append("}\n");
diff --git a/src/gpu/GrGLProgram.h b/src/gpu/GrGLProgram.h
index 53bc0d8..9abf1b1 100644
--- a/src/gpu/GrGLProgram.h
+++ b/src/gpu/GrGLProgram.h
@@ -102,6 +102,15 @@
             memset(this, 0, sizeof(ProgramDesc));
         }
 
+        enum OutputPM {
+            // PM-color OR color with no alpha channel
+            kYes_OutputPM,
+            // nonPM-color with alpha channel
+            kNo_OutputPM,
+
+            kOutputPMCnt
+        };
+
         struct StageDesc {
             enum OptFlagBits {
                 kNoPerspective_OptFlagBit       = 1 << 0,
@@ -195,6 +204,7 @@
 #endif
 
         uint8_t fColorInput;        // casts to enum ColorInput
+        uint8_t fOutputPM;          // cases to enum OutputPM
         uint8_t fDualSrcOutput;     // casts to enum DualSrcOutput
         int8_t fFirstCoverageStage;
         SkBool8 fEmitsPointSize;
@@ -203,8 +213,6 @@
         int8_t fEdgeAANumEdges;
         uint8_t fColorFilterXfermode;  // casts to enum SkXfermode::Mode
 
-        uint8_t fPadTo32bLengthMultiple [1];
-
     } fProgramDesc;
     GR_STATIC_ASSERT(!(sizeof(ProgramDesc) % 4));
 
diff --git a/src/gpu/GrGpu.cpp b/src/gpu/GrGpu.cpp
index b7d1395..7da560e 100644
--- a/src/gpu/GrGpu.cpp
+++ b/src/gpu/GrGpu.cpp
@@ -242,11 +242,12 @@
 bool GrGpu::readPixels(GrRenderTarget* target,
                        int left, int top, int width, int height,
                        GrPixelConfig config, void* buffer,
-                       size_t rowBytes) {
-
+                       size_t rowBytes, bool invertY) {
+    GrAssert(GrPixelConfigIsUnpremultiplied(config) ==
+             GrPixelConfigIsUnpremultiplied(target->config()));
     this->handleDirtyContext();
     return this->onReadPixels(target, left, top, width, height,
-                              config, buffer, rowBytes);
+                              config, buffer, rowBytes, invertY);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/GrGpu.h b/src/gpu/GrGpu.h
index b0aaa18..11edfe3 100644
--- a/src/gpu/GrGpu.h
+++ b/src/gpu/GrGpu.h
@@ -184,7 +184,34 @@
     void forceRenderTargetFlush();
 
     /**
-     * Reads a rectangle of pixels from a render target.
+     * OpenGL's readPixels returns the result bottom-to-top while the skia
+     * API is top-to-bottom. Thus we have to do a y-axis flip. The obvious
+     * solution is to have the subclass do the flip using either the CPU or GPU.
+     * However, the caller (GrContext) may have transformations to apply and can
+     * simply fold in the y-flip for free. On the other hand, the subclass may
+     * be able to do it for free itself. For example, the subclass may have to
+     * do memcpys to handle rowBytes that aren't tight. It could do the y-flip 
+     * concurrently.
+     *
+     * This function returns true if a y-flip is required to put the pixels in
+     * top-to-bottom order and the subclass cannot do it for free.
+     *
+     * See read pixels for the params
+     * @return true if calling readPixels with the same set of params will
+     *              produce bottom-to-top data
+     */
+     virtual bool readPixelsWillPayForYFlip(GrRenderTarget* renderTarget,
+                                            int left, int top,
+                                            int width, int height,
+                                            GrPixelConfig config,
+                                            size_t rowBytes) = 0;
+
+    /**
+     * Reads a rectangle of pixels from a render target. Fails if read requires
+     * conversion between premultiplied and unpremultiplied configs. The caller
+     * should do the conversion by rendering to a target with the desire config
+     * first.
+     *
      * @param renderTarget  the render target to read from. NULL means the
      *                      current render target.
      * @param left          left edge of the rectangle to read (inclusive)
@@ -195,6 +222,8 @@
      * @param buffer        memory to read the rectangle into.
      * @param rowBytes      the number of bytes between consecutive rows. Zero
      *                      means rows are tightly packed.
+     * @param invertY       buffer should be populated bottom-to-top as opposed
+     *                      to top-to-bottom (skia's usual order)
      *
      * @return true if the read succeeded, false if not. The read can fail
      *              because of a unsupported pixel config or because no render
@@ -202,7 +231,8 @@
      */
     bool readPixels(GrRenderTarget* renderTarget,
                     int left, int top, int width, int height,
-                    GrPixelConfig config, void* buffer, size_t rowBytes);
+                    GrPixelConfig config, void* buffer, size_t rowBytes,
+                    bool invertY);
 
     const GrGpuStats& getStats() const;
     void resetStats();
@@ -356,7 +386,10 @@
     // overridden by API-specific derived class to perform the read pixels.
     virtual bool onReadPixels(GrRenderTarget* target,
                               int left, int top, int width, int height,
-                              GrPixelConfig, void* buffer, size_t rowBytes) = 0;
+                              GrPixelConfig,
+                              void* buffer,
+                              size_t rowBytes,
+                              bool invertY) = 0;
 
     // called to program the vertex data, indexCount will be 0 if drawing non-
     // indexed geometry. The subclass may adjust the startVertex and/or
diff --git a/src/gpu/GrGpuGL.cpp b/src/gpu/GrGpuGL.cpp
index a3562b2..016b110 100644
--- a/src/gpu/GrGpuGL.cpp
+++ b/src/gpu/GrGpuGL.cpp
@@ -384,14 +384,24 @@
     if (kDesktop_GrGLBinding == this->glBinding()) {
         fGLCaps.fRGBA8Renderbuffer = true;
     } else {
-        fGLCaps.fRGBA8Renderbuffer = this->hasExtension("GL_OES_rgb8_rgba8");
+        fGLCaps.fRGBA8Renderbuffer = this->hasExtension("GL_OES_rgb8_rgba8") ||
+                                     this->hasExtension("GL_ARM_rgba8");
     }
 
 
-    if (kDesktop_GrGLBinding != this->glBinding()) {
-        if (GR_GL_32BPP_COLOR_FORMAT == GR_GL_BGRA) {
-            GrAssert(this->hasExtension("GL_EXT_texture_format_BGRA8888"));
+    if (kDesktop_GrGLBinding == this->glBinding()) {
+        fGLCaps.fBGRAFormat = this->glVersion() >= GR_GL_VER(1,2) ||
+                              this->hasExtension("GL_EXT_bgra");
+    } else {
+        bool hasBGRAExt = false;
+        if (this->hasExtension("GL_APPLE_texture_format_BGRA8888")) {
+            fGLCaps.fBGRAFormat = true;
+        } else if (this->hasExtension("GL_EXT_texture_format_BGRA8888")) {
+            fGLCaps.fBGRAFormat = true;
+            fGLCaps.fBGRAInternalFormat = true;
         }
+        GrAssert(fGLCaps.fBGRAFormat ||
+                 kSkia8888_PM_GrPixelConfig != kBGRA_8888_PM_GrPixelConfig);
     }
 
     if (kDesktop_GrGLBinding == this->glBinding()) {
@@ -1497,18 +1507,35 @@
     this->flushRenderTarget(&GrIRect::EmptyIRect());
 }
 
+bool GrGpuGL::readPixelsWillPayForYFlip(GrRenderTarget* renderTarget,
+                                        int left, int top,
+                                        int width, int height,
+                                        GrPixelConfig config,
+                                        size_t rowBytes) {
+    if (kDesktop_GrGLBinding == this->glBinding()) {
+        return false;
+    } else {
+        // On ES we'll have to do memcpys to handle rowByte padding. So, we
+        // might as well flipY while we're at it.
+        return 0 != rowBytes &&
+               GrBytesPerPixel(config) * width != rowBytes; 
+    }
+}
+
 bool GrGpuGL::onReadPixels(GrRenderTarget* target,
                            int left, int top,
                            int width, int height,
-                           GrPixelConfig config, 
-                           void* buffer, size_t rowBytes) {
+                           GrPixelConfig config,
+                           void* buffer,
+                           size_t rowBytes,
+                           bool invertY) {
     GrGLenum internalFormat;  // we don't use this for glReadPixels
     GrGLenum format;
     GrGLenum type;
     if (!this->canBeTexture(config, &internalFormat, &format, &type)) {
         return false;
     }
-    
+
     // resolve the render target if necessary
     GrGLRenderTarget* tgt = static_cast<GrGLRenderTarget*>(target);
     GrAutoTPtrValueRestore<GrRenderTarget*> autoTargetRestore;
@@ -1568,29 +1595,38 @@
     // that the above readPixels did not overwrite the padding.
     if (readDst == buffer) {
         GrAssert(rowBytes == readDstRowBytes);
-        scratch.reset(tightRowBytes);
-        void* tmpRow = scratch.get();
-        // flip y in-place by rows
-        const int halfY = height >> 1;
-        char* top = reinterpret_cast<char*>(buffer);
-        char* bottom = top + (height - 1) * rowBytes;
-        for (int y = 0; y < halfY; y++) {
-            memcpy(tmpRow, top, tightRowBytes);
-            memcpy(top, bottom, tightRowBytes);
-            memcpy(bottom, tmpRow, tightRowBytes);
-            top += rowBytes;
-            bottom -= rowBytes;
+        if (!invertY) {
+            scratch.reset(tightRowBytes);
+            void* tmpRow = scratch.get();
+            // flip y in-place by rows
+            const int halfY = height >> 1;
+            char* top = reinterpret_cast<char*>(buffer);
+            char* bottom = top + (height - 1) * rowBytes;
+            for (int y = 0; y < halfY; y++) {
+                memcpy(tmpRow, top, tightRowBytes);
+                memcpy(top, bottom, tightRowBytes);
+                memcpy(bottom, tmpRow, tightRowBytes);
+                top += rowBytes;
+                bottom -= rowBytes;
+            }
         }
     } else {
-        GrAssert(readDst != buffer);
+        GrAssert(readDst != buffer);        GrAssert(rowBytes != tightRowBytes);
         // copy from readDst to buffer while flipping y
         const int halfY = height >> 1;
         const char* src = reinterpret_cast<const char*>(readDst);
-        char* dst = reinterpret_cast<char*>(buffer) + (height-1) * rowBytes;
+        char* dst = reinterpret_cast<char*>(buffer);
+        if (!invertY) {
+            dst += (height-1) * rowBytes;
+        }
         for (int y = 0; y < height; y++) {
             memcpy(dst, src, tightRowBytes);
             src += readDstRowBytes;
-            dst -= rowBytes;
+            if (invertY) {
+                dst += rowBytes;
+            } else {
+                dst -= rowBytes;
+            }
         }
     }
     return true;
@@ -2249,13 +2285,20 @@
                            GrGLenum* format,
                            GrGLenum* type) {
     switch (config) {
-        case kRGBA_8888_GrPixelConfig:
-        case kRGBX_8888_GrPixelConfig: // todo: can we tell it our X?
-            *format = GR_GL_32BPP_COLOR_FORMAT;
-            if (kDesktop_GrGLBinding != this->glBinding()) {
-                // according to GL_EXT_texture_format_BGRA8888 the *internal*
-                // format for a BGRA is BGRA not RGBA (as on desktop)
-                *internalFormat = GR_GL_32BPP_COLOR_FORMAT;
+        case kRGBA_8888_PM_GrPixelConfig:
+        case kRGBA_8888_UPM_GrPixelConfig:
+            *format = GR_GL_RGBA;
+            *internalFormat = GR_GL_RGBA;
+            *type = GR_GL_UNSIGNED_BYTE;
+            break;
+        case kBGRA_8888_PM_GrPixelConfig:
+        case kBGRA_8888_UPM_GrPixelConfig:
+            if (!fGLCaps.fBGRAFormat) {
+                return false;
+            }
+            *format = GR_GL_BGRA;
+            if (fGLCaps.fBGRAInternalFormat) {
+                *internalFormat = GR_GL_BGRA;
             } else {
                 *internalFormat = GR_GL_RGBA;
             }
@@ -2314,9 +2357,22 @@
  */
 bool GrGpuGL::fboInternalFormat(GrPixelConfig config, GrGLenum* format) {
     switch (config) {
-        case kRGBA_8888_GrPixelConfig:
-        case kRGBX_8888_GrPixelConfig:
+        // The ES story for BGRA and RenderbufferStorage appears murky. It
+        // takes an internal format as a parameter. The OES FBO extension and
+        // 2.0 spec don't refer to BGRA as it's not part of the core. One ES
+        // BGRA extensions adds BGRA as both an internal and external format
+        // and the other only as an external format (like desktop GL). OES
+        // restricts RenderbufferStorage's format to a *sized* internal format.
+        // There is no sized BGRA internal format.
+        // So if the texture has internal format BGRA we just hope that the
+        // resolve blit can do RGBA->BGRA internal format conversion.
+        case kRGBA_8888_PM_GrPixelConfig:
+        case kRGBA_8888_UPM_GrPixelConfig:
+        case kBGRA_8888_PM_GrPixelConfig:
+        case kBGRA_8888_UPM_GrPixelConfig:
             if (fGLCaps.fRGBA8Renderbuffer) {
+                // The GL_OES_rgba8_rgb8 extension defines GL_RGBA8 as a sized
+                // internal format.
                 *format = GR_GL_RGBA8;
                 return true;
             } else {
@@ -2326,9 +2382,12 @@
             // ES2 supports 565. ES1 supports it
             // with FBO extension desktop GL has
             // no such internal format
-            GrAssert(kDesktop_GrGLBinding != this->glBinding());  
-            *format = GR_GL_RGB565;
-            return true;
+            if (kDesktop_GrGLBinding != this->glBinding()) {
+                *format = GR_GL_RGB565;
+                return true;
+            } else {
+                return false;
+            }
         case kRGBA_4444_GrPixelConfig:
             *format = GR_GL_RGBA4;
             return true;
@@ -2435,6 +2494,8 @@
     GrPrintf("Max FS Uniform Vectors: %d\n", fMaxFragmentUniformVectors);
     GrPrintf("Support RGBA8 Render Buffer: %s\n",
              (fRGBA8Renderbuffer ? "YES": "NO"));
+    GrPrintf("BGRA is an internal format: %s\n",
+             (fBGRAInternalFormat ? "YES": "NO"));
     GrPrintf("Support texture swizzle: %s\n",
              (fTextureSwizzle ? "YES": "NO"));
 }
diff --git a/src/gpu/GrGpuGL.h b/src/gpu/GrGpuGL.h
index 3f10967..1dd9c80 100644
--- a/src/gpu/GrGpuGL.h
+++ b/src/gpu/GrGpuGL.h
@@ -22,7 +22,6 @@
 #include "SkString.h"
 
 class GrGpuGL : public GrGpu {
-
 public:
     virtual ~GrGpuGL();
 
@@ -30,22 +29,30 @@
     GrGLBinding glBinding() const { return fGLBinding; }
     GrGLVersion glVersion() const { return fGLVersion; }
 
+    virtual bool readPixelsWillPayForYFlip(
+                                    GrRenderTarget* renderTarget,
+                                    int left, int top,
+                                    int width, int height,
+                                    GrPixelConfig config,
+                                    size_t rowBytes) SK_OVERRIDE;
+
 protected:
     GrGpuGL(const GrGLInterface* glInterface, GrGLBinding glBinding);
 
     struct GLCaps {
         GLCaps()
             // make defaults be the most restrictive
-            : fMSFBOType(kNone_MSFBO)
+            : fStencilFormats(8) // prealloc space for    stencil formats
+            , fMSFBOType(kNone_MSFBO)
             , fMaxFragmentUniformVectors(0)
             , fRGBA8Renderbuffer(false)
             , fBGRAFormat(false)
-            , fStencilFormats(8) // prealloc space for    stencil formats
+            , fBGRAInternalFormat(false)
             , fTextureSwizzle(false) {
             memset(fAASamples, 0, sizeof(fAASamples));
         }
         SkTArray<GrGLStencilBuffer::Format, true> fStencilFormats;
-    
+
         enum {
             /**
              * no support for MSAA FBOs
@@ -64,19 +71,24 @@
              */
             kAppleES_MSFBO,
         } fMSFBOType;
-    
+
         // TODO: get rid of GrAALevel and use sample cnt directly
         GrGLuint fAASamples[4];
-    
+
         // The maximum number of fragment uniform vectors (GLES has min. 16).
         int fMaxFragmentUniformVectors;
-    
+
         // ES requires an extension to support RGBA8 in RenderBufferStorage
         bool fRGBA8Renderbuffer;
-    
+
         // Is GL_BGRA supported
         bool fBGRAFormat;
-    
+
+        // Depending on the ES extensions present the BGRA external format may
+        // correspond either a BGRA or RGBA internalFormat. On desktop GL it is
+        // RGBA
+        bool fBGRAInternalFormat;
+
         // GL_ARB_texture_swizzle support
         bool fTextureSwizzle;
     
@@ -148,7 +160,9 @@
                               int left, int top, 
                               int width, int height,
                               GrPixelConfig, 
-                              void* buffer, size_t rowBytes) SK_OVERRIDE;
+                              void* buffer,
+                              size_t rowBytes,
+                              bool invertY) SK_OVERRIDE;
 
     virtual void onGpuDrawIndexed(GrPrimitiveType type,
                                   uint32_t startVertex,
diff --git a/src/gpu/GrGpuGLShaders.cpp b/src/gpu/GrGpuGLShaders.cpp
index e3d2c2e..8e7c25f 100644
--- a/src/gpu/GrGpuGLShaders.cpp
+++ b/src/gpu/GrGpuGLShaders.cpp
@@ -221,6 +221,8 @@
         pdesc.fExperimentalGS = this->getCaps().fGeometryShaderSupport &&
                                 random.nextF() > .5f;
 #endif
+        pdesc.fOutputPM =  static_cast<int>(random.nextF() *
+                                            ProgramDesc::kOutputPMCnt);
 
         bool edgeAA = random.nextF() > .5f;
         if (edgeAA) {
@@ -1008,6 +1010,7 @@
             } else {
                 stage.fInputConfig = StageDesc::kColor_InputConfig;
             }
+
             if (sampler.getFilter() == GrSamplerState::kConvolution_Filter) {
                 stage.fKernelWidth = sampler.getKernelWidth();
             } else {
@@ -1022,6 +1025,12 @@
         }
     }
 
+    if (GrPixelConfigIsUnpremultiplied(fCurrDrawState.fRenderTarget->config())) {
+        desc.fOutputPM = ProgramDesc::kNo_OutputPM;
+    } else {
+        desc.fOutputPM = ProgramDesc::kYes_OutputPM;
+    }
+
     desc.fDualSrcOutput = ProgramDesc::kNone_DualSrcOutput;
 
     // currently the experimental GS will only work with triangle prims
diff --git a/src/gpu/SkGpuDevice.cpp b/src/gpu/SkGpuDevice.cpp
index 730f631..031333b 100644
--- a/src/gpu/SkGpuDevice.cpp
+++ b/src/gpu/SkGpuDevice.cpp
@@ -116,9 +116,10 @@
         case kRGBA_4444_GrPixelConfig:
             *isOpaque = false;
             return SkBitmap::kARGB_4444_Config;
-        case kRGBA_8888_GrPixelConfig:
-        case kRGBX_8888_GrPixelConfig:
-            *isOpaque = (kRGBX_8888_GrPixelConfig == config);
+        case kSkia8888_PM_GrPixelConfig:
+            // we don't currently have a way of knowing whether
+            // a 8888 is opaque based on the config.
+            *isOpaque = false;
             return SkBitmap::kARGB_8888_Config;
         default:
             *isOpaque = false;
@@ -256,23 +257,43 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+namespace {
+GrPixelConfig config8888_to_gr_config(SkCanvas::Config8888 config8888) {
+    switch (config8888) {
+        case SkCanvas::kNative_Premul_Config8888:
+            return kSkia8888_PM_GrPixelConfig;
+        case SkCanvas::kNative_Unpremul_Config8888:
+            return kSkia8888_UPM_GrPixelConfig;
+        case SkCanvas::kBGRA_Premul_Config8888:
+            return kBGRA_8888_PM_GrPixelConfig;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            return kBGRA_8888_UPM_GrPixelConfig;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            return kRGBA_8888_PM_GrPixelConfig;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            return kRGBA_8888_UPM_GrPixelConfig;
+        default:
+            GrCrash("Unexpected Config8888.");
+            return kSkia8888_PM_GrPixelConfig;
+    }
+}
+}
+
 bool SkGpuDevice::onReadPixels(const SkBitmap& bitmap,
                                int x, int y,
                                SkCanvas::Config8888 config8888) {
-    // support for non-native configs coming soon
-    if (config8888 != SkCanvas::kNative_Premul_Config8888) {
-        return false;
-    }
     SkASSERT(SkBitmap::kARGB_8888_Config == bitmap.config());
     SkASSERT(!bitmap.isNull());
     SkASSERT(SkIRect::MakeWH(this->width(), this->height()).contains(SkIRect::MakeXYWH(x, y, bitmap.width(), bitmap.height())));
 
     SkAutoLockPixels alp(bitmap);
+    GrPixelConfig config;
+    config = config8888_to_gr_config(config8888);
     return fContext->readRenderTargetPixels(fRenderTarget,
                                             x, y,
                                             bitmap.width(),
                                             bitmap.height(),
-                                            kRGBA_8888_GrPixelConfig,
+                                            config,
                                             bitmap.getPixels(),
                                             bitmap.rowBytes());
 }
@@ -739,7 +760,7 @@
         srcRect.height(),
         // We actually only need A8, but it often isn't supported as a
         // render target
-        kRGBA_8888_GrPixelConfig
+        kRGBA_8888_PM_GrPixelConfig
     };
 
     GrAutoScratchTexture srcEntry(context, desc);
diff --git a/src/gpu/SkGr.cpp b/src/gpu/SkGr.cpp
index bbb55c8..6fe8292 100644
--- a/src/gpu/SkGr.cpp
+++ b/src/gpu/SkGr.cpp
@@ -207,11 +207,7 @@
         case SkBitmap::kARGB_4444_Config:
             return kRGBA_4444_GrPixelConfig;
         case SkBitmap::kARGB_8888_Config:
-            if (isOpaque) {
-                return kRGBX_8888_GrPixelConfig;
-            } else {
-                return kRGBA_8888_GrPixelConfig;
-            }
+            return kSkia8888_PM_GrPixelConfig;
         default:
             return kUnknown_GrPixelConfig;
     }
diff --git a/src/gpu/SkGrTexturePixelRef.cpp b/src/gpu/SkGrTexturePixelRef.cpp
index 40db1c5..140993a 100644
--- a/src/gpu/SkGrTexturePixelRef.cpp
+++ b/src/gpu/SkGrTexturePixelRef.cpp
@@ -78,7 +78,7 @@
         SkAutoLockPixels al(*dst);
         void* buffer = dst->getPixels();
         return fTexture->readPixels(left, top, width, height,
-                                    kRGBA_8888_GrPixelConfig,
+                                    kSkia8888_PM_GrPixelConfig,
                                     buffer);
     } else {
         return false;
@@ -122,7 +122,7 @@
         SkAutoLockPixels al(*dst);
         void* buffer = dst->getPixels();
         return fRenderTarget->readPixels(left, top, width, height,
-                                         kRGBA_8888_GrPixelConfig,
+                                         kSkia8888_PM_GrPixelConfig,
                                          buffer);
     } else {
         return false;