Do premul and r/b swap conversions in a custom effect

Review URL: https://codereview.appspot.com/6473060/



git-svn-id: http://skia.googlecode.com/svn/trunk@5284 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gyp/gpu.gypi b/gyp/gpu.gypi
index dd70cf0..af8bfe7 100644
--- a/gyp/gpu.gypi
+++ b/gyp/gpu.gypi
@@ -124,6 +124,8 @@
       '<(skia_src_path)/gpu/effects/GrTextureStripAtlas.cpp',
       '<(skia_src_path)/gpu/effects/GrColorTableEffect.cpp',
       '<(skia_src_path)/gpu/effects/GrColorTableEffect.h',
+      '<(skia_src_path)/gpu/effects/GrConfigConversionEffect.cpp',
+      '<(skia_src_path)/gpu/effects/GrConfigConversionEffect.h',
       '<(skia_src_path)/gpu/effects/GrConvolutionEffect.cpp',
       '<(skia_src_path)/gpu/effects/GrConvolutionEffect.h',
       '<(skia_src_path)/gpu/effects/GrSingleTextureEffect.cpp',
diff --git a/include/gpu/GrContext.h b/include/gpu/GrContext.h
index 8ee326c..7ee1dd3 100644
--- a/include/gpu/GrContext.h
+++ b/include/gpu/GrContext.h
@@ -609,12 +609,13 @@
     class AutoRenderTarget : ::GrNoncopyable {
     public:
         AutoRenderTarget(GrContext* context, GrRenderTarget* target) {
-            fContext = NULL;
             fPrevTarget = context->getRenderTarget();
-            if (fPrevTarget != target) {
-                context->setRenderTarget(target);
-                fContext = context;
-            }
+            context->setRenderTarget(target);
+            fContext = context;
+        }
+        AutoRenderTarget(GrContext* context) {
+            fPrevTarget = context->getRenderTarget();
+            fContext = context;
         }
         ~AutoRenderTarget() {
             if (fContext) {
@@ -631,14 +632,31 @@
      */
     class AutoMatrix : GrNoncopyable {
     public:
+        enum InitialMatrix {
+            kPreserve_InitialMatrix,
+            kIdentity_InitialMatrix,
+        };
+
         AutoMatrix() : fContext(NULL) {}
-        AutoMatrix(GrContext* ctx) : fContext(ctx) {
+
+        AutoMatrix(GrContext* ctx, InitialMatrix initialState) : fContext(ctx) {
             fMatrix = ctx->getMatrix();
+            switch (initialState) {
+                case kPreserve_InitialMatrix:
+                    break;
+                case kIdentity_InitialMatrix:
+                    ctx->setMatrix(GrMatrix::I());
+                    break;
+                default:
+                    GrCrash("Unexpected initial matrix state");
+            }
         }
+
         AutoMatrix(GrContext* ctx, const GrMatrix& matrix) : fContext(ctx) {
             fMatrix = ctx->getMatrix();
             ctx->setMatrix(matrix);
         }
+
         void set(GrContext* ctx) {
             if (NULL != fContext) {
                 fContext->setMatrix(fMatrix);
@@ -646,6 +664,7 @@
             fMatrix = ctx->getMatrix();
             fContext = ctx;
         }
+
         void set(GrContext* ctx, const GrMatrix& matrix) {
             if (NULL != fContext) {
                 fContext->setMatrix(fMatrix);
@@ -654,6 +673,7 @@
             ctx->setMatrix(matrix);
             fContext = ctx;
         }
+
         ~AutoMatrix() {
             if (NULL != fContext) {
                 fContext->setMatrix(fMatrix);
@@ -667,6 +687,21 @@
 
     class AutoClip : GrNoncopyable {
     public:
+        // This enum exists to require a caller of the constructor to acknowledge that the clip will
+        // initially be wide open. It also could be extended if there are other desirable initial
+        // clip states.
+        enum InitialClip {
+            kWideOpen_InitialClip,
+        };
+
+        AutoClip(GrContext* context, InitialClip initialState) : fContext(context) {
+            GrAssert(kWideOpen_InitialClip == initialState);
+            // fNewClipStack stays default-constructed; point the new clip data at it and install it.
+            fNewClipData.fClipStack = &fNewClipStack;
+            fOldClip = context->getClip();
+            context->setClip(&fNewClipData);
+        }
+
         AutoClip(GrContext* context, const GrRect& newClipRect)
         : fContext(context)
         , fNewClipStack(newClipRect) {
@@ -689,6 +724,19 @@
         GrClipData        fNewClipData;
     };
 
+    // Bundles a wide-open AutoClip, an AutoRenderTarget for rt, and an identity AutoMatrix.
+    class AutoWideOpenIdentityDraw {
+    public:
+        AutoWideOpenIdentityDraw(GrContext* ctx, GrRenderTarget* rt)
+            : fAutoClip(ctx, AutoClip::kWideOpen_InitialClip)
+            , fAutoRT(ctx, rt)
+            , fAutoMatrix(ctx, AutoMatrix::kIdentity_InitialMatrix) {}
+    private:
+        AutoClip fAutoClip;
+        AutoRenderTarget fAutoRT;
+        AutoMatrix fAutoMatrix;
+    };
+
     ///////////////////////////////////////////////////////////////////////////
     // Functions intended for internal use only.
     GrGpu* getGpu() { return fGpu; }
@@ -742,6 +790,10 @@
 
     GrAARectRenderer*           fAARectRenderer;
 
+    bool                        fDidTestPMConversions;
+    int                         fPMToUPMConversion;
+    int                         fUPMToPMConversion;
+
     GrContext(GrGpu* gpu);
 
     void setupDrawBuffer();
@@ -771,6 +823,9 @@
     // for use with textures released from an GrAutoScratchTexture.
     void addExistingTextureToCache(GrTexture* texture);
 
+    GrCustomStage* createPMToUPMEffect(GrTexture* texture, bool swapRAndB);
+    GrCustomStage* createUPMToPMEffect(GrTexture* texture, bool swapRAndB);
+
     typedef GrRefCnt INHERITED;
 };
 
diff --git a/include/gpu/GrSamplerState.h b/include/gpu/GrSamplerState.h
index 12c0937..e8182f9 100644
--- a/include/gpu/GrSamplerState.h
+++ b/include/gpu/GrSamplerState.h
@@ -121,7 +121,6 @@
         // memcpy() breaks refcounting
         fTextureParams = s.fTextureParams;
         fMatrix = s.fMatrix;
-        fSwapRAndB = s.fSwapRAndB;
 
         GrSafeAssign(fCustomStage, s.fCustomStage);
 
@@ -129,8 +128,6 @@
     }
 
     const GrMatrix& getMatrix() const { return fMatrix; }
-    bool swapsRAndB() const { return fSwapRAndB; }
-    bool premultiply() const { return fPremultiply; }
 
     GrTextureParams* textureParams() { return &fTextureParams; }
     const GrTextureParams& getTextureParams() const { return fTextureParams; }
@@ -141,18 +138,6 @@
     GrMatrix* matrix() { return &fMatrix; }
 
     /**
-     * Swaps the R and B components when reading from the texture. Has no effect
-     * if the texture is alpha only.
-     */
-    void setRAndBSwap(bool swap) { fSwapRAndB = swap; }
-
-    /**
-     * If the texture is RGBA/BGRA 8888 config then its rgb components will be
-     * multiplied by its a component after the texture read.
-     **/
-    void setPremultiply(bool premul) { fPremultiply = premul; }
-
-    /**
      *  Multiplies the current sampler matrix  a matrix
      *
      *  After this call M' = M*m where M is the old matrix, m is the parameter
@@ -169,8 +154,6 @@
                const GrMatrix& matrix) {
         fTextureParams.reset(tileXAndY, filter);
         fMatrix = matrix;
-        fSwapRAndB = false;
-        fPremultiply = false;
         GrSafeSetNull(fCustomStage);
     }
     void reset(SkShader::TileMode wrapXAndY, bool filter) {
@@ -191,8 +174,6 @@
 
 private:
     GrTextureParams     fTextureParams;
-    bool                fSwapRAndB;
-    bool                fPremultiply; // temporary, will be replaced soon by a custom stage.
     GrMatrix            fMatrix;
 
     GrCustomStage*      fCustomStage;
diff --git a/src/gpu/GrContext.cpp b/src/gpu/GrContext.cpp
index d1fdb18..2b5014d 100644
--- a/src/gpu/GrContext.cpp
+++ b/src/gpu/GrContext.cpp
@@ -11,6 +11,7 @@
 
 #include "effects/GrConvolutionEffect.h"
 #include "effects/GrSingleTextureEffect.h"
+#include "effects/GrConfigConversionEffect.h"
 
 #include "GrBufferAllocPool.h"
 #include "GrGpu.h"
@@ -1235,6 +1236,25 @@
             return false;
     }
 }
+
+// Maps an 8888 config to the config whose R and B byte positions are swapped, keeping its
+// premul/unpremul state. Call this only with a result of grconfig_to_config8888; any other
+// value reaches GrCrash.
+SkCanvas::Config8888 swap_config8888_red_and_blue(SkCanvas::Config8888 config8888) {
+    switch (config8888) {
+        case SkCanvas::kBGRA_Premul_Config8888:
+            return SkCanvas::kRGBA_Premul_Config8888;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            return SkCanvas::kRGBA_Unpremul_Config8888;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            return SkCanvas::kBGRA_Premul_Config8888;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            return SkCanvas::kBGRA_Unpremul_Config8888;
+        default:
+            GrCrash("Unexpected input");
+            return SkCanvas::kBGRA_Unpremul_Config8888; // only reached if GrCrash returns
+    }
+}
 }
 
 bool GrContext::readRenderTargetPixels(GrRenderTarget* target,
@@ -1255,67 +1275,53 @@
         this->flush();
     }
 
-    if ((kUnpremul_PixelOpsFlag & flags) &&
-        !fGpu->canPreserveReadWriteUnpremulPixels()) {
+    // Determine which conversions have to be applied: flipY, swapRAndB, and/or unpremul.
 
-        SkCanvas::Config8888 srcConfig8888, dstConfig8888;
-        if (!grconfig_to_config8888(target->config(), false, &srcConfig8888) ||
-            !grconfig_to_config8888(config, true, &dstConfig8888)) {
-            return false;
-        }
-        // do read back using target's own config
-        this->readRenderTargetPixels(target,
-                                     left, top,
-                                     width, height,
-                                     target->config(),
-                                     buffer, rowBytes,
-                                     kDontFlush_PixelOpsFlag); // we already flushed
-        // sw convert the pixels to unpremul config
-        uint32_t* pixels = reinterpret_cast<uint32_t*>(buffer);
-        SkConvertConfig8888Pixels(pixels, rowBytes, dstConfig8888,
-                                  pixels, rowBytes, srcConfig8888,
-                                  width, height);
-        return true;
-    }
-
-    GrTexture* src = target->asTexture();
-    bool swapRAndB = NULL != src &&
-                     fGpu->preferredReadPixelsConfig(config) ==
-                     GrPixelConfigSwapRAndB(config);
-
-    bool flipY = NULL != src &&
-                 fGpu->readPixelsWillPayForYFlip(target, left, top,
+    // If fGpu->readPixels would incur a y-flip cost then we will read the pixels upside down. We'll
+    // either do the flipY by drawing into a scratch with a matrix or on the cpu after the read.
+    bool flipY = fGpu->readPixelsWillPayForYFlip(target, left, top,
                                                  width, height, config,
                                                  rowBytes);
+    bool swapRAndB = fGpu->preferredReadPixelsConfig(config) == GrPixelConfigSwapRAndB(config);
+
     bool unpremul = SkToBool(kUnpremul_PixelOpsFlag & flags);
 
-    if (NULL == src && unpremul) {
-        // we should fallback to cpu conversion here. This could happen when
-        // we were given an external render target by the client that is not
-        // also a texture (e.g. FBO 0 in GL)
+    // flipY will get set to false when it is handled below using a scratch. However, in that case
+    // we still want to do the read upside down.
+    bool readUpsideDown = flipY;
+
+    if (unpremul && kRGBA_8888_GrPixelConfig != config && kBGRA_8888_GrPixelConfig != config) {
+        // The unpremul flag is only allowed for these two configs.
         return false;
     }
-    // we draw to a scratch texture if any of these conversion are applied
+
+    GrPixelConfig readConfig;
+    if (swapRAndB) {
+        readConfig = GrPixelConfigSwapRAndB(config);
+        GrAssert(kUnknown_GrPixelConfig != config);
+    } else {
+        readConfig = config;
+    }
+
+    // If the src is a texture and we would have to do conversions after read pixels, we instead
+    // do the conversions by drawing the src to a scratch texture. If we handle any of the
+    // conversions in the draw we set the corresponding bool to false so that we don't reapply it
+    // on the read back pixels.
+    GrTexture* src = target->asTexture();
     GrAutoScratchTexture ast;
-    if (flipY || swapRAndB || unpremul) {
-        GrAssert(NULL != src);
-        if (swapRAndB) {
-            config = GrPixelConfigSwapRAndB(config);
-            GrAssert(kUnknown_GrPixelConfig != config);
-        }
-        // Make the scratch a render target because we don't have a robust
-        // readTexturePixels as of yet (it calls this function).
+    if (NULL != src && (swapRAndB || unpremul || flipY)) {
+        // Make the scratch a render target because we don't have a robust readTexturePixels as of
+        // yet. It calls this function.
         GrTextureDesc desc;
         desc.fFlags = kRenderTarget_GrTextureFlagBit;
         desc.fWidth = width;
         desc.fHeight = height;
-        desc.fConfig = config;
+        desc.fConfig = readConfig;
 
-        // When a full readback is faster than a partial we could always make
-        // the scratch exactly match the passed rect. However, if we see many
-        // different size rectangles we will trash our texture cache and pay the
-        // cost of creating and destroying many textures. So, we only request
-        // an exact match when the caller is reading an entire RT.
+        // When a full readback is faster than a partial we could always make the scratch exactly
+        // match the passed rect. However, if we see many different size rectangles we will trash
+        // our texture cache and pay the cost of creating and destroying many textures. So, we only
+        // request an exact match when the caller is reading an entire RT.
         ScratchTexMatch match = kApprox_ScratchTexMatch;
         if (0 == left &&
             0 == top &&
@@ -1326,42 +1332,104 @@
         }
         ast.set(this, desc, match);
         GrTexture* texture = ast.texture();
-        if (!texture) {
-            return false;
-        }
-        target = texture->asRenderTarget();
-        GrAssert(NULL != target);
+        if (texture) {
+            SkAutoTUnref<GrCustomStage> stage;
+            if (unpremul) {
+                stage.reset(this->createPMToUPMEffect(src, swapRAndB));
+            }
+            // If we failed to create a PM->UPM effect and have no other conversions to perform then
+            // there is no longer any point to using the scratch.
+            if (NULL != stage || flipY || swapRAndB) {
+                if (NULL == stage) {
+                    stage.reset(GrConfigConversionEffect::Create(src, swapRAndB));
+                    GrAssert(NULL != stage);
+                } else {
+                    unpremul = false; // we will handle the UPM conversion in the draw
+                }
+                swapRAndB = false; // we will handle the swap in the draw.
 
-        GrDrawTarget::AutoStateRestore asr(fGpu,
-                                           GrDrawTarget::kReset_ASRInit);
-        GrDrawState* drawState = fGpu->drawState();
-        drawState->setRenderTarget(target);
-
-        if (unpremul) {
-            drawState->enableState(GrDrawState::kUnpremultiply_StageBit);
+                GrDrawTarget::AutoStateRestore asr(fGpu, GrDrawTarget::kReset_ASRInit);
+                GrDrawState* drawState = fGpu->drawState();
+                drawState->setRenderTarget(texture->asRenderTarget());
+                GrMatrix matrix;
+                if (flipY) {
+                    matrix.setTranslate(SK_Scalar1 * left,
+                                        SK_Scalar1 * (top + height));
+                    matrix.set(GrMatrix::kMScaleY, -GR_Scalar1);
+                    flipY = false; // the y flip will be handled in the draw
+                } else {
+                    matrix.setTranslate(SK_Scalar1 *left, SK_Scalar1 *top);
+                }
+                matrix.postIDiv(src->width(), src->height());
+                drawState->sampler(0)->reset(matrix);
+                drawState->sampler(0)->setCustomStage(stage);
+                GrRect rect = GrRect::MakeWH(GrIntToScalar(width), GrIntToScalar(height));
+                fGpu->drawSimpleRect(rect, NULL);
+                // we want to read back from the scratch's origin
+                left = 0;
+                top = 0;
+                target = texture->asRenderTarget();
+            }
         }
-
-        GrMatrix matrix;
-        if (flipY) {
-            matrix.setTranslate(SK_Scalar1 * left,
-                                SK_Scalar1 * (top + height));
-            matrix.set(GrMatrix::kMScaleY, -GR_Scalar1);
-        } else {
-            matrix.setTranslate(SK_Scalar1 *left, SK_Scalar1 *top);
-        }
-        matrix.postIDiv(src->width(), src->height());
-        drawState->sampler(0)->reset(matrix);
-        drawState->sampler(0)->setRAndBSwap(swapRAndB);
-        drawState->createTextureEffect(0, src);
-        GrRect rect;
-        rect.setXYWH(0, 0, SK_Scalar1 * width, SK_Scalar1 * height);
-        fGpu->drawSimpleRect(rect, NULL);
-        left = 0;
-        top = 0;
     }
-    return fGpu->readPixels(target,
-                            left, top, width, height,
-                            config, buffer, rowBytes, flipY);
+    if (!fGpu->readPixels(target,
+                          left, top, width, height,
+                          readConfig, buffer, rowBytes, readUpsideDown)) {
+        return false;
+    }
+    // Perform any conversions we weren't able to perform using a scratch texture.
+    if (unpremul || swapRAndB || flipY) {
+        SkCanvas::Config8888 srcC8888;
+        SkCanvas::Config8888 dstC8888;
+        bool c8888IsValid = grconfig_to_config8888(config, false, &srcC8888);
+        grconfig_to_config8888(config, unpremul, &dstC8888);
+        if (swapRAndB) {
+            GrAssert(c8888IsValid); // we should only do r/b swap on 8888 configs
+            srcC8888 = swap_config8888_red_and_blue(srcC8888);
+        }
+        if (flipY) {
+            size_t tightRB = width * GrBytesPerPixel(config);
+            if (0 == rowBytes) {
+                rowBytes = tightRB;
+            }
+            SkAutoSTMalloc<256, uint8_t> tempRow(tightRB);
+            intptr_t top = reinterpret_cast<intptr_t>(buffer);
+            intptr_t bot = top + (height - 1) * rowBytes;
+            while (top < bot) {
+                uint32_t* t = reinterpret_cast<uint32_t*>(top);
+                uint32_t* b = reinterpret_cast<uint32_t*>(bot);
+                uint32_t* temp = reinterpret_cast<uint32_t*>(tempRow.get());
+                memcpy(temp, t, tightRB);
+                if (c8888IsValid) {
+                    SkConvertConfig8888Pixels(t, tightRB, dstC8888,
+                                              b, tightRB, srcC8888,
+                                              width, 1);
+                    SkConvertConfig8888Pixels(b, tightRB, dstC8888,
+                                              temp, tightRB, srcC8888,
+                                              width, 1);
+                } else {
+                    memcpy(t, b, tightRB);
+                    memcpy(b, temp, tightRB);
+                }
+                top += rowBytes;
+                bot -= rowBytes;
+            }
+            // The above loop does nothing on the middle row when height is odd.
+            if (top == bot && c8888IsValid && dstC8888 != srcC8888) {
+                uint32_t* mid = reinterpret_cast<uint32_t*>(top);
+                SkConvertConfig8888Pixels(mid, tightRB, dstC8888, mid, tightRB, srcC8888, width, 1);
+            }
+        } else {
+            // if we aren't flipping Y then we have no reason to be here other than doing
+            // conversions for 8888 (r/b swap or upm).
+            GrAssert(c8888IsValid);
+            uint32_t* b32 = reinterpret_cast<uint32_t*>(buffer);
+            SkConvertConfig8888Pixels(b32, rowBytes, dstC8888,
+                                      b32, rowBytes, srcC8888,
+                                      width, height);
+        }
+    }
+    return true;
 }
 
 void GrContext::resolveRenderTarget(GrRenderTarget* target) {
@@ -1415,17 +1483,21 @@
         }
     }
 
-    // TODO: when underlying api has a direct way to do this we should use it
-    // (e.g. glDrawPixels on desktop GL).
+    // TODO: when underlying api has a direct way to do this we should use it (e.g. glDrawPixels on
+    // desktop GL).
+
+    // We will always call some form of writeTexturePixels and we will pass our flags on to it.
+    // Thus, we don't perform a flush here since that call will do it (if the kNoFlush flag isn't
+    // set.)
 
     // If the RT is also a texture and we don't have to premultiply then take the texture path.
     // We expect to be at least as fast or faster since it doesn't use an intermediate texture as
     // we do below.
 
 #if !GR_MAC_BUILD
-    // At least some drivers on the Mac get confused when glTexImage2D is called
-    // on a texture attached to an FBO. The FBO still sees the old image. TODO:
-    // determine what OS versions and/or HW is affected.
+    // At least some drivers on the Mac get confused when glTexImage2D is called on a texture
+    // attached to an FBO. The FBO still sees the old image. TODO: determine what OS versions and/or
+    // HW is affected.
     if (NULL != target->asTexture() && !(kUnpremul_PixelOpsFlag & flags)) {
         this->writeTexturePixels(target->asTexture(),
                                  left, top, width, height,
@@ -1433,48 +1505,59 @@
         return;
     }
 #endif
-    if ((kUnpremul_PixelOpsFlag & flags) &&
-        !fGpu->canPreserveReadWriteUnpremulPixels()) {
-        SkCanvas::Config8888 srcConfig8888, dstConfig8888;
-        if (!grconfig_to_config8888(config, true, &srcConfig8888) ||
-            !grconfig_to_config8888(target->config(), false, &dstConfig8888)) {
-            return;
-        }
-        // allocate a tmp buffer and sw convert the pixels to premul
-        SkAutoSTMalloc<128 * 128, uint32_t> tmpPixels(width * height);
-        const uint32_t* src = reinterpret_cast<const uint32_t*>(buffer);
-        SkConvertConfig8888Pixels(tmpPixels.get(), 4 * width, dstConfig8888,
-                                  src, rowBytes, srcConfig8888,
-                                  width, height);
-        // upload the already premul pixels
-        flags &= ~kUnpremul_PixelOpsFlag;
-        this->writeRenderTargetPixels(target,
-                                      left, top,
-                                      width, height,
-                                      target->config(),
-                                      tmpPixels, 4 * width,
-                                      flags);
-        return;
-    }
+    SkAutoTUnref<GrCustomStage> stage;
+    bool swapRAndB = (fGpu->preferredReadPixelsConfig(config) == GrPixelConfigSwapRAndB(config));
 
-    bool swapRAndB = fGpu->preferredReadPixelsConfig(config) ==
-                     GrPixelConfigSwapRAndB(config);
+    GrPixelConfig textureConfig;
     if (swapRAndB) {
-        config = GrPixelConfigSwapRAndB(config);
+        textureConfig = GrPixelConfigSwapRAndB(config);
+    } else {
+        textureConfig = config;
     }
 
     GrTextureDesc desc;
     desc.fWidth = width;
     desc.fHeight = height;
-    desc.fConfig = config;
-
+    desc.fConfig = textureConfig;
     GrAutoScratchTexture ast(this, desc);
     GrTexture* texture = ast.texture();
     if (NULL == texture) {
         return;
     }
-    this->writeTexturePixels(texture, 0, 0, width, height,
-                             config, buffer, rowBytes, flags & ~kUnpremul_PixelOpsFlag);
+    // allocate a tmp buffer and sw convert the pixels to premul
+    SkAutoSTMalloc<128 * 128, uint32_t> tmpPixels(0);
+
+    if (kUnpremul_PixelOpsFlag & flags) {
+        if (kRGBA_8888_GrPixelConfig != config && kBGRA_8888_GrPixelConfig != config) {
+            return;
+        }
+        stage.reset(this->createUPMToPMEffect(texture, swapRAndB));
+        if (NULL == stage) {
+            SkCanvas::Config8888 srcConfig8888, dstConfig8888;
+            GR_DEBUGCODE(bool success = )
+            grconfig_to_config8888(config, true, &srcConfig8888);
+            GrAssert(success);
+            GR_DEBUGCODE(success = )
+            grconfig_to_config8888(config, false, &dstConfig8888);
+            GrAssert(success);
+            const uint32_t* src = reinterpret_cast<const uint32_t*>(buffer);
+            tmpPixels.reset(width * height);
+            SkConvertConfig8888Pixels(tmpPixels.get(), 4 * width, dstConfig8888,
+                                      src, rowBytes, srcConfig8888,
+                                      width, height);
+            buffer = tmpPixels.get();
+            rowBytes = 4 * width;
+        }
+    }
+    if (NULL == stage) {
+        stage.reset(GrConfigConversionEffect::Create(texture, swapRAndB));
+        GrAssert(NULL != stage);
+    }
+
+    this->writeTexturePixels(texture,
+                             0, 0, width, height,
+                             textureConfig, buffer, rowBytes,
+                             flags & ~kUnpremul_PixelOpsFlag);
 
     GrDrawTarget::AutoStateRestore  asr(fGpu, GrDrawTarget::kReset_ASRInit);
     GrDrawState* drawState = fGpu->drawState();
@@ -1486,20 +1569,9 @@
 
     matrix.setIDiv(texture->width(), texture->height());
     drawState->sampler(0)->reset(matrix);
-    drawState->createTextureEffect(0, texture);
-    drawState->sampler(0)->setRAndBSwap(swapRAndB);
-    drawState->sampler(0)->setPremultiply(SkToBool(kUnpremul_PixelOpsFlag & flags));
+    drawState->sampler(0)->setCustomStage(stage);
 
-    static const GrVertexLayout layout = 0;
-    static const int VCOUNT = 4;
-    // TODO: Use GrGpu::drawRect here
-    GrDrawTarget::AutoReleaseGeometry geo(fGpu, layout, VCOUNT, 0);
-    if (!geo.succeeded()) {
-        GrPrintf("Failed to get space for vertices!\n");
-        return;
-    }
-    ((GrPoint*)geo.vertices())->setIRectFan(0, 0, width, height);
-    fGpu->drawNonIndexed(kTriangleFan_GrPrimitiveType, 0, VCOUNT);
+    fGpu->drawSimpleRect(GrRect::MakeWH(SkIntToScalar(width), SkIntToScalar(height)), NULL);
 }
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -1673,6 +1745,8 @@
 
     fAARectRenderer = SkNEW(GrAARectRenderer);
 
+    fDidTestPMConversions = false;
+
     this->setupDrawBuffer();
 }
 
@@ -1710,6 +1784,42 @@
     return fGpu->getQuadIndexBuffer();
 }
 
+namespace {
+// Runs TestForPreservingPMConversions on ctx and writes each PMConversion result out as an int.
+void test_pm_conversions(GrContext* ctx, int* pmToUPMValue, int* upmToPMValue) {
+    GrConfigConversionEffect::PMConversion toUPM, toPM;
+    GrConfigConversionEffect::TestForPreservingPMConversions(ctx, &toUPM, &toPM);
+    *pmToUPMValue = toUPM;
+    *upmToPMValue = toPM;
+}
+}
+
+// Returns a stage that applies the tested PM->UPM conversion to texture (optionally swapping R
+// and B), or NULL if that conversion is kNone_PMConversion.
+GrCustomStage* GrContext::createPMToUPMEffect(GrTexture* texture, bool swapRAndB) {
+    if (!fDidTestPMConversions) {
+        test_pm_conversions(this, &fPMToUPMConversion, &fUPMToPMConversion);
+        fDidTestPMConversions = true; // both results are now stored; do not rerun the test
+    }
+    GrConfigConversionEffect::PMConversion pmToUPM =
+        static_cast<GrConfigConversionEffect::PMConversion>(fPMToUPMConversion);
+    if (GrConfigConversionEffect::kNone_PMConversion == pmToUPM) { return NULL; }
+    return GrConfigConversionEffect::Create(texture, swapRAndB, pmToUPM);
+}
+
+// Returns a stage that applies the tested UPM->PM conversion to texture (optionally swapping R
+// and B), or NULL if that conversion is kNone_PMConversion.
+GrCustomStage* GrContext::createUPMToPMEffect(GrTexture* texture, bool swapRAndB) {
+    if (!fDidTestPMConversions) {
+        test_pm_conversions(this, &fPMToUPMConversion, &fUPMToPMConversion);
+        fDidTestPMConversions = true; // both results are now stored; do not rerun the test
+    }
+    GrConfigConversionEffect::PMConversion upmToPM =
+        static_cast<GrConfigConversionEffect::PMConversion>(fUPMToPMConversion);
+    if (GrConfigConversionEffect::kNone_PMConversion == upmToPM) { return NULL; }
+    return GrConfigConversionEffect::Create(texture, swapRAndB, upmToPM);
+}
+
 GrTexture* GrContext::gaussianBlur(GrTexture* srcTexture,
                                    bool canClobberSrc,
                                    const SkRect& rect,
diff --git a/src/gpu/GrDrawState.h b/src/gpu/GrDrawState.h
index 1db2e4a..8be76b3 100644
--- a/src/gpu/GrDrawState.h
+++ b/src/gpu/GrDrawState.h
@@ -670,13 +670,6 @@
          */
         kColorMatrix_StateBit   = 0x10,
 
-        /**
-         * The pixels written to the render target should be unpremultiplied.
-         * This flag is temporary and will be removed when this functionality is
-         * captured in a custom stage.
-         */
-         kUnpremultiply_StageBit = 0x20,
-
         // Users of the class may add additional bits to the vector
         kDummyStateBit,
         kLastPublicStateBit = kDummyStateBit-1,
diff --git a/src/gpu/GrGpu.h b/src/gpu/GrGpu.h
index d700de5..fb5c1f5 100644
--- a/src/gpu/GrGpu.h
+++ b/src/gpu/GrGpu.h
@@ -164,18 +164,10 @@
     void forceRenderTargetFlush();
 
     /**
-     * If this returns true then a sequence that reads unpremultiplied pixels
-     * from a surface, writes back the same values, and reads them again will
-     * give the same pixel values back in both reads.
-     */
-    virtual bool canPreserveReadWriteUnpremulPixels() = 0;
-
-    /**
      * readPixels with some configs may be slow. Given a desired config this
      * function returns a fast-path config. The returned config must have the
-     * same components, component sizes, and not require conversion between
-     * pre- and unpremultiplied alpha. The caller is free to ignore the result
-     * and call readPixels with the original config.
+     * same components and component sizes. The caller is free to ignore the
+     * result and call readPixels with the original config.
      */
     virtual GrPixelConfig preferredReadPixelsConfig(GrPixelConfig config)
                                                                         const {
@@ -220,10 +212,7 @@
      virtual bool fullReadPixelsIsFasterThanPartial() const { return false; };
 
     /**
-     * Reads a rectangle of pixels from a render target. Fails if read requires
-     * conversion between premultiplied and unpremultiplied configs. The caller
-     * should do the conversion by rendering to a target with the desire config
-     * first.
+     * Reads a rectangle of pixels from a render target.
      *
      * @param renderTarget  the render target to read from. NULL means the
      *                      current render target.
diff --git a/src/gpu/effects/GrConfigConversionEffect.cpp b/src/gpu/effects/GrConfigConversionEffect.cpp
new file mode 100644
index 0000000..972f07f
--- /dev/null
+++ b/src/gpu/effects/GrConfigConversionEffect.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrConfigConversionEffect.h"
+#include "gl/GrGLProgramStage.h"
+
+// GL implementation of GrConfigConversionEffect. The fragment shader fetches the texel and
+// applies the R/B swap and/or premul conversion captured from the stage at construction.
+class GrGLConfigConversionEffect : public GrGLProgramStage {
+public:
+    GrGLConfigConversionEffect(const GrProgramStageFactory& factory,
+                               const GrCustomStage& s) : INHERITED (factory) {
+        const GrConfigConversionEffect& stage = static_cast<const GrConfigConversionEffect&>(s);
+        fSwapRedAndBlue = stage.swapsRedAndBlue();
+        fPMConversion = stage.pmConversion();
+    }
+
+    virtual void emitVS(GrGLShaderBuilder* builder, const char* vertexCoords) SK_OVERRIDE { }
+    virtual void emitFS(GrGLShaderBuilder* builder, const char* outputColor,
+                        const char* inputColor, const char* samplerName) SK_OVERRIDE {
+        builder->fFSCode.append("\tvec4 tempColor;\n");
+        builder->emitDefaultFetch("tempColor", samplerName);
+        if (GrConfigConversionEffect::kNone_PMConversion == fPMConversion) {
+            GrAssert(fSwapRedAndBlue);
+            builder->fFSCode.appendf("\t%s = tempColor.bgra;\n", outputColor);
+        } else {
+            const char* swiz = fSwapRedAndBlue ? "bgr" : "rgb";
+            switch (fPMConversion) {
+                case GrConfigConversionEffect::kMulByAlpha_RoundUp_PMConversion:
+                    builder->fFSCode.appendf("\t%s = vec4(ceil(tempColor.%s*tempColor.a*255.0)/255.0, tempColor.a);\n",
+                        outputColor, swiz);
+                    break;
+                case GrConfigConversionEffect::kMulByAlpha_RoundDown_PMConversion:
+                    builder->fFSCode.appendf("\t%s = vec4(floor(tempColor.%s*tempColor.a*255.0)/255.0, tempColor.a);\n",
+                        outputColor, swiz);
+                    break;
+                case GrConfigConversionEffect::kDivByAlpha_RoundUp_PMConversion:
+                    builder->fFSCode.appendf("\t%s = tempColor.a <= 0.0 ? vec4(0,0,0,0) : vec4(ceil(tempColor.%s / tempColor.a * 255.0)/255.0, tempColor.a);\n",
+                        outputColor, swiz);
+                    break;
+                case GrConfigConversionEffect::kDivByAlpha_RoundDown_PMConversion:
+                    builder->fFSCode.appendf("\t%s = tempColor.a <= 0.0 ? vec4(0,0,0,0) : vec4(floor(tempColor.%s / tempColor.a * 255.0)/255.0, tempColor.a);\n",
+                        outputColor, swiz);
+                    break;
+                default:
+                    // kNone is handled above; any other value would leave outputColor unwritten.
+                    GrAssert(!"Unknown PM conversion");
+                    break;
+            }
+        }
+    }
+
+    static inline StageKey GenKey(const GrCustomStage& s, const GrGLCaps&) {
+        const GrConfigConversionEffect& stage = static_cast<const GrConfigConversionEffect&>(s);
+        return static_cast<int>(stage.swapsRedAndBlue()) | (stage.pmConversion() << 1);
+    }
+
+private:
+    bool                                    fSwapRedAndBlue;
+    GrConfigConversionEffect::PMConversion  fPMConversion;
+
+    typedef GrGLProgramStage INHERITED;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+GrConfigConversionEffect::GrConfigConversionEffect(GrTexture* texture, bool swapRedAndBlue,
+                                                   PMConversion pmConversion)
+    : GrSingleTextureEffect(texture)
+    , fSwapRedAndBlue(swapRedAndBlue)
+    , fPMConversion(pmConversion) {
+    // Only PM conversions assume 0..255 channels; Create() permits a bare R/B swap on any config.
+    GrAssert(kNone_PMConversion == pmConversion || kRGBA_8888_GrPixelConfig == texture->config() ||
+             kBGRA_8888_GrPixelConfig == texture->config());
+    // A no-op conversion should have been created as a plain GrSingleTextureEffect (see Create).
+    GrAssert(swapRedAndBlue || kNone_PMConversion != pmConversion);
+}
+// Returns the GrTProgramStageFactory instance specialized for this effect class.
+const GrProgramStageFactory& GrConfigConversionEffect::getFactory() const {
+    return GrTProgramStageFactory<GrConfigConversionEffect>::getInstance();
+}
+// Equal when both the R/B swap flag and the PM conversion match; `s` is cast unchecked.
+bool GrConfigConversionEffect::isEqual(const GrCustomStage& s) const {
+    const GrConfigConversionEffect& that = static_cast<const GrConfigConversionEffect&>(s);
+    return fPMConversion == that.fPMConversion && fSwapRedAndBlue == that.fSwapRedAndBlue;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+GR_DEFINE_CUSTOM_STAGE_TEST(GrConfigConversionEffect);
+
+GrCustomStage* GrConfigConversionEffect::TestCreate(SkRandom* random,
+                                                    GrContext* context,
+                                                    GrTexture* textures[]) {
+    // Draw a random conversion first. Without a PM conversion the effect must swap R/B (the
+    // constructor asserts against a no-op effect), so the swap is only randomized otherwise.
+    const PMConversion conversion =
+            static_cast<PMConversion>(random->nextULessThan(kPMConversionCnt));
+    const bool swapRedAndBlue = (kNone_PMConversion == conversion) || random->nextBool();
+
+    // Samples the kSkiaPMTextureIdx entry (presumably the premultiplied test texture).
+    GrTexture* texture = textures[GrCustomStageUnitTest::kSkiaPMTextureIdx];
+    return SkNEW_ARGS(GrConfigConversionEffect, (texture, swapRedAndBlue, conversion));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GrConfigConversionEffect::TestForPreservingPMConversions(GrContext* context,
+                                                              PMConversion* pmToUPMRule,
+                                                              PMConversion* upmToPMRule) {
+    *pmToUPMRule = kNone_PMConversion;  // both outputs stay kNone on any early return below
+    *upmToPMRule = kNone_PMConversion;
+    SkAutoTMalloc<uint32_t> data(256 * 256 * 3);  // source image followed by two readback buffers
+    uint32_t* srcData = data.get();
+    uint32_t* firstRead = data.get() + 256 * 256;
+    uint32_t* secondRead = data.get() + 2 * 256 * 256;
+
+    // Fill with every possible premultiplied A, color channel value. There will be 256-y duplicate
+    // values in row y. We set r, g, and b to the same value since they are handled identically.
+    for (int y = 0; y < 256; ++y) {
+        for (int x = 0; x < 256; ++x) {
+            uint8_t* color = reinterpret_cast<uint8_t*>(&srcData[256*y + x]);
+            color[3] = y;
+            color[2] = GrMin(x, y);
+            color[1] = GrMin(x, y);
+            color[0] = GrMin(x, y);
+        }
+    }
+
+    GrTextureDesc desc;
+    desc.fFlags = kRenderTarget_GrTextureFlagBit |
+                  kNoStencil_GrTextureFlagBit;
+    desc.fWidth = 256;
+    desc.fHeight = 256;
+    desc.fConfig = kRGBA_8888_GrPixelConfig;
+    // readTex and tempTex are the render targets of the conversion draws below.
+    SkAutoTUnref<GrTexture> readTex(context->createUncachedTexture(desc, NULL, 0));
+    if (!readTex.get()) {
+        return;
+    }
+    SkAutoTUnref<GrTexture> tempTex(context->createUncachedTexture(desc, NULL, 0));
+    if (!tempTex.get()) {
+        return;
+    }
+    desc.fFlags = kNone_GrTextureFlags;  // dataTex is only sampled from, never rendered to
+    SkAutoTUnref<GrTexture> dataTex(context->createUncachedTexture(desc, data, 0));
+    if (!dataTex.get()) {
+        return;
+    }
+    // Candidate {PM->UPM, UPM->PM} rule pairs, each combining opposite rounding directions.
+    static const PMConversion kConversionRules[][2] = {
+        {kDivByAlpha_RoundDown_PMConversion, kMulByAlpha_RoundUp_PMConversion},
+        {kDivByAlpha_RoundUp_PMConversion, kMulByAlpha_RoundDown_PMConversion},
+    };
+
+    GrContext::AutoWideOpenIdentityDraw awoid(context, NULL);
+
+    bool failed = true;
+    // Try each pair in order and stop at the first one whose two readbacks agree.
+    for (size_t i = 0; i < GR_ARRAY_COUNT(kConversionRules) && failed; ++i) {
+        *pmToUPMRule = kConversionRules[i][0];
+        *upmToPMRule = kConversionRules[i][1];
+
+        static const GrRect kDstRect = GrRect::MakeWH(GrIntToScalar(256), GrIntToScalar(256));
+        static const GrRect kSrcRect = GrRect::MakeWH(GR_Scalar1, GR_Scalar1);
+        // We do a PM->UPM draw from dataTex to readTex and read the data. Then we do a UPM->PM draw
+        // from readTex to tempTex followed by a PM->UPM draw to readTex and finally read the data.
+        // We then verify that two reads produced the same values.
+
+        GrPaint paint;
+        paint.reset();
+
+        SkAutoTUnref<GrCustomStage> pmToUPMStage1(SkNEW_ARGS(GrConfigConversionEffect,
+                                                             (dataTex, false, *pmToUPMRule)));
+        SkAutoTUnref<GrCustomStage> upmToPMStage(SkNEW_ARGS(GrConfigConversionEffect,
+                                                            (readTex, false, *upmToPMRule)));
+        SkAutoTUnref<GrCustomStage> pmToUPMStage2(SkNEW_ARGS(GrConfigConversionEffect,
+                                                             (tempTex, false, *pmToUPMRule)));
+        // First pass: dataTex --PM->UPM--> readTex, then read back into firstRead.
+        context->setRenderTarget(readTex->asRenderTarget());
+        paint.textureSampler(0)->setCustomStage(pmToUPMStage1);
+        context->drawRectToRect(paint, kDstRect, kSrcRect);
+
+        readTex->readPixels(0, 0, 256, 256, kRGBA_8888_GrPixelConfig, firstRead);
+        // Second pass: readTex --UPM->PM--> tempTex --PM->UPM--> readTex, read into secondRead.
+        context->setRenderTarget(tempTex->asRenderTarget());
+        paint.textureSampler(0)->setCustomStage(upmToPMStage);
+        context->drawRectToRect(paint, kDstRect, kSrcRect);
+        context->setRenderTarget(readTex->asRenderTarget());
+        paint.textureSampler(0)->setCustomStage(pmToUPMStage2);
+        context->drawRectToRect(paint, kDstRect, kSrcRect);
+
+        readTex->readPixels(0, 0, 256, 256, kRGBA_8888_GrPixelConfig, secondRead);
+        // Only columns x <= y are compared; columns x > y hold the same source value as x == y.
+        failed = false;
+        for (int y = 0; y < 256 && !failed; ++y) {
+            for (int x = 0; x <= y; ++x) {
+                if (firstRead[256 * y + x] != secondRead[256 * y + x]) {
+                    failed = true;
+                    break;
+                }
+            }
+        }
+    }
+    if (failed) {  // no candidate pair round-tripped: report that no preserving rules exist
+        *pmToUPMRule = kNone_PMConversion;
+        *upmToPMRule = kNone_PMConversion;
+    }
+}
+
+GrCustomStage* GrConfigConversionEffect::Create(GrTexture* texture,
+                                                bool swapRedAndBlue,
+                                                PMConversion pmConversion) {
+    const bool wantsPMConversion = kNone_PMConversion != pmConversion;
+    if (!wantsPMConversion && !swapRedAndBlue) {
+        // No conversion was requested. A GrConfigConversionEffect here would be equivalent to a
+        // GrSingleTextureEffect yet produce a redundant shader, so return the simple effect.
+        return SkNEW_ARGS(GrSingleTextureEffect, (texture));
+    }
+
+    const GrPixelConfig config = texture->config();
+    const bool is8888 = kRGBA_8888_GrPixelConfig == config || kBGRA_8888_GrPixelConfig == config;
+    if (!is8888 && wantsPMConversion) {
+        // The PM conversions assume colors are 0..255
+        return NULL;
+    }
+    return SkNEW_ARGS(GrConfigConversionEffect, (texture, swapRedAndBlue, pmConversion));
+}
diff --git a/src/gpu/effects/GrConfigConversionEffect.h b/src/gpu/effects/GrConfigConversionEffect.h
new file mode 100644
index 0000000..9bb7e3a
--- /dev/null
+++ b/src/gpu/effects/GrConfigConversionEffect.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrConfigConversionEffect_DEFINED
+#define GrConfigConversionEffect_DEFINED
+
+#include "GrSingleTextureEffect.h"
+
+class GrGLConfigConversionEffect;
+
+/**
+ * This class is used to perform config conversions. Clients may want to read/write data that is
+ * unpremultiplied. Also on some systems reading/writing BGRA or RGBA is faster. In those cases we
+ * read/write using the faster path and perform an R/B swap in the shader if the client data is in
+ * the slower config.
+ */
+class GrConfigConversionEffect : public GrSingleTextureEffect {
+public:
+    /**
+     * The PM->UPM or UPM->PM conversions to apply.
+     */
+    enum PMConversion {
+        kNone_PMConversion = 0,              // no premul conversion (R/B swap only)
+        kMulByAlpha_RoundUp_PMConversion,    // rgb * a, rounded up to a multiple of 1/255
+        kMulByAlpha_RoundDown_PMConversion,  // rgb * a, rounded down to a multiple of 1/255
+        kDivByAlpha_RoundUp_PMConversion,    // rgb / a rounded up; a <= 0 gives transparent black
+        kDivByAlpha_RoundDown_PMConversion,  // rgb / a rounded down; a <= 0 gives transparent black
+
+        kPMConversionCnt                     // number of values above; not itself a conversion
+    };
+
+    // Returns NULL if the config is not 8888 and a PM conversion is requested.
+    static GrCustomStage* Create(GrTexture*,
+                                 bool swapRedAndBlue,
+                                 PMConversion pmConversion = kNone_PMConversion);
+
+    static const char* Name() { return "Config Conversion"; }
+    typedef GrGLConfigConversionEffect GLProgramStage;
+
+    virtual const GrProgramStageFactory& getFactory() const SK_OVERRIDE;
+    virtual bool isEqual(const GrCustomStage&) const SK_OVERRIDE;
+
+    bool swapsRedAndBlue() const { return fSwapRedAndBlue; }
+    PMConversion  pmConversion() const { return fPMConversion; }
+
+    // Determines whether PM->UPM and UPM->PM conversions can be chosen such that in any
+    // PM->UPM->PM->UPM sequence the two UPM values are the same, i.e. reading pixels back as UPM,
+    // writing them to the GPU as PM, and reading again yields the same result both times. If no
+    // such pair is found both rules are set to kNone_PMConversion. This test is quite expensive
+    // and should not be run multiple times for a given context.
+    static void TestForPreservingPMConversions(GrContext* context,
+                                               PMConversion* PMToUPMRule,
+                                               PMConversion* UPMToPMRule);
+
+private:
+    GrConfigConversionEffect(GrTexture*,
+                            bool swapRedAndBlue,
+                            PMConversion pmConversion);
+
+    bool            fSwapRedAndBlue;
+    PMConversion    fPMConversion;
+
+    GR_DECLARE_CUSTOM_STAGE_TEST;
+
+    typedef GrSingleTextureEffect INHERITED;
+};
+
+#endif
diff --git a/src/gpu/gl/GrGLProgram.cpp b/src/gpu/gl/GrGLProgram.cpp
index bd406f4..a82e527 100644
--- a/src/gpu/gl/GrGLProgram.cpp
+++ b/src/gpu/gl/GrGLProgram.cpp
@@ -879,21 +879,6 @@
                             inCoverage.c_str(),
                             &builder.fFSCode);
         }
-        if (Desc::kUnpremultiplied_RoundDown_OutputConfig == fDesc.fOutputConfig) {
-            builder.fFSCode.appendf("\t%s = %s.a <= 0.0 ? vec4(0,0,0,0) : vec4(floor(%s.rgb / %s.a * 255.0)/255.0, %s.a);\n",
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str());
-        } else if (Desc::kUnpremultiplied_RoundUp_OutputConfig == fDesc.fOutputConfig) {
-            builder.fFSCode.appendf("\t%s = %s.a <= 0.0 ? vec4(0,0,0,0) : vec4(ceil(%s.rgb / %s.a * 255.0)/255.0, %s.a);\n",
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str(),
-                                    colorOutput.getName().c_str());
-        }
     }
 
     builder.fVSCode.append("}\n");
@@ -1083,40 +1068,6 @@
     builder->computeSwizzle(desc.fInConfigFlags);
     builder->computeModulate(fsInColor);
 
-    static const uint32_t kMulByAlphaMask =
-        (StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag |
-         StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag);
-
-    // NOTE: GrGLProgramStages are now responsible for fetching
-    if (NULL == customStage) {
-        if (desc.fInConfigFlags & kMulByAlphaMask) {
-            // only one of the mul by alpha flags should be set
-            GrAssert(GrIsPow2(kMulByAlphaMask & desc.fInConfigFlags));
-            GrAssert(!(desc.fInConfigFlags &
-                       StageDesc::kSmearAlpha_InConfigFlag));
-            GrAssert(!(desc.fInConfigFlags &
-                       StageDesc::kSmearRed_InConfigFlag));
-            builder->fFSCode.appendf("\t%s = %s(%s, %s)%s;\n",
-                                     fsOutColor,
-                                     builder->fTexFunc.c_str(),
-                                     samplerName,
-                                     builder->fSampleCoords.c_str(),
-                                     builder->fSwizzle.c_str());
-            if (desc.fInConfigFlags &
-                StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag) {
-                builder->fFSCode.appendf("\t%s = vec4(ceil(%s.rgb*%s.a*255.0)/255.0,%s.a)%s;\n",
-                                         fsOutColor, fsOutColor, fsOutColor,
-                                         fsOutColor, builder->fModulate.c_str());
-            } else {
-                builder->fFSCode.appendf("\t%s = vec4(floor(%s.rgb*%s.a*255.0)/255.0,%s.a)%s;\n",
-                                         fsOutColor, fsOutColor, fsOutColor,
-                                         fsOutColor, builder->fModulate.c_str());
-            }
-        } else {
-            builder->emitDefaultFetch(fsOutColor, samplerName);
-        }
-    }
-
     if (NULL != customStage) {
         // Enclose custom code in a block to avoid namespace conflicts
         builder->fFSCode.appendf("\t{ // stage %d %s \n",
diff --git a/src/gpu/gl/GrGLProgram.h b/src/gpu/gl/GrGLProgram.h
index b755f75..8b06b3e 100644
--- a/src/gpu/gl/GrGLProgram.h
+++ b/src/gpu/gl/GrGLProgram.h
@@ -90,19 +90,6 @@
             return reinterpret_cast<const uint32_t*>(this);
         }
 
-        enum OutputConfig {
-            // PM-color OR color with no alpha channel
-            kPremultiplied_OutputConfig,
-            // nonPM-color with alpha channel. Round components up after
-            // dividing by alpha. Assumes output is 8 bits for r, g, and b
-            kUnpremultiplied_RoundUp_OutputConfig,
-            // nonPM-color with alpha channel. Round components down after
-            // dividing by alpha. Assumes output is 8 bits for r, g, and b
-            kUnpremultiplied_RoundDown_OutputConfig,
-
-            kOutputConfigCnt
-        };
-
         struct StageDesc {
             enum OptFlagBits {
                 kNoPerspective_OptFlagBit       = 1 << 0,
@@ -118,42 +105,20 @@
                 kNone_InConfigFlag                      = 0x00,
 
                 /**
-                  Swap the R and B channels. This is incompatible with
-                  kSmearAlpha. It is prefereable to perform the swizzle outside
-                  the shader using GL_ARB_texture_swizzle if possible rather
-                  than setting this flag.
-                 */
-                kSwapRAndB_InConfigFlag                 = 0x01,
-
-                /**
                  Smear alpha across all four channels. This is incompatible with
-                 kSwapRAndB, kMulRGBByAlpha* and kSmearRed. It is prefereable
-                 to perform the smear outside the shader using
-                 GL_ARB_texture_swizzle if possible rather than setting this
-                 flag.
+                 kSmearRed. It is preferable to perform the smear outside the
+                 shader using GL_ARB_texture_swizzle if possible rather than
+                 setting this flag.
                 */
                 kSmearAlpha_InConfigFlag                = 0x02,
 
                 /**
-                 Smear the red channel across all four channels. This flag is
-                 incompatible with kSwapRAndB, kMulRGBByAlpha*and kSmearAlpha.
-                 It is preferable to use GL_ARB_texture_swizzle instead of this
-                 flag.
+                 Smear the red channel across all four channels. This flag is
+                 incompatible with kSmearAlpha. It is preferable to use
+                 GL_ARB_texture_swizzle instead of this flag.
                 */
                 kSmearRed_InConfigFlag                  = 0x04,
 
-                /**
-                 Multiply r,g,b by a after texture reads. This flag incompatible
-                 with kSmearAlpha.
-
-                 It is assumed the src texture has 8bit color components. After
-                 reading the texture one version rounds up to the next multiple
-                 of 1/255.0 and the other rounds down. At most one of these
-                 flags may be set.
-                 */
-                kMulRGBByAlpha_RoundUp_InConfigFlag     =  0x08,
-                kMulRGBByAlpha_RoundDown_InConfigFlag   =  0x10,
-
                 kDummyInConfigFlag,
                 kInConfigBitMask = (kDummyInConfigFlag-1) |
                                    (kDummyInConfigFlag-2)
@@ -218,14 +183,12 @@
 
         uint8_t fColorInput;        // casts to enum ColorInput
         uint8_t fCoverageInput;     // casts to enum CoverageInput
-        uint8_t fOutputConfig;      // casts to enum OutputConfig
         uint8_t fDualSrcOutput;     // casts to enum DualSrcOutput
         int8_t fFirstCoverageStage;
         SkBool8 fEmitsPointSize;
         SkBool8 fColorMatrixEnabled;
 
         uint8_t fColorFilterXfermode;  // casts to enum SkXfermode::Mode
-        int8_t fPadding[1];
     };
     GR_STATIC_ASSERT(!(sizeof(Desc) % 4));
 
diff --git a/src/gpu/gl/GrGLShaderBuilder.cpp b/src/gpu/gl/GrGLShaderBuilder.cpp
index 167eb78..2243a7c 100644
--- a/src/gpu/gl/GrGLShaderBuilder.cpp
+++ b/src/gpu/gl/GrGLShaderBuilder.cpp
@@ -87,24 +87,12 @@
 }
 
 void GrGLShaderBuilder::computeSwizzle(uint32_t configFlags) {
-   static const uint32_t kMulByAlphaMask =
-        (GrGLProgram::StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag |
-         GrGLProgram::StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag);
-
     fSwizzle = "";
-    if (configFlags & GrGLProgram::StageDesc::kSwapRAndB_InConfigFlag) {
-        GrAssert(!(configFlags &
-                   GrGLProgram::StageDesc::kSmearAlpha_InConfigFlag));
-        GrAssert(!(configFlags &
-                   GrGLProgram::StageDesc::kSmearRed_InConfigFlag));
-        fSwizzle = ".bgra";
-    } else if (configFlags & GrGLProgram::StageDesc::kSmearAlpha_InConfigFlag) {
-        GrAssert(!(configFlags & kMulByAlphaMask));
+    if (configFlags & GrGLProgram::StageDesc::kSmearAlpha_InConfigFlag) {
         GrAssert(!(configFlags &
                    GrGLProgram::StageDesc::kSmearRed_InConfigFlag));
         fSwizzle = ".aaaa";
     } else if (configFlags & GrGLProgram::StageDesc::kSmearRed_InConfigFlag) {
-        GrAssert(!(configFlags & kMulByAlphaMask));
         GrAssert(!(configFlags &
                    GrGLProgram::StageDesc::kSmearAlpha_InConfigFlag));
         fSwizzle = ".rrrr";
diff --git a/src/gpu/gl/GrGpuGL.cpp b/src/gpu/gl/GrGpuGL.cpp
index 3763709..8e6e43c 100644
--- a/src/gpu/gl/GrGpuGL.cpp
+++ b/src/gpu/gl/GrGpuGL.cpp
@@ -178,7 +178,6 @@
     fProgramCache = SkNEW_ARGS(ProgramCache, (this->glContextInfo()));
 
     fLastSuccessfulStencilFmtIdx = 0;
-    fCanPreserveUnpremulRoundtrip = kUnknown_CanPreserveUnpremulRoundtrip;
     if (false) { // avoid bit rot, suppress warning
         fbo_test(this->glInterface(), 0, 0);
     }
@@ -343,88 +342,6 @@
     }
 }
 
-bool GrGpuGL::canPreserveReadWriteUnpremulPixels() {
-    if (kUnknown_CanPreserveUnpremulRoundtrip ==
-        fCanPreserveUnpremulRoundtrip) {
-
-        SkAutoTMalloc<uint32_t> data(256 * 256 * 3);
-        uint32_t* srcData = data.get();
-        uint32_t* firstRead = data.get() + 256 * 256;
-        uint32_t* secondRead = data.get() + 2 * 256 * 256;
-
-        for (int y = 0; y < 256; ++y) {
-            for (int x = 0; x < 256; ++x) {
-                uint8_t* color = reinterpret_cast<uint8_t*>(&srcData[256*y + x]);
-                color[3] = y;
-                color[2] = x;
-                color[1] = x;
-                color[0] = x;
-            }
-        }
-
-        // We have broader support for read/write pixels on render targets
-        // than on textures.
-        GrTextureDesc dstDesc;
-        dstDesc.fFlags = kRenderTarget_GrTextureFlagBit |
-                         kNoStencil_GrTextureFlagBit;
-        dstDesc.fWidth = 256;
-        dstDesc.fHeight = 256;
-        dstDesc.fConfig = kRGBA_8888_GrPixelConfig;
-
-        SkAutoTUnref<GrTexture> dstTex(this->createTexture(dstDesc, NULL, 0));
-        if (!dstTex.get()) {
-            return false;
-        }
-        GrRenderTarget* rt = dstTex.get()->asRenderTarget();
-        GrAssert(NULL != rt);
-
-        bool failed = true;
-        static const UnpremulConversion gMethods[] = {
-            kUpOnWrite_DownOnRead_UnpremulConversion,
-            kDownOnWrite_UpOnRead_UnpremulConversion,
-        };
-
-        // pretend that we can do the roundtrip to avoid recursive calls to
-        // this function
-        fCanPreserveUnpremulRoundtrip = kYes_CanPreserveUnpremulRoundtrip;
-        for (size_t i = 0; i < GR_ARRAY_COUNT(gMethods) && failed; ++i) {
-            fUnpremulConversion = gMethods[i];
-            rt->writePixels(0, 0,
-                            256, 256,
-                            kRGBA_8888_GrPixelConfig, srcData, 0,
-                            GrContext::kUnpremul_PixelOpsFlag);
-            rt->readPixels(0, 0,
-                           256, 256,
-                           kRGBA_8888_GrPixelConfig, firstRead, 0,
-                           GrContext::kUnpremul_PixelOpsFlag);
-            rt->writePixels(0, 0,
-                            256, 256,
-                            kRGBA_8888_GrPixelConfig, firstRead, 0,
-                            GrContext::kUnpremul_PixelOpsFlag);
-            rt->readPixels(0, 0,
-                           256, 256,
-                           kRGBA_8888_GrPixelConfig, secondRead, 0,
-                           GrContext::kUnpremul_PixelOpsFlag);
-            failed = false;
-            for (int j = 0; j < 256 * 256; ++j) {
-                if (firstRead[j] != secondRead[j]) {
-                    failed = true;
-                    break;
-                }
-            }
-        }
-        fCanPreserveUnpremulRoundtrip = failed ?
-                        kNo_CanPreserveUnpremulRoundtrip :
-                        kYes_CanPreserveUnpremulRoundtrip;
-    }
-
-    if (kYes_CanPreserveUnpremulRoundtrip == fCanPreserveUnpremulRoundtrip) {
-        return true;
-    } else {
-        return false;
-    }
-}
-
 GrPixelConfig GrGpuGL::preferredReadPixelsConfig(GrPixelConfig config) const {
     if (GR_GL_RGBA_8888_PIXEL_OPS_SLOW && GrPixelConfigIsRGBA8888(config)) {
         return GrPixelConfigSwapRAndB(config);
@@ -2095,10 +2012,6 @@
                                                     GR_GL_ALPHA, GR_GL_ALPHA };
             return gAlphaSmear;
         }
-    } else if (sampler.swapsRAndB()) {
-        static const GrGLenum gRedBlueSwap[] = { GR_GL_BLUE, GR_GL_GREEN,
-                                                 GR_GL_RED,  GR_GL_ALPHA };
-        return gRedBlueSwap;
     } else {
         static const GrGLenum gStraight[] = { GR_GL_RED, GR_GL_GREEN,
                                               GR_GL_BLUE,  GR_GL_ALPHA };
diff --git a/src/gpu/gl/GrGpuGL.h b/src/gpu/gl/GrGpuGL.h
index 2cfdd0d..d2a22d0 100644
--- a/src/gpu/gl/GrGpuGL.h
+++ b/src/gpu/gl/GrGpuGL.h
@@ -49,8 +49,6 @@
                                     size_t rowBytes) const SK_OVERRIDE;
     virtual bool fullReadPixelsIsFasterThanPartial() const SK_OVERRIDE;
 
-    virtual bool canPreserveReadWriteUnpremulPixels() SK_OVERRIDE;
-
     virtual void abandonResources() SK_OVERRIDE;
 
     bool programUnitTest();
@@ -391,17 +389,6 @@
     // from our loop that tries stencil formats and calls check fb status.
     int fLastSuccessfulStencilFmtIdx;
 
-    enum UnpremulConversion {
-        kUpOnWrite_DownOnRead_UnpremulConversion,
-        kDownOnWrite_UpOnRead_UnpremulConversion
-    } fUnpremulConversion;
-
-    enum CanPreserveUnpremulRoundtrip {
-        kUnknown_CanPreserveUnpremulRoundtrip,
-        kNo_CanPreserveUnpremulRoundtrip,
-        kYes_CanPreserveUnpremulRoundtrip,
-    } fCanPreserveUnpremulRoundtrip;
-
     bool fPrintedCaps;
 
     typedef GrGpu INHERITED;
diff --git a/src/gpu/gl/GrGpuGL_program.cpp b/src/gpu/gl/GrGpuGL_program.cpp
index f9d71db..174f5cb 100644
--- a/src/gpu/gl/GrGpuGL_program.cpp
+++ b/src/gpu/gl/GrGpuGL_program.cpp
@@ -729,20 +729,6 @@
                             // We can use A8 textures so use kSmearAlpha.
                             stage.fInConfigFlags |= StageDesc::kSmearAlpha_InConfigFlag;
                         }
-                    } else if (sampler.swapsRAndB()) {
-                        stage.fInConfigFlags |= StageDesc::kSwapRAndB_InConfigFlag;
-                    }
-                }
-                if (sampler.premultiply()) {
-                    // Assert that if we're doing a premul conversion that the texture is 1 byte
-                    // per color component. The rounding performed by the shader generator (in
-                    // normalized float color space) assumes this.
-                    GrAssert(4 == GrBytesPerPixel(texture->config()));
-                    if (kUpOnWrite_DownOnRead_UnpremulConversion ==
-                        fUnpremulConversion) {
-                        stage.fInConfigFlags |= StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag;
-                    } else {
-                        stage.fInConfigFlags |= StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag;
                     }
                 }
             }
@@ -758,19 +744,6 @@
         }
     }
 
-    if (drawState.isStateFlagEnabled(GrDrawState::kUnpremultiply_StageBit)) {
-        // The shader generator assumes that color channels are bytes
-        // when rounding.
-        GrAssert(4 == GrBytesPerPixel(drawState.getRenderTarget()->config()));
-        if (kUpOnWrite_DownOnRead_UnpremulConversion == fUnpremulConversion) {
-            desc->fOutputConfig = ProgramDesc::kUnpremultiplied_RoundUp_OutputConfig;
-        } else {
-            desc->fOutputConfig = ProgramDesc::kUnpremultiplied_RoundDown_OutputConfig;
-        }
-    } else {
-        desc->fOutputConfig = ProgramDesc::kPremultiplied_OutputConfig;
-    }
-
     desc->fDualSrcOutput = ProgramDesc::kNone_DualSrcOutput;
 
     // Currently the experimental GS will only work with triangle prims (and it doesn't do anything
diff --git a/tests/GLProgramsTest.cpp b/tests/GLProgramsTest.cpp
index e14198c..fd24518 100644
--- a/tests/GLProgramsTest.cpp
+++ b/tests/GLProgramsTest.cpp
@@ -14,6 +14,7 @@
 
 #include "gl/GrGpuGL.h"
 #include "GrProgramStageFactory.h"
+#include "effects/GrConfigConversionEffect.h"
 
 #include "GrRandom.h"
 #include "Test.h"
@@ -44,12 +45,6 @@
                                           GrContext* context,
                                           GrTexture* dummyTextures[]) {
 
-    // TODO: Remove this when generator doesn't apply this non-custom-stage
-    // notion to custom stages automatically.
-    static const uint32_t kMulByAlphaMask =
-        StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag |
-        StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag;
-
     // The new code uses SkRandom not GrRandom.
     // TODO: Remove GrRandom.
     SkRandom sk_random;
@@ -82,10 +77,6 @@
     };
     static const int IN_CONFIG_FLAGS[] = {
         StageDesc::kNone_InConfigFlag,
-        StageDesc::kSwapRAndB_InConfigFlag,
-        StageDesc::kSwapRAndB_InConfigFlag |
-        StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag,
-        StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag,
         StageDesc::kSmearAlpha_InConfigFlag,
         StageDesc::kSmearRed_InConfigFlag,
     };
@@ -121,7 +112,6 @@
         pdesc.fExperimentalGS = this->getCaps().fGeometryShaderSupport &&
                                 random_bool(&random);
 #endif
-        pdesc.fOutputConfig =  random_int(&random, ProgramDesc::kOutputConfigCnt);
 
         bool edgeAA = random_bool(&random);
         if (edgeAA) {
@@ -212,6 +202,7 @@
 void forceLinking() {
     SkLightingImageFilter::CreateDistantLitDiffuse(SkPoint3(0,0,0), 0, 0, 0);
     SkMagnifierImageFilter mag(SkRect::MakeWH(SK_Scalar1, SK_Scalar1), SK_Scalar1);
+    GrConfigConversionEffect::Create(NULL, false);
 }
 
 #endif
diff --git a/tests/WritePixelsTest.cpp b/tests/WritePixelsTest.cpp
index 5a9cf26..4f045ef 100644
--- a/tests/WritePixelsTest.cpp
+++ b/tests/WritePixelsTest.cpp
@@ -402,7 +402,7 @@
     for (size_t i = 0; i < SK_ARRAY_COUNT(gCanvasConfigs); ++i) {
         REPORTER_ASSERT(reporter, setupCanvas(&canvas, gCanvasConfigs[i], context));
 
-        static const SkCanvas::Config8888 gReadConfigs[] = {
+        static const SkCanvas::Config8888 gSrcConfigs[] = {
             SkCanvas::kNative_Premul_Config8888,
             SkCanvas::kNative_Unpremul_Config8888,
             SkCanvas::kBGRA_Premul_Config8888,
@@ -413,9 +413,9 @@
         for (size_t r = 0; r < SK_ARRAY_COUNT(testRects); ++r) {
             const SkIRect& rect = testRects[r];
             for (int tightBmp = 0; tightBmp < 2; ++tightBmp) {
-                for (size_t c = 0; c < SK_ARRAY_COUNT(gReadConfigs); ++c) {
+                for (size_t c = 0; c < SK_ARRAY_COUNT(gSrcConfigs); ++c) {
                     fillCanvas(&canvas);
-                    SkCanvas::Config8888 config8888 = gReadConfigs[c];
+                    SkCanvas::Config8888 config8888 = gSrcConfigs[c];
                     SkBitmap bmp;
                     REPORTER_ASSERT(reporter, setupBitmap(&bmp, config8888, rect.width(), rect.height(), SkToBool(tightBmp)));
                     canvas.writePixels(bmp, rect.fLeft, rect.fTop, config8888);