GPU-based Gaussian blur.

This is a first stab at implementing a GPU-based
Gaussian blur in Ganesh.  The convolution shader is implemented as a new
filtering mode.  There are several known issues:

- no support for blur types other than "normal"
- FBO truncation problem at high zoom values
- uses bilinear for upsampling instead of Mitchell

Review URL:  http://codereview.appspot.com/4645082/



git-svn-id: http://skia.googlecode.com/svn/trunk@1830 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/gpu/include/GrContext.h b/gpu/include/GrContext.h
index a8810f9..0c873ee 100644
--- a/gpu/include/GrContext.h
+++ b/gpu/include/GrContext.h
@@ -490,6 +490,21 @@
     void writePixels(int left, int top, int width, int height,
                      GrPixelConfig, const void* buffer, size_t stride);
 
+    /**
+     * Performs a 1D convolution over a rectangle of pixels.  Set
+     * imageIncrement to (1/w, 0) for a convolution in X, (0, 1/h) for a
+     * convolution in Y, where w, h are the texture dimensions.
+     * @param srcTexture      the texture to read from
+     * @param dstRect         the rectangle to draw (view matrix is identity)
+     * @param imageIncrement  texture-coordinate step between kernel taps
+     * @param kernel          the convolution kernel (kernelWidth elements)
+     * @param kernelWidth     number of kernel taps, 0..MAX_KERNEL_WIDTH
+     */
+    void convolveRect(GrTexture* srcTexture,
+                      const SkRect& dstRect,
+                      float imageIncrement[2],
+                      const float* kernel,
+                      int kernelWidth);
     ///////////////////////////////////////////////////////////////////////////
     // Helpers
 
diff --git a/gpu/include/GrSamplerState.h b/gpu/include/GrSamplerState.h
index d10d8c4..c7a4f2b 100644
--- a/gpu/include/GrSamplerState.h
+++ b/gpu/include/GrSamplerState.h
@@ -21,6 +21,8 @@
 #include "GrTypes.h"
 #include "GrMatrix.h"
 
+#define MAX_KERNEL_WIDTH 25
+
 class GrSamplerState {
 public:
     enum Filter {
@@ -39,6 +41,10 @@
          * between texels in x and y spaced 4 texels apart.)
          */
         k4x4Downsample_Filter,
+        /**
+         * Apply a separable convolution kernel.
+         */
+        kConvolution_Filter
     };
 
     /**
@@ -148,6 +154,9 @@
     const GrRect& getTextureDomain() const { return fTextureDomain; }
     bool hasTextureDomain() const {return SkIntToScalar(0) != fTextureDomain.right();}
     Filter getFilter() const { return fFilter; }
+    int getKernelWidth() const { return fKernelWidth; }  // number of kernel taps
+    const float* getKernel() const { return fKernel; }  // first fKernelWidth entries are the taps
+    const float* getImageIncrement() const { return fImageIncrement; }  // array of 2 floats
 
     bool isGradient() const {
         return  kRadial_SampleMode == fSampleMode ||
@@ -220,6 +229,19 @@
         fRadial2PosRoot = posRoot;
     }
 
+    // Stores the kernel taps and per-tap coordinate step used by
+    // kConvolution_Filter.  A NULL kernel leaves the stored taps untouched;
+    // a NULL imageIncrement resets the step to (0, 0).
+    void setConvolutionParams(int kernelWidth, const float* kernel, float imageIncrement[2]) {
+        GrAssert(kernelWidth >= 0 && kernelWidth <= MAX_KERNEL_WIDTH);
+        fKernelWidth = kernelWidth;
+        if (NULL != kernel) {
+            memcpy(fKernel, kernel, kernelWidth * sizeof(float));
+        }
+        fImageIncrement[0] = (NULL != imageIncrement) ? imageIncrement[0] : 0.0f;
+        fImageIncrement[1] = (NULL != imageIncrement) ? imageIncrement[1] : 0.0f;
+    }
+
     static const GrSamplerState& ClampNoFilter() {
         return gClampNoFilter;
     }
@@ -237,6 +259,11 @@
     GrScalar    fRadial2Radius0;
     bool        fRadial2PosRoot;
 
+    // These are undefined unless fFilter == kConvolution_Filter
+    int         fKernelWidth;
+    float       fKernel[MAX_KERNEL_WIDTH];
+    float       fImageIncrement[2];
+
     static const GrSamplerState gClampNoFilter;
 };
 
diff --git a/gpu/src/GrContext.cpp b/gpu/src/GrContext.cpp
index b75c917..092b0ba 100644
--- a/gpu/src/GrContext.cpp
+++ b/gpu/src/GrContext.cpp
@@ -1699,3 +1699,22 @@
     }
 }
 
+// Draws 'rect' with an identity view matrix, sampling srcTexture through
+// stage 0 with a clamped kConvolution_Filter sampler built from the kernel.
+void GrContext::convolveRect(GrTexture* srcTexture, const SkRect& rect,
+                             float imageIncrement[2], const float* kernel,
+                             int kernelWidth) {
+    GrDrawTarget::AutoStateRestore asr(fGpu);
+    GrMatrix texelToUnit;
+    texelToUnit.setScale(GR_Scalar1 / srcTexture->width(),
+                         GR_Scalar1 / srcTexture->height());
+    GrSamplerState convSampler(GrSamplerState::kClamp_WrapMode,
+                               GrSamplerState::kClamp_WrapMode,
+                               GrSamplerState::kConvolution_Filter);
+    convSampler.setConvolutionParams(kernelWidth, kernel, imageIncrement);
+    convSampler.setMatrix(texelToUnit);
+    fGpu->setSamplerState(0, convSampler);
+    fGpu->setViewMatrix(GrMatrix::I());
+    fGpu->setTexture(0, srcTexture);
+    fGpu->drawSimpleRect(rect, NULL, 1 << 0);
+}
diff --git a/gpu/src/GrGLProgram.cpp b/gpu/src/GrGLProgram.cpp
index d6a832c..4290c96 100644
--- a/gpu/src/GrGLProgram.cpp
+++ b/gpu/src/GrGLProgram.cpp
@@ -136,6 +136,13 @@
     s->appendS32(stage);
 }
 
+static void convolve_param_names(int stage, GrStringBuilder* k, GrStringBuilder* i) {
+    *k = "uKernel";  // k receives the name of the kernel-weights uniform
+    k->appendS32(stage);  // stage index appended so each stage gets its own name
+    *i = "uImageIncrement";  // i receives the name of the image-increment uniform
+    i->appendS32(stage);
+}
+
 static void tex_domain_name(int stage, GrStringBuilder* s) {
     *s = "uTexDom";
     s->appendS32(stage);
@@ -941,6 +948,22 @@
                                              texDomName.c_str()));
                 GrAssert(kUnusedUniform != locations.fTexDomUni);
             }
+
+            GrStringBuilder kernelName, imageIncrementName;
+            convolve_param_names(s, &kernelName, &imageIncrementName);
+            if (kUseUniform == locations.fKernelUni) {
+                locations.fKernelUni = GR_GL(GetUniformLocation(
+                                             progID,
+                                             kernelName.c_str()));
+                GrAssert(kUnusedUniform != locations.fKernelUni);
+            }
+
+            if (kUseUniform == locations.fImageIncrementUni) {
+                locations.fImageIncrementUni = GR_GL(GetUniformLocation(
+                                                     progID,
+                                                     imageIncrementName.c_str()));
+                GrAssert(kUnusedUniform != locations.fImageIncrementUni);
+            }
         }
     }
     GR_GL(UseProgram(progID));
@@ -1058,6 +1081,24 @@
         }
     }
 
+    GrStringBuilder kernelName, kernelWidthName, imageIncrementName;
+    convolve_param_names(stageNum, &kernelName, &imageIncrementName);
+
+    if (ProgramDesc::StageDesc::kConvolution_FetchMode == desc.fFetchMode) {
+        segments->fFSUnis.appendf("uniform float %s[%d];\n",
+                                  kernelName.c_str(), desc.fKernelWidth);
+        segments->fFSUnis.appendf("uniform vec2 %s;\n",
+                                  imageIncrementName.c_str());
+        segments->fVSUnis.appendf("uniform vec2 %s;\n",
+                                  imageIncrementName.c_str());
+        locations->fKernelUni = kUseUniform;
+        locations->fImageIncrementUni = kUseUniform;
+        float scale = (desc.fKernelWidth - 1) * 0.5f;
+        segments->fVSCode.appendf("\t%s -= vec2(%g, %g) * %s;\n",
+                                  varyingName.c_str(), scale, scale,
+                                  imageIncrementName.c_str());
+}
+
     /// Fragment Shader Stuff
     GrStringBuilder fsCoordName;
     // function used to access the shader, may be made projective
@@ -1229,6 +1270,17 @@
         segments->fFSCode.appendf("\t%s += %s(%s, %s + vec2(-%s.x,+%s.y))%s;\n", accumVar.c_str(), texFunc.c_str(), samplerName.c_str(), sampleCoords.c_str(), texelSizeName.c_str(), texelSizeName.c_str(), smear);
         segments->fFSCode.appendf("\t%s += %s(%s, %s + vec2(+%s.x,+%s.y))%s;\n", accumVar.c_str(), texFunc.c_str(), samplerName.c_str(), sampleCoords.c_str(), texelSizeName.c_str(), texelSizeName.c_str(), smear);
         segments->fFSCode.appendf("\t%s = .25 * %s%s;\n", fsOutColor, accumVar.c_str(), modulate.c_str());
+    } else if (ProgramDesc::StageDesc::kConvolution_FetchMode == desc.fFetchMode) {
+        segments->fFSCode.append("\tvec4 sum = vec4(0, 0, 0, 0);\n");
+        segments->fFSCode.appendf("\tvec2 coord = %s;\n", sampleCoords.c_str());
+        segments->fFSCode.appendf("\tfor (int i = 0; i < %d; i++) {\n", desc.fKernelWidth);
+        segments->fFSCode.appendf("\t\tsum += %s(%s, coord)%s * %s[i];\n",
+                                  texFunc.c_str(), samplerName.c_str(),
+                                  smear, kernelName.c_str());
+        segments->fFSCode.appendf("\t\tcoord += %s;\n",
+                                  imageIncrementName.c_str());
+        segments->fFSCode.appendf("\t}\n");
+        segments->fFSCode.appendf("\t%s = sum%s;\n", fsOutColor, modulate.c_str());
     } else {
         segments->fFSCode.appendf("\t%s = %s(%s, %s)%s%s;\n", fsOutColor, texFunc.c_str(), samplerName.c_str(), sampleCoords.c_str(), smear, modulate.c_str());
     }
diff --git a/gpu/src/GrGLProgram.h b/gpu/src/GrGLProgram.h
index edd4737..60a7177 100644
--- a/gpu/src/GrGLProgram.h
+++ b/gpu/src/GrGLProgram.h
@@ -114,6 +114,7 @@
             enum FetchMode {
                 kSingle_FetchMode,
                 k2x2_FetchMode,
+                kConvolution_FetchMode,
 
                 kFetchModeCnt,
             };
@@ -132,6 +133,7 @@
             uint8_t fModulation;  // casts to enum Modulation
             uint8_t fFetchMode;  // casts to enum FetchMode
             uint8_t fCoordMapping;  // casts to enum CoordMapping
+            uint8_t fKernelWidth;
 
             inline bool isEnabled() const {
                 return fOptFlags & kIsEnabled_OptFlagBit;
@@ -197,12 +199,16 @@
         GrGLint fSamplerUni;
         GrGLint fRadial2Uni;
         GrGLint fTexDomUni;
+        GrGLint fKernelUni;
+        GrGLint fImageIncrementUni;
         void reset() {
             fTextureMatrixUni = kUnusedUniform;
             fNormalizedTexelSizeUni = kUnusedUniform;
             fSamplerUni = kUnusedUniform;
             fRadial2Uni = kUnusedUniform;
             fTexDomUni = kUnusedUniform;
+            fKernelUni = kUnusedUniform;
+            fImageIncrementUni = kUnusedUniform;
         }
     };
 
diff --git a/gpu/src/GrGpuGL.cpp b/gpu/src/GrGpuGL.cpp
index 1f9afdc..1a9d7fe 100644
--- a/gpu/src/GrGpuGL.cpp
+++ b/gpu/src/GrGpuGL.cpp
@@ -574,8 +574,8 @@
         fHWDrawState.fSamplerStates[s].setRadial2Params(-GR_ScalarMax,
                                                         -GR_ScalarMax,
                                                         true);
-
         fHWDrawState.fSamplerStates[s].setMatrix(GrMatrix::InvalidMatrix());
+        fHWDrawState.fSamplerStates[s].setConvolutionParams(0, NULL, NULL);
     }
 
     fHWBounds.fScissorRect.invalidate();
@@ -586,6 +586,7 @@
     fHWDrawState.fStencilSettings.invalidate();
     fHWStencilClip = false;
     fClipState.fClipIsDirty = true;
+    fClipState.fClipInStencil = false;
 
     fHWGeometryState.fIndexBuffer = NULL;
     fHWGeometryState.fVertexBuffer = NULL;
@@ -1786,6 +1787,20 @@
     }
 }
 
+// Maps a sampler filter to the GL filter enum used for its texture params.
+static unsigned grToGLFilter(GrSamplerState::Filter filter) {
+    if (GrSamplerState::kNearest_Filter == filter ||
+        GrSamplerState::kConvolution_Filter == filter) {
+        // Convolution fetches every tap explicitly in the shader.
+        return GR_GL_NEAREST;
+    }
+    if (GrSamplerState::kBilinear_Filter != filter &&
+        GrSamplerState::k4x4Downsample_Filter != filter) {
+        GrAssert(!"Unknown filter type");
+    }
+    return GR_GL_LINEAR;
+}
+
 bool GrGpuGL::flushGLStateCommon(GrPrimitiveType type) {
 
     // GrGpu::setupClipAndFlushState should have already checked this
@@ -1827,11 +1842,7 @@
                                                 nextTexture->getTexParams();
             GrGLTexture::TexParams newTexParams;
 
-            if (GrSamplerState::kNearest_Filter == sampler.getFilter()) {
-                newTexParams.fFilter = GR_GL_NEAREST;
-            } else {
-                newTexParams.fFilter = GR_GL_LINEAR;
-            }
+            newTexParams.fFilter = grToGLFilter(sampler.getFilter());
 
             newTexParams.fWrapS =
                         GrGLTexture::WrapMode2GLWrap()[sampler.getWrapX()];
diff --git a/gpu/src/GrGpuGLShaders.cpp b/gpu/src/GrGpuGLShaders.cpp
index 1309802..a52501b 100644
--- a/gpu/src/GrGpuGLShaders.cpp
+++ b/gpu/src/GrGpuGLShaders.cpp
@@ -453,6 +453,18 @@
     }
 }
 
+void GrGpuGLShaders::flushConvolution(int s) {  // s: index of the sampler stage to flush
+    const GrSamplerState& sampler = fCurrDrawState.fSamplerStates[s];
+    int kernelUni = fProgramData->fUniLocations.fStages[s].fKernelUni;
+    if (GrGLProgram::kUnusedUniform != kernelUni) {  // skip stages without a kernel uniform
+        GR_GL(Uniform1fv(kernelUni, sampler.getKernelWidth(), sampler.getKernel()));
+    }
+    int imageIncrementUni = fProgramData->fUniLocations.fStages[s].fImageIncrementUni;
+    if (GrGLProgram::kUnusedUniform != imageIncrementUni) {
+        GR_GL(Uniform2fv(imageIncrementUni, 1, sampler.getImageIncrement()));  // a single vec2
+    }
+}
+
 void GrGpuGLShaders::flushTexelSize(int s) {
     const int& uni = fProgramData->fUniLocations.fStages[s].fNormalizedTexelSizeUni;
     if (GrGLProgram::kUnusedUniform != uni) {
@@ -587,6 +599,8 @@
 
         this->flushRadial2(s);
 
+        this->flushConvolution(s);
+
         this->flushTexelSize(s);
 
         this->flushTextureDomain(s);
@@ -784,6 +798,10 @@
                 case GrSamplerState::k4x4Downsample_Filter:
                     stage.fFetchMode = StageDesc::k2x2_FetchMode;
                     break;
+                // performs fKernelWidth texture2D()s
+                case GrSamplerState::kConvolution_Filter:
+                    stage.fFetchMode = StageDesc::kConvolution_FetchMode;
+                    break;
                 default:
                     GrCrash("Unexpected filter!");
                     break;
@@ -802,6 +820,11 @@
             } else {
                 stage.fModulation = StageDesc::kColor_Modulation;
             }
+            if (sampler.getFilter() == GrSamplerState::kConvolution_Filter) {
+                stage.fKernelWidth = sampler.getKernelWidth();
+            } else {
+                stage.fKernelWidth = 0;
+            }
         } else {
             stage.fOptFlags     = 0;
             stage.fCoordMapping = (StageDesc::CoordMapping)0;
diff --git a/gpu/src/GrGpuGLShaders.h b/gpu/src/GrGpuGLShaders.h
index 17811c8..43ff5ea 100644
--- a/gpu/src/GrGpuGLShaders.h
+++ b/gpu/src/GrGpuGLShaders.h
@@ -70,6 +70,9 @@
     // flushes the parameters to two point radial gradient
     void flushRadial2(int stage);
 
+    // flushes the parameters for convolution
+    void flushConvolution(int stage);
+
     // flushes the normalized texel size
     void flushTexelSize(int stage);
 
diff --git a/src/gpu/SkGpuDevice.cpp b/src/gpu/SkGpuDevice.cpp
index d6ee03d..54f0e79 100644
--- a/src/gpu/SkGpuDevice.cpp
+++ b/src/gpu/SkGpuDevice.cpp
@@ -47,6 +47,8 @@
 };
 
 
+#define USE_GPU_BLUR false
+#define MAX_SIGMA 4.0f
 ///////////////////////////////////////////////////////////////////////////////
 
 SkGpuDevice::SkAutoCachedTexture::
@@ -790,6 +792,214 @@
 #include "SkMaskFilter.h"
 #include "SkBounder.h"
 
+// Translates an SkPath fill rule into the corresponding GrPathFill value.
+// The four winding / even-odd rules (plain and inverse) map one-to-one; any
+// other value is reported through SkDebugf and mapped to kHairLine_PathFill.
+static GrPathFill skToGrFillType(SkPath::FillType fillType) {
+    switch (fillType) {
+        case SkPath::kWinding_FillType:        return kWinding_PathFill;
+        case SkPath::kEvenOdd_FillType:        return kEvenOdd_PathFill;
+        case SkPath::kInverseWinding_FillType: return kInverseWinding_PathFill;
+        case SkPath::kInverseEvenOdd_FillType: return kInverseEvenOdd_PathFill;
+        default:                               break;
+    }
+    // Not one of the fill rules handled above.
+    SkDebugf("Unsupported path fill type\n");
+    return kHairLine_PathFill;
+}
+
+static float gauss(float x, float sigma)
+{
+    // Unnormalized Gaussian; 1/sqrt(2*pi*sigma^2) is dropped as the kernel is
+    // renormalized later.  x == 0 is special-cased: sigma == 0 gives 1, not NaN.
+    return (0.0f == x) ? 1.0f : exp(- (x * x) / (2.0f * sigma * sigma));
+}
+
+// Fills kernel[0..kernelWidth) with Gaussian weights for the given sigma,
+// peaking at tap (kernelWidth - 1) / 2 and scaled so that they sum to 1.
+static void buildKernel(float sigma, float* kernel, int kernelWidth)
+{
+    const int center = (kernelWidth - 1) / 2;
+    float total = 0.0f;
+    for (int tap = 0; tap < kernelWidth; ++tap) {
+        total += (kernel[tap] = gauss(tap - center, sigma));
+    }
+    const float invTotal = 1.0f / total;
+    for (int tap = 0; tap < kernelWidth; ++tap)
+        kernel[tap] *= invTotal;
+}
+
+// Exchanges two texture pointers; the textures themselves are not touched.
+static void swap(GrTexture*& a, GrTexture*& b) {
+    GrTexture* const heldA = a;
+    a = b;
+    b = heldA;
+}
+
+// Scales 'rect' about the origin by multiplying each edge by 'scale'.
+static void scaleRect(SkRect* rect, float scale) {
+    rect->fLeft   *= scale;
+    rect->fRight  *= scale;
+    rect->fTop    *= scale;
+    rect->fBottom *= scale;
+}
+
+// Blurs 'path' (interpreted in device space) on the GPU and draws it with
+// 'grp'.  The path is rendered to a scratch texture, downsampled by powers of
+// two until sigma <= MAX_SIGMA, convolved in X and then in Y, upsampled again
+// and installed as the last mask of 'grp'.  Returns false without drawing if
+// 'filter' is not a blur, 'clip' or 'bounder' rejects the result, 'matrix' is
+// needed but not invertible, or no scratch texture could be obtained.
+static bool drawWithGPUMaskFilter(GrContext* context, const SkPath& path,
+                                  SkMaskFilter* filter, const SkMatrix& matrix,
+                                  const SkRegion& clip, SkBounder* bounder,
+                                  GrPaint* grp) {
+    SkMaskFilter::BlurInfo info;
+    if (!filter->asABlur(&info)) {
+        return false;
+    }
+    float radius = info.fIgnoreTransform ? info.fRadius
+                                         : matrix.mapRadius(info.fRadius);
+    float sigma = radius * 0.6666f;
+    SkRect srcRect = path.getBounds();
+    int scaleFactor = 1;
+    while (sigma > MAX_SIGMA) {
+        scaleFactor *= 2;
+        sigma *= 0.5f;
+    }
+    scaleRect(&srcRect, 1.0f / scaleFactor);
+    int halfWidth = static_cast<int>(sigma * 3.0f);
+    int kernelWidth = halfWidth * 2 + 1;
+    // Pad the reduced-size bounds by the kernel radius, then scale back up.
+    SkIRect srcIRect;
+    srcRect.roundOut(&srcIRect);
+    srcRect.set(srcIRect);
+    srcRect.inset(-halfWidth, -halfWidth);
+    scaleRect(&srcRect, scaleFactor);
+    SkRect finalRect = srcRect;
+    SkIRect finalIRect;
+    finalRect.roundOut(&finalIRect);
+    if (clip.quickReject(finalIRect)) {
+        return false;
+    }
+    if (bounder && !bounder->doIRect(finalIRect)) {
+        return false;
+    }
+    // Invert up front: failing here, before any texture is locked, leaves
+    // nothing to release and wastes no offscreen rendering.
+    GrMatrix inverse;
+    if (grp->hasTextureOrMask() && !matrix.invert(&inverse)) {
+        return false;
+    }
+    GrPoint offset = GrPoint::Make(-srcRect.fLeft, -srcRect.fTop);
+    srcRect.offset(-srcRect.fLeft, -srcRect.fTop);
+    const GrTextureDesc desc = {
+        kRenderTarget_GrTextureFlagBit,
+        kNone_GrAALevel,
+        srcRect.width(),
+        srcRect.height(),
+        // We actually only need A8, but it often isn't supported as a
+        // render target
+        kRGBA_8888_GrPixelConfig
+    };
+
+    GrTextureEntry* srcEntry = context->findApproximateKeylessTexture(desc);
+    GrTextureEntry* dstEntry = context->findApproximateKeylessTexture(desc);
+    GrTexture* srcTexture = (NULL != srcEntry) ? srcEntry->texture() : NULL;
+    GrTexture* dstTexture = (NULL != dstEntry) ? dstEntry->texture() : NULL;
+    if (NULL == srcTexture || NULL == dstTexture) {
+        // Unlock whichever entries were obtained before bailing out.
+        if (NULL != srcEntry) {
+            context->unlockTexture(srcEntry);
+        }
+        if (NULL != dstEntry) {
+            context->unlockTexture(dstEntry);
+        }
+        return false;
+    }
+    GrRenderTarget* oldRenderTarget = context->getRenderTarget();
+    context->setRenderTarget(dstTexture->asRenderTarget());
+    // FIXME:  could just clear bounds
+    context->clear(NULL, 0);
+    GrPaint tempPaint;
+    tempPaint.reset();
+    GrAutoMatrix avm(context, GrMatrix::I());
+    // Draw hard shadow to offscreen context, with path topleft at origin 0,0.
+    context->drawPath(tempPaint, path, skToGrFillType(path.getFillType()), &offset);
+    swap(srcTexture, dstTexture);
+
+    GrMatrix sampleM;
+    sampleM.setScale(GR_Scalar1 / srcTexture->width(),
+                     GR_Scalar1 / srcTexture->height());
+    GrPaint paint;
+    paint.reset();
+    paint.getTextureSampler(0)->setFilter(GrSamplerState::kBilinear_Filter);
+    paint.getTextureSampler(0)->setMatrix(sampleM);
+    for (int i = 1; i < scaleFactor; i *= 2) {
+        context->setRenderTarget(dstTexture->asRenderTarget());
+        SkRect dstRect(srcRect);
+        scaleRect(&dstRect, 0.5f);
+        // Clear out 1 pixel border for linear filtering.
+        // FIXME:  for now, clear everything
+        context->clear(NULL, 0);
+        paint.setTexture(0, srcTexture);
+        context->drawRectToRect(paint, dstRect, srcRect);
+        srcRect = dstRect;
+        swap(srcTexture, dstTexture);
+    }
+
+    SkAutoTMalloc<float> kernelStorage(kernelWidth);
+    float* kernel = kernelStorage.get();
+    buildKernel(sigma, kernel, kernelWidth);
+    float imageIncrementX[2] = {1.0f / srcTexture->width(), 0.0f};
+    context->setRenderTarget(dstTexture->asRenderTarget());
+    context->clear(NULL, 0);
+    context->convolveRect(srcTexture, srcRect, imageIncrementX, kernel,
+                          kernelWidth);
+    swap(srcTexture, dstTexture);
+    float imageIncrementY[2] = {0.0f, 1.0f / srcTexture->height()};
+    context->setRenderTarget(dstTexture->asRenderTarget());
+    context->clear(NULL, 0);
+    context->convolveRect(srcTexture, srcRect, imageIncrementY, kernel,
+                          kernelWidth);
+    swap(srcTexture, dstTexture);
+    if (scaleFactor > 1) {
+        // FIXME:  This should be mitchell, not bilinear.
+        paint.getTextureSampler(0)->setFilter(GrSamplerState::kBilinear_Filter);
+        sampleM.setScale(GR_Scalar1 / srcTexture->width(),
+                         GR_Scalar1 / srcTexture->height());
+        paint.getTextureSampler(0)->setMatrix(sampleM);
+        context->setRenderTarget(dstTexture->asRenderTarget());
+        // Clear out 2 pixel border for bicubic filtering.
+        // FIXME:  for now, clear everything
+        context->clear(NULL, 0);
+        paint.setTexture(0, srcTexture);
+        SkRect dstRect(srcRect);
+        scaleRect(&dstRect, scaleFactor);
+        context->drawRectToRect(paint, dstRect, srcRect);
+        srcRect = dstRect;
+        swap(srcTexture, dstTexture);
+    }
+
+    context->setRenderTarget(oldRenderTarget);
+    if (grp->hasTextureOrMask()) {
+        grp->preConcatActiveSamplerMatrices(inverse);
+    }
+    static const int MASK_IDX = GrPaint::kMaxMasks - 1;
+    // we assume the last mask index is available for use
+    GrAssert(NULL == grp->getMask(MASK_IDX));
+    grp->setMask(MASK_IDX, srcTexture);
+    grp->getMaskSampler(MASK_IDX)->setClampNoFilter();
+    GrMatrix m;
+    m.setTranslate(-finalRect.fLeft, -finalRect.fTop);
+    m.postIDiv(srcTexture->width(), srcTexture->height());
+    grp->getMaskSampler(MASK_IDX)->setMatrix(m);
+    context->drawRect(*grp, finalRect);
+    context->unlockTexture(srcEntry);
+    context->unlockTexture(dstEntry);
+    return true;
+}
+
 static bool drawWithMaskFilter(GrContext* context, const SkPath& path,
                                SkMaskFilter* filter, const SkMatrix& matrix,
                                const SkRegion& clip, SkBounder* bounder,
@@ -928,9 +1138,15 @@
 
         // transform the path into device space
         pathPtr->transform(*draw.fMatrix, devPathPtr);
-
-        drawWithMaskFilter(fContext, *devPathPtr, paint.getMaskFilter(),
-                           *draw.fMatrix, *draw.fClip, draw.fBounder, &grPaint);
+        // Use the software mask path if GPU blur is off or declines to draw.
+        if (!USE_GPU_BLUR ||
+            !drawWithGPUMaskFilter(fContext, *devPathPtr, paint.getMaskFilter(),
+                                   *draw.fMatrix, *draw.fClip, draw.fBounder,
+                                   &grPaint)) {
+            drawWithMaskFilter(fContext, *devPathPtr, paint.getMaskFilter(),
+                               *draw.fMatrix, *draw.fClip, draw.fBounder,
+                               &grPaint);
+        }
         return;
     }