diff --git a/gyp/gpu.gyp b/gyp/gpu.gyp
index 04cb4e3..621ea02 100644
--- a/gyp/gpu.gyp
+++ b/gyp/gpu.gyp
@@ -187,6 +187,7 @@
         '../include/gpu/GrNoncopyable.h',
         '../include/gpu/GrPaint.h',
         '../include/gpu/GrPoint.h',
+        '../include/gpu/GrProgramStageFactory.h',
         '../include/gpu/GrRect.h',
         '../include/gpu/GrRefCnt.h',
         '../include/gpu/GrRenderTarget.h',
@@ -247,8 +248,6 @@
         '../src/gpu/GrPathUtils.cpp',
         '../src/gpu/GrPathUtils.h',
         '../src/gpu/GrPlotMgr.h',
-        '../src/gpu/GrProgramStageFactory.cpp',
-        '../src/gpu/GrProgramStageFactory.h',
         '../src/gpu/GrRandom.h',
         '../src/gpu/GrRectanizer.cpp',
         '../src/gpu/GrRectanizer.h',
diff --git a/include/gpu/GrCustomStage.h b/include/gpu/GrCustomStage.h
index 64bcce3..908b10d 100644
--- a/include/gpu/GrCustomStage.h
+++ b/include/gpu/GrCustomStage.h
@@ -9,16 +9,18 @@
 #define GrCustomStage_DEFINED
 
 #include "GrRefCnt.h"
+#include "GrNoncopyable.h"
+#include "GrProgramStageFactory.h"
+#include "SkTemplates.h"
 
 class GrContext;
-class GrProgramStageFactory;
 
 /** Provides custom vertex shader, fragment shader, uniform data for a
-    particular stage of the Ganesh shading pipeline.
-    TODO: may want to refcount these? */
+    particular stage of the Ganesh shading pipeline. */
 class GrCustomStage : public GrRefCnt {
 
 public:
+    typedef GrProgramStageFactory::StageKey StageKey;
 
     GrCustomStage();
     virtual ~GrCustomStage();
@@ -31,10 +33,24 @@
         stage guaranteed to produce an opaque output? */
     virtual bool isOpaque(bool inputTextureIsOpaque) const;
 
-    /** This pointer, besides creating back-end-specific helper
-        objects, is used for run-time-type-identification. Every
-        subclass must return a consistent unique value for it. */
-    virtual GrProgramStageFactory* getFactory() const = 0;
+    /** This object, besides creating back-end-specific helper
+        objects, is used for run-time-type-identification. The factory should be
+        an instance of templated class, GrTProgramStageFactory. It is templated
+        on the subclass of GrCustomStage. The subclass must have a nested type
+        (or typedef) named GLProgramStage which will be the subclass of
+        GrGLProgramStage created by the factory.
+
+        Example:
+        class MyCustomStage : public GrCustomStage {
+        ...
+            virtual const GrProgramStageFactory& getFactory() const 
+                                                            SK_OVERRIDE {
+                return GrTProgramStageFactory<MyCustomStage>::getInstance();
+            }
+        ...
+        };
+     */
+    virtual const GrProgramStageFactory& getFactory() const = 0;
 
     /** Returns true if the other custom stage will generate
         equal output.
diff --git a/include/gpu/GrProgramStageFactory.h b/include/gpu/GrProgramStageFactory.h
new file mode 100644
index 0000000..e73b9ba
--- /dev/null
+++ b/include/gpu/GrProgramStageFactory.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrProgramStageFactory_DEFINED
+#define GrProgramStageFactory_DEFINED
+
+#include "GrTypes.h"
+
+/** Given a GrCustomStage of a particular type, creates the corresponding
+    graphics-backend-specific GrProgramStage. Also tracks equivalence
+    of shaders generated via a key.
+ */
+
+class GrCustomStage;
+class GrGLProgramStage;
+
+class GrProgramStageFactory : public GrNoncopyable {
+public:
+    typedef uint16_t StageKey;
+    enum {
+        kProgramStageKeyBits = 10,
+    };
+
+    virtual StageKey stageKey(const GrCustomStage* stage) const = 0;
+    virtual GrGLProgramStage* createGLInstance(
+        const GrCustomStage* stage) const = 0;
+
+    bool operator ==(const GrProgramStageFactory& b) const {
+        return fStageClassID == b.fStageClassID;
+    }
+    bool operator !=(const GrProgramStageFactory& b) const {
+        return !(*this == b);
+    }
+
+protected:
+    enum {
+        kIllegalStageClassID = 0,
+    };
+
+    GrProgramStageFactory() {
+        fStageClassID = kIllegalStageClassID;
+    }
+
+    static StageKey GenID() {
+        // fCurrStageClassID has been initialized to kIllegalStageClassID. The
+        // atomic inc returns the old value not the incremented value. So we add
+        // 1 to the returned value.
+        int32_t id = sk_atomic_inc(&fCurrStageClassID) + 1;
+        GrAssert(id < (1 << (8 * sizeof(StageKey) - kProgramStageKeyBits)));
+        return id;
+    }
+
+    StageKey fStageClassID;
+
+private:
+    static int32_t fCurrStageClassID;
+};
+
+template <typename StageClass>
+class GrTProgramStageFactory : public GrProgramStageFactory {
+
+public:
+    typedef typename StageClass::GLProgramStage GLProgramStage; 
+    
+    /** Returns an value that idenitifes the shader code generated by
+        a GrCustomStage. This enables caching of generated shaders. Part of the
+        id identifies the GrCustomShader subclass. The remainder is based
+        on the aspects of the GrCustomStage object's configuration that affect
+        code generation. */
+    virtual StageKey stageKey(const GrCustomStage* stage) const SK_OVERRIDE {
+        GrAssert(kIllegalStageClassID != fStageClassID);
+        StageKey stageID = GLProgramStage::GenKey(stage);
+        static const StageKey kIllegalIDMask =
+            ~((1 << kProgramStageKeyBits) - 1);
+        GrAssert(!(kIllegalIDMask & stageID));
+        return fStageClassID | stageID;
+    }
+
+    /** Returns a new instance of the appropriate *GL* implementation class
+     for the given GrCustomStage; caller is responsible for deleting
+     the object. */
+    virtual GLProgramStage* createGLInstance(
+                        const GrCustomStage* stage) const SK_OVERRIDE {
+        return new GLProgramStage(stage);
+    }
+
+    static const GrProgramStageFactory& getInstance() {
+        static SkAlignedSTStorage<1, GrTProgramStageFactory> gInstanceMem;
+        static const GrTProgramStageFactory* gInstance;
+        if (!gInstance) {
+            gInstance = new (gInstanceMem.get()) GrTProgramStageFactory();
+        }
+        return *gInstance;
+    }
+
+protected:
+    GrTProgramStageFactory() {
+        fStageClassID = GenID() << kProgramStageKeyBits;
+    }
+};
+
+#endif
diff --git a/src/gpu/GrCustomStage.cpp b/src/gpu/GrCustomStage.cpp
index f63c79a..c7d0844 100644
--- a/src/gpu/GrCustomStage.cpp
+++ b/src/gpu/GrCustomStage.cpp
@@ -8,8 +8,10 @@
 #include "GrContext.h"
 #include "GrCustomStage.h"
 
-GrCustomStage::GrCustomStage() {
+int32_t GrProgramStageFactory::fCurrStageClassID =
+                                    GrProgramStageFactory::kIllegalStageClassID;
 
+GrCustomStage::GrCustomStage() {
 }
 
 GrCustomStage::~GrCustomStage() {
diff --git a/src/gpu/GrProgramStageFactory.cpp b/src/gpu/GrProgramStageFactory.cpp
deleted file mode 100644
index 4bacf92..0000000
--- a/src/gpu/GrProgramStageFactory.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "GrProgramStageFactory.h"
-
-GrProgramStageFactory::~GrProgramStageFactory(void) {
-
-}
-
-uint16_t GrProgramStageFactory::stageKey(const GrCustomStage*) {
-    return 0;
-}
-
diff --git a/src/gpu/GrProgramStageFactory.h b/src/gpu/GrProgramStageFactory.h
deleted file mode 100644
index d3580d5..0000000
--- a/src/gpu/GrProgramStageFactory.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2012 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef GrProgramStageFactory_DEFINED
-#define GrProgramStageFactory_DEFINED
-
-#include "GrTypes.h"
-
-class GrCustomStage;
-class GrGLProgramStage;
-
-/** Given a GrCustomStage of a particular type, creates the corresponding
-    graphics-backend-specific GrProgramStage. Also tracks equivalence
-    of shaders generated via stageKey().
-
-    TODO: most of this class' subclasses are boilerplate and ought to be
-    templateable?
-*/
-
-class GrProgramStageFactory {
-
-public:
-
-    virtual ~GrProgramStageFactory();
-
-    /** Returns a short unique identifier for this subclass x its
-        parameters. If the key differs, different shader code must
-        be generated; if the key matches, shader code can be reused.
-        0 == no custom stage. */
-    virtual uint16_t stageKey(const GrCustomStage*);
-
-    /** Returns a new instance of the appropriate *GL* implementation class
-        for the given GrCustomStage; caller is responsible for deleting
-        the object. */
-    virtual GrGLProgramStage* createGLInstance(GrCustomStage*) = 0;
-
-protected:
-
-    /** Disable default constructor - instances should be singletons
-        with static factory functions: our test examples are all stateless,
-        but we suspect that future implementations may want to cache data? */
-    GrProgramStageFactory() { }
-};
-
-
-#endif
diff --git a/src/gpu/effects/GrConvolutionEffect.cpp b/src/gpu/effects/GrConvolutionEffect.cpp
index d63b023..563d1a4 100644
--- a/src/gpu/effects/GrConvolutionEffect.cpp
+++ b/src/gpu/effects/GrConvolutionEffect.cpp
@@ -17,7 +17,7 @@
 
 public:
 
-    GrGLConvolutionEffect(GrConvolutionEffect* data);
+    GrGLConvolutionEffect(const GrCustomStage* stage);
     virtual const char* name() const SK_OVERRIDE;
     virtual void setupVSUnis(VarArray* vsUnis, int stage) SK_OVERRIDE;
     virtual void setupFSUnis(VarArray* fsUnis, int stage) SK_OVERRIDE;
@@ -30,13 +30,14 @@
                         const char* sampleCoords) SK_OVERRIDE;
     virtual void initUniforms(const GrGLInterface*, int programID) SK_OVERRIDE;
 
-    virtual void setData(const GrGLInterface*, GrCustomStage*,
+    virtual void setData(const GrGLInterface*, const GrCustomStage*,
                          const GrGLTexture*) SK_OVERRIDE;
 
+    static inline StageKey GenKey(const GrCustomStage* s);
+    
 protected:
 
-    GrConvolutionEffect* fData;
- 
+    int            fKernelWidth;
     GrGLShaderVar* fKernelVar;
     GrGLShaderVar* fImageIncrementVar;
  
@@ -48,18 +49,16 @@
     typedef GrGLProgramStage INHERITED;
 };
 
-GrGLConvolutionEffect::GrGLConvolutionEffect(GrConvolutionEffect* data)
-    : fData(data)
-    , fKernelVar(NULL)
+GrGLConvolutionEffect::GrGLConvolutionEffect(const GrCustomStage* data)
+    : fKernelVar(NULL)
     , fImageIncrementVar(NULL)
     , fKernelLocation(0)
-    , fImageIncrementLocation(0)
-{
-
+    , fImageIncrementLocation(0) {
+    fKernelWidth = static_cast<const GrConvolutionEffect*>(data)->width();
 }
 
 const char* GrGLConvolutionEffect::name() const {
-    return fData->name();
+    return GrConvolutionEffect::Name();
 }
 
 void GrGLConvolutionEffect::setupVSUnis(VarArray* vsUnis,
@@ -81,7 +80,7 @@
     fKernelVar->setType(kFloat_GrSLType);
     fKernelVar->setTypeModifier(
         GrGLShaderVar::kUniform_TypeModifier);
-    fKernelVar->setArrayCount(fData->fKernelWidth);
+    fKernelVar->setArrayCount(fKernelWidth);
     (*fKernelVar->accessName()) = "uKernel";
     fKernelVar->accessName()->appendS32(stage);
 
@@ -93,7 +92,7 @@
 
 void GrGLConvolutionEffect::emitVS(GrStringBuilder* code,
                         const char* vertexCoords) {
-    float scale = (fData->fKernelWidth - 1) * 0.5f;
+    float scale = (fKernelWidth - 1) * 0.5f;
     code->appendf("\t\t%s -= vec2(%g, %g) * %s;\n",
                   vertexCoords, scale, scale,
                   fImageIncrementVar->getName().c_str());
@@ -121,7 +120,7 @@
     code->appendf("\t\tvec4 sum = vec4(0, 0, 0, 0);\n");
     code->appendf("\t\tvec2 coord = %s;\n", sampleCoords);
     code->appendf("\t\tfor (int i = 0; i < %d; i++) {\n",
-                  fData->fKernelWidth);
+                  fKernelWidth);
 
     code->appendf("\t\t\tsum += ");
     this->emitTextureLookup(code, samplerName, "coord");
@@ -143,14 +142,17 @@
 }
 
 void GrGLConvolutionEffect::setData(const GrGLInterface* gl,
-                                    GrCustomStage* data,
+                                    const GrCustomStage* data,
                                     const GrGLTexture* texture) {
-    fData = static_cast<GrConvolutionEffect*>(data);
+    const GrConvolutionEffect* conv =
+        static_cast<const GrConvolutionEffect*>(data);
+    // the code we generated was for a specific kernel width
+    GrAssert(conv->width() == fKernelWidth);
     GR_GL_CALL(gl, Uniform1fv(fKernelLocation,
-                              fData->fKernelWidth,
-                              fData->fKernel));
+                              fKernelWidth,
+                              conv->kernel()));
     float imageIncrement[2] = { 0 };
-    switch (fData->fDirection) {
+    switch (conv->direction()) {
         case GrSamplerState::kX_FilterDirection:
             imageIncrement[0] = 1.0f / texture->width();
             break;
@@ -163,58 +165,9 @@
     GR_GL_CALL(gl, Uniform2fv(fImageIncrementLocation, 1, imageIncrement));
 }
 
-/////////////////////////////////////////////////////////////////////
-// TODO: stageKey() and sEffectId are the only non-boilerplate in
-// this class; we ought to be able to templatize?
-
-class GrConvolutionEffectFactory : public GrProgramStageFactory {
-
-public:
-
-    virtual ~GrConvolutionEffectFactory();
-
-    virtual uint16_t stageKey(const GrCustomStage* s) SK_OVERRIDE;
-    virtual GrGLProgramStage* createGLInstance(GrCustomStage* s) SK_OVERRIDE;
-
-    static GrConvolutionEffectFactory* getInstance();
-
-protected:
-
-    GrConvolutionEffectFactory();
-
-    // TODO: find a more reliable installation than hand-coding
-    // id values like '1'. 
-    static const int sEffectId = 1;
-
-private:
-
-    typedef GrProgramStageFactory INHERITED;
-};
-
-GrConvolutionEffectFactory::~GrConvolutionEffectFactory() {
-
-}
-
-uint16_t GrConvolutionEffectFactory::stageKey(const GrCustomStage* s) {
-    const GrConvolutionEffect* c =
-        static_cast<const GrConvolutionEffect*>(s);
-    GrAssert(c->width() < 256);
-    return (sEffectId << 8) | (c->width() & 0xff);
-}
-
-GrGLProgramStage* GrConvolutionEffectFactory::createGLInstance(
-    GrCustomStage* s) {
-    return new GrGLConvolutionEffect(static_cast<GrConvolutionEffect*>(s));
-}
-
-GrConvolutionEffectFactory* GrConvolutionEffectFactory::getInstance() {
-    static GrConvolutionEffectFactory* instance =
-        new GrConvolutionEffectFactory;
-    return instance;
-}
-
-GrConvolutionEffectFactory::GrConvolutionEffectFactory() {
-
+GrGLProgramStage::StageKey GrGLConvolutionEffect::GenKey(
+                                                    const GrCustomStage* s) {
+    return static_cast<const GrConvolutionEffect*>(s)->width();
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -236,11 +189,12 @@
 }
 
 const char* GrConvolutionEffect::name() const {
-    return "Convolution";
+    return Name();
 }
 
-GrProgramStageFactory* GrConvolutionEffect::getFactory() const {
-    return GrConvolutionEffectFactory::getInstance();
+
+const GrProgramStageFactory& GrConvolutionEffect::getFactory() const {
+    return GrTProgramStageFactory<GrConvolutionEffect>::getInstance();
 }
 
 bool GrConvolutionEffect::isEqual(const GrCustomStage * sBase) const {
diff --git a/src/gpu/effects/GrConvolutionEffect.h b/src/gpu/effects/GrConvolutionEffect.h
index e3c3aa9..4fdddd9 100644
--- a/src/gpu/effects/GrConvolutionEffect.h
+++ b/src/gpu/effects/GrConvolutionEffect.h
@@ -11,6 +11,8 @@
 #include "GrCustomStage.h"
 #include "GrSamplerState.h" // for MAX_KENEL_WIDTH, FilterDirection
 
+class GrGLConvolutionEffect;
+
 class GrConvolutionEffect : public GrCustomStage {
 
 public:
@@ -19,11 +21,17 @@
                         unsigned int kernelWidth, const float* kernel);
     virtual ~GrConvolutionEffect();
 
-    virtual const char* name() const SK_OVERRIDE;
-    virtual GrProgramStageFactory* getFactory() const SK_OVERRIDE;
-    virtual bool isEqual(const GrCustomStage *) const SK_OVERRIDE;
-
     unsigned int width() const { return fKernelWidth; }
+    const float* kernel() const { return fKernel; }
+    GrSamplerState::FilterDirection direction() const { return fDirection; }
+    
+    static const char* Name() { return "Convolution"; }
+
+    typedef GrGLConvolutionEffect GLProgramStage;
+    
+    virtual const char* name() const SK_OVERRIDE;
+    virtual const GrProgramStageFactory& getFactory() const SK_OVERRIDE;
+    virtual bool isEqual(const GrCustomStage *) const SK_OVERRIDE;
 
 protected:
 
@@ -31,7 +39,6 @@
     unsigned int fKernelWidth;
     float fKernel[MAX_KERNEL_WIDTH];
 
-    friend class GrGLConvolutionEffect;
 
 private:
 
diff --git a/src/gpu/gl/GrGLProgram.cpp b/src/gpu/gl/GrGLProgram.cpp
index e576163..755a2f8 100644
--- a/src/gpu/gl/GrGLProgram.cpp
+++ b/src/gpu/gl/GrGLProgram.cpp
@@ -629,10 +629,10 @@
                 }
 
                 if (NULL != customStages[s]) {
-                    GrProgramStageFactory* factory =
+                    const GrProgramStageFactory& factory =
                         customStages[s]->getFactory();
                     programData->fCustomStage[s] =
-                        factory->createGLInstance(customStages[s]);
+                        factory.createGLInstance(customStages[s]);
                 }
                 this->genStageCode(gl,
                                    s,
@@ -755,10 +755,10 @@
                     }
 
                     if (NULL != customStages[s]) {
-                        GrProgramStageFactory* factory =
+                        const GrProgramStageFactory& factory =
                             customStages[s]->getFactory();
                         programData->fCustomStage[s] =
-                            factory->createGLInstance(customStages[s]);
+                            factory.createGLInstance(customStages[s]);
                     }
                     this->genStageCode(gl, s,
                         fProgramDesc.fStages[s],
diff --git a/src/gpu/gl/GrGLProgramStage.cpp b/src/gpu/gl/GrGLProgramStage.cpp
index 06c4d93..28a711d 100644
--- a/src/gpu/gl/GrGLProgramStage.cpp
+++ b/src/gpu/gl/GrGLProgramStage.cpp
@@ -25,7 +25,7 @@
 
 }
 
-void GrGLProgramStage::setData(const GrGLInterface*, GrCustomStage*,
+void GrGLProgramStage::setData(const GrGLInterface*, const GrCustomStage*,
                                const GrGLTexture*) {
 
 }
diff --git a/src/gpu/gl/GrGLProgramStage.h b/src/gpu/gl/GrGLProgramStage.h
index 61e62e5..2141640 100644
--- a/src/gpu/gl/GrGLProgramStage.h
+++ b/src/gpu/gl/GrGLProgramStage.h
@@ -9,12 +9,12 @@
 #define GrGLCustomStage_DEFINED
 
 #include "GrAllocator.h"
+#include "GrCustomStage.h"
 #include "GrGLShaderBuilder.h"
 #include "GrGLShaderVar.h"
 #include "GrGLSL.h"
 #include "GrStringBuilder.h"
 
-class GrCustomStage;
 struct GrGLInterface;
 class GrGLTexture;
 
@@ -31,6 +31,7 @@
 class GrGLProgramStage {
 
 public:
+    typedef GrCustomStage::StageKey StageKey ;
     // TODO: redundant with GrGLProgram.cpp
     enum {
         kUnusedUniform = -1,
@@ -90,7 +91,7 @@
         are to be read.
         TODO: since we don't have a factory, we can't assert to enforce
         this. Shouldn't we? */
-    virtual void setData(const GrGLInterface*, GrCustomStage*,
+    virtual void setData(const GrGLInterface*, const GrCustomStage*,
                          const GrGLTexture*);
 
     // TODO: needs a better name
diff --git a/src/gpu/gl/GrGpuGLShaders.cpp b/src/gpu/gl/GrGpuGLShaders.cpp
index d32c57e..08cfaf7 100644
--- a/src/gpu/gl/GrGpuGLShaders.cpp
+++ b/src/gpu/gl/GrGpuGLShaders.cpp
@@ -302,7 +302,7 @@
                     (GrSamplerState::FilterDirection)direction,
                     stage.fKernelWidth, kernel);
                 stage.fCustomStageKey =
-                    customStages[s]->getFactory()->stageKey(customStages[s]);
+                    customStages[s]->getFactory().stageKey(customStages[s]);
             }
         }
         CachedData cachedData;
@@ -933,8 +933,8 @@
                         GrGLProgram* program, int index) {
     GrCustomStage* customStage = sampler.getCustomStage();
     if (customStage) {
-        GrProgramStageFactory* factory = customStage->getFactory();
-        stage->fCustomStageKey = factory->stageKey(customStage);
+        const GrProgramStageFactory& factory = customStage->getFactory();
+        stage->fCustomStageKey = factory.stageKey(customStage);
         customStages[index] = customStage;
     } else {
         stage->fCustomStageKey = 0;
