Threading RS ForEach.

Change-Id: I5d6fe4db2b6ac0613394bc5a066ff90ec146d60e
diff --git a/libs/rs/java/ImageProcessing/res/raw/horizontal_blur.rs b/libs/rs/java/ImageProcessing/res/raw/horizontal_blur.rs
index 7b0e6bc..10815fb 100644
--- a/libs/rs/java/ImageProcessing/res/raw/horizontal_blur.rs
+++ b/libs/rs/java/ImageProcessing/res/raw/horizontal_blur.rs
@@ -5,17 +5,14 @@
 
 #include "ip.rsh"
 
-uchar4 * ScratchPixel;
-
-#pragma rs export_var(ScratchPixel)
-
 void root(const void *v_in, void *v_out, const void *usrData, uint32_t x, uint32_t y) {
     uchar4 *output = (uchar4 *)v_out;
-    const uchar4 *input = (uchar4 *)v_in;
     const FilterStruct *fs = (const FilterStruct *)usrData;
+    const uchar4 *input = (const uchar4 *)rsGetElementAt(fs->ain, 0, y);
 
     float4 blurredPixel = 0;
     float4 currentPixel = 0;
+
     for(int r = -fs->radius; r <= fs->radius; r ++) {
         // Stepping left and right away from the pixel
         int validW = x + r;
diff --git a/libs/rs/java/ImageProcessing/res/raw/horizontal_blur_bc.bc b/libs/rs/java/ImageProcessing/res/raw/horizontal_blur_bc.bc
index c9ba5d9..5920f3a 100644
--- a/libs/rs/java/ImageProcessing/res/raw/horizontal_blur_bc.bc
+++ b/libs/rs/java/ImageProcessing/res/raw/horizontal_blur_bc.bc
Binary files differ
diff --git a/libs/rs/java/ImageProcessing/res/raw/ip.rsh b/libs/rs/java/ImageProcessing/res/raw/ip.rsh
index 4073304..dea92c3 100644
--- a/libs/rs/java/ImageProcessing/res/raw/ip.rsh
+++ b/libs/rs/java/ImageProcessing/res/raw/ip.rsh
@@ -3,6 +3,8 @@
 #define MAX_RADIUS 25
 
 typedef struct {
+    rs_allocation ain;
+
     float *gaussian; //[MAX_RADIUS * 2 + 1];
     rs_matrix3x3 colorMat;
 
diff --git a/libs/rs/java/ImageProcessing/res/raw/threshold.rs b/libs/rs/java/ImageProcessing/res/raw/threshold.rs
index ecbfac4..aa6b6fa 100644
--- a/libs/rs/java/ImageProcessing/res/raw/threshold.rs
+++ b/libs/rs/java/ImageProcessing/res/raw/threshold.rs
@@ -24,7 +24,6 @@
 static float inWMinInB;
 static float outWMinOutB;
 static float overInWMinInB;
-static FilterStruct filterStruct;
 
 #pragma rs export_var(height, width, radius, InPixel, OutPixel, ScratchPixel, inBlack, outBlack, inWhite, outWhite, gamma, saturation, InPixel, OutPixel, ScratchPixel, vBlurScript, hBlurScript)
 #pragma rs export_func(filter, filterBenchmark);
@@ -106,138 +105,70 @@
     }
 }
 
-// This needs to be inline
-static float4 levelsSaturation(float4 currentPixel) {
-    float3 temp = rsMatrixMultiply(&colorMat, currentPixel.xyz);
-    temp = (clamp(temp, 0.1f, 255.f) - inBlack) * overInWMinInB;
-    temp = pow(temp, (float3)gamma);
-    currentPixel.xyz = clamp(temp * outWMinOutB + outBlack, 0.1f, 255.f);
-    return currentPixel;
-}
-
 static void processNoBlur() {
-    int w, h, r;
-    int count = 0;
-
     float inWMinInB = inWhite - inBlack;
     float outWMinOutB = outWhite - outBlack;
     float4 currentPixel = 0;
 
-    for(h = 0; h < height; h ++) {
-        for(w = 0; w < width; w ++) {
-            uchar4 *input = InPixel + h*width + w;
+    for(int h = 0; h < height; h ++) {
+        uchar4 *input = InPixel + h*width;
+        uchar4 *output = OutPixel + h*width;
 
+        for(int w = 0; w < width; w ++) {
             //currentPixel.xyz = convert_float3(input.xyz);
             currentPixel.x = (float)(input->x);
             currentPixel.y = (float)(input->y);
             currentPixel.z = (float)(input->z);
 
-            currentPixel = levelsSaturation(currentPixel);
+            float3 temp = rsMatrixMultiply(&colorMat, currentPixel.xyz);
+            temp = (clamp(temp, 0.f, 255.f) - inBlack) * overInWMinInB;
+            temp = pow(temp, (float3)gamma);
+            currentPixel.xyz = clamp(temp * outWMinOutB + outBlack, 0.f, 255.f);
 
-            uchar4 *output = OutPixel + h*width + w;
             //output.xyz = convert_uchar3(currentPixel.xyz);
             output->x = (uint8_t)currentPixel.x;
             output->y = (uint8_t)currentPixel.y;
             output->z = (uint8_t)currentPixel.z;
             output->w = input->w;
-        }
-    }
-    rsSendToClient(&count, 1, 4, 0);
-}
 
-static void horizontalBlurLevels() {
-    float4 blurredPixel = 0;
-    float4 currentPixel = 0;
-    // Horizontal blur
-    int w, h, r;
-    for(h = 0; h < height; h ++) {
-        uchar4 *output = OutPixel + h*width;
-
-        for(w = 0; w < width; w ++) {
-            blurredPixel = 0;
-
-            for(r = -radius; r <= radius; r ++) {
-                // Stepping left and right away from the pixel
-                int validW = w + r;
-                // Clamp to zero and width max() isn't exposed for ints yet
-                if(validW < 0) {
-                    validW = 0;
-                }
-                if(validW > width - 1) {
-                    validW = width - 1;
-                }
-                //int validW = rsClamp(w + r, 0, width - 1);
-
-                uchar4 *input = InPixel + h*width + validW;
-
-                float weight = gaussian[r + radius];
-                currentPixel.x = (float)(input->x);
-                currentPixel.y = (float)(input->y);
-                currentPixel.z = (float)(input->z);
-                //currentPixel.w = (float)(input->a);
-
-                blurredPixel.xyz += currentPixel.xyz * weight;
-            }
-
-            blurredPixel = levelsSaturation(blurredPixel);
-
-            output->x = (uint8_t)blurredPixel.x;
-            output->y = (uint8_t)blurredPixel.y;
-            output->z = (uint8_t)blurredPixel.z;
-            //output->a = (uint8_t)blurredPixel.w;
+            input++;
             output++;
         }
     }
 }
 
-static void initStructs() {
-    filterStruct.gaussian = gaussian;
-    filterStruct.width = width;
-    filterStruct.height = height;
-    filterStruct.radius = radius;
+static void blur() {
+    computeGaussianWeights();
+
+    FilterStruct fs;
+    fs.gaussian = gaussian;
+    fs.width = width;
+    fs.height = height;
+    fs.radius = radius;
+
+    fs.ain = rsGetAllocation(InPixel);
+    rsForEach(hBlurScript, fs.ain, rsGetAllocation(ScratchPixel), &fs);
+
+    fs.ain = rsGetAllocation(ScratchPixel);
+    rsForEach(vBlurScript, fs.ain, rsGetAllocation(OutPixel), &fs);
 }
 
 void filter() {
-    RS_DEBUG(height);
-    RS_DEBUG(width);
     RS_DEBUG(radius);
 
-    initStructs();
-
     computeColorMatrix();
 
-    if(radius == 0) {
-        processNoBlur();
-        return;
+    if(radius > 0) {
+        blur();
     }
-
-    computeGaussianWeights();
-
-    horizontalBlurLevels();
-
-    rsForEach(vBlurScript,
-              rsGetAllocation(InPixel),
-              rsGetAllocation(OutPixel),
-              &filterStruct);
+    processNoBlur();
 
     int count = 0;
     rsSendToClient(&count, 1, 4, 0);
 }
 
 void filterBenchmark() {
-    initStructs();
-
-    computeGaussianWeights();
-
-    rsForEach(hBlurScript,
-              rsGetAllocation(InPixel),
-              rsGetAllocation(OutPixel),
-              &filterStruct);
-
-    rsForEach(vBlurScript,
-              rsGetAllocation(InPixel),
-              rsGetAllocation(OutPixel),
-              &filterStruct);
+    blur();
 
     int count = 0;
     rsSendToClient(&count, 1, 4, 0);
diff --git a/libs/rs/java/ImageProcessing/res/raw/threshold_bc.bc b/libs/rs/java/ImageProcessing/res/raw/threshold_bc.bc
index 8f37fdc..2b5d254 100644
--- a/libs/rs/java/ImageProcessing/res/raw/threshold_bc.bc
+++ b/libs/rs/java/ImageProcessing/res/raw/threshold_bc.bc
Binary files differ
diff --git a/libs/rs/java/ImageProcessing/res/raw/vertical_blur.rs b/libs/rs/java/ImageProcessing/res/raw/vertical_blur.rs
index 846f515..f5f2d69 100644
--- a/libs/rs/java/ImageProcessing/res/raw/vertical_blur.rs
+++ b/libs/rs/java/ImageProcessing/res/raw/vertical_blur.rs
@@ -5,14 +5,10 @@
 
 #include "ip.rsh"
 
-uchar4 * ScratchPixel;
-
-#pragma rs export_var(ScratchPixel)
-
 void root(const void *v_in, void *v_out, const void *usrData, uint32_t x, uint32_t y) {
     uchar4 *output = (uchar4 *)v_out;
-    const uchar4 *input = (uchar4 *)v_in;
     const FilterStruct *fs = (const FilterStruct *)usrData;
+    const uchar4 *input = (const uchar4 *)rsGetElementAt(fs->ain, x, 0);
 
     float4 blurredPixel = 0;
     float4 currentPixel = 0;
@@ -27,19 +23,21 @@
             validH = fs->height - 1;
         }
 
-        uchar4 *input = ScratchPixel + validH * fs->width + x;
+        const uchar4 *i = input + validH * fs->width;
+        //const uchar4 *i = (const uchar4 *)rsGetElementAt(fs->ain, x, validH);
 
         float weight = fs->gaussian[r + fs->radius];
 
-        currentPixel.x = (float)(input->x);
-        currentPixel.y = (float)(input->y);
-        currentPixel.z = (float)(input->z);
+        currentPixel.x = (float)(i->x);
+        currentPixel.y = (float)(i->y);
+        currentPixel.z = (float)(i->z);
 
         blurredPixel.xyz += currentPixel.xyz * weight;
 #else
         int validH = rsClamp(y + r, 0, height - 1);
-        uchar4 *input = ScratchPixel + validH * width + x;
-        blurredPixel.xyz += convert_float3(input->xyz) * gaussian[r + fs->radius];
+        validH -= y;
+        uchar4 *i = input + validH * width + x;
+        blurredPixel.xyz += convert_float3(i->xyz) * gaussian[r + fs->radius];
 #endif
     }
 
diff --git a/libs/rs/java/ImageProcessing/res/raw/vertical_blur_bc.bc b/libs/rs/java/ImageProcessing/res/raw/vertical_blur_bc.bc
index af1cd8e..be5d0e4 100644
--- a/libs/rs/java/ImageProcessing/res/raw/vertical_blur_bc.bc
+++ b/libs/rs/java/ImageProcessing/res/raw/vertical_blur_bc.bc
Binary files differ
diff --git a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
index 21c3d74..0ed1185 100644
--- a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
+++ b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
@@ -376,10 +376,7 @@
         mScratchPixelsAllocation = Allocation.createBitmapRef(mRS, mBitmapScratch);
 
         mScriptVBlur = new ScriptC_Vertical_blur(mRS, getResources(), R.raw.vertical_blur_bc, false);
-        mScriptVBlur.bind_ScratchPixel(mScratchPixelsAllocation);
-
         mScriptHBlur = new ScriptC_Horizontal_blur(mRS, getResources(), R.raw.horizontal_blur_bc, false);
-        mScriptHBlur.bind_ScratchPixel(mScratchPixelsAllocation);
 
         mScript = new ScriptC_Threshold(mRS, getResources(), R.raw.threshold_bc, false);
         mScript.set_width(mBitmapIn.getWidth());
@@ -431,8 +428,8 @@
         android.util.Log.v("Img", "Renderscript frame time core ms " + t);
 
         long javaTime = javaFilter();
-
         mBenchmarkResult.setText("RS: " + t + " ms  Java: " + javaTime + " ms");
+        //mBenchmarkResult.setText("RS: " + t + " ms");
 
         mRadius = oldRadius;
         mScript.set_radius(mRadius);
diff --git a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java
index 8ee50a8..c447b9b 100644
--- a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java
+++ b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Horizontal_blur.java
@@ -26,17 +26,5 @@
         super(rs, resources, id, isRoot);
     }
 
-    private final static int mExportVarIdx_ScratchPixel = 0;
-    private Allocation mExportVar_ScratchPixel;
-    public void bind_ScratchPixel(Allocation v) {
-        mExportVar_ScratchPixel = v;
-        if(v == null) bindAllocation(null, mExportVarIdx_ScratchPixel);
-        else bindAllocation(v, mExportVarIdx_ScratchPixel);
-    }
-
-    public Allocation get_ScratchPixel() {
-        return mExportVar_ScratchPixel;
-    }
-
 }
 
diff --git a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java
index 0215f60..cee74d9 100644
--- a/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java
+++ b/libs/rs/java/ImageProcessing/src/com/android/rs/image/ScriptC_Vertical_blur.java
@@ -26,17 +26,5 @@
         super(rs, resources, id, isRoot);
     }
 
-    private final static int mExportVarIdx_ScratchPixel = 0;
-    private Allocation mExportVar_ScratchPixel;
-    public void bind_ScratchPixel(Allocation v) {
-        mExportVar_ScratchPixel = v;
-        if(v == null) bindAllocation(null, mExportVarIdx_ScratchPixel);
-        else bindAllocation(v, mExportVarIdx_ScratchPixel);
-    }
-
-    public Allocation get_ScratchPixel() {
-        return mExportVar_ScratchPixel;
-    }
-
 }
 
diff --git a/libs/rs/rsContext.cpp b/libs/rs/rsContext.cpp
index 68eca44..629b481 100644
--- a/libs/rs/rsContext.cpp
+++ b/libs/rs/rsContext.cpp
@@ -23,6 +23,7 @@
 
 #include <sys/types.h>
 #include <sys/resource.h>
+#include <sched.h>
 
 #include <cutils/properties.h>
 
@@ -355,6 +356,49 @@
      return NULL;
 }
 
+void * Context::helperThreadProc(void *vrsc)
+{
+     // Entry point for each ForEach worker thread; vrsc is the owning Context.
+     // Each worker takes a slot index from mLaunchCount, then loops: wait for
+     // its launch signal, run the current callback (if any), report completion.
+     Context *rsc = static_cast<Context *>(vrsc);
+     uint32_t idx = (uint32_t)android_atomic_inc(&rsc->mWorkers.mLaunchCount);
+
+     LOGV("RS helper thread starting %p idx=%u", rsc, idx);
+
+     // NOTE(review): the launch signal is initialized here, on the worker
+     // itself; confirm launchThreads() cannot set() it before this point.
+     rsc->mWorkers.mLaunchSignals[idx].init();
+     rsc->mWorkers.mNativeThreadId[idx] = gettid();
+
+     // TODO: consider pinning workers to cores (sched_getaffinity/sched_setaffinity).
+     setpriority(PRIO_PROCESS, rsc->mWorkers.mNativeThreadId[idx], rsc->mThreadPriority);
+     while(rsc->mRunning) {
+         rsc->mWorkers.mLaunchSignals[idx].wait();
+         if (rsc->mWorkers.mLaunchCallback) {
+             rsc->mWorkers.mLaunchCallback(rsc->mWorkers.mLaunchData, idx);
+         }
+         // Drop the outstanding-worker count before waking the launcher, which
+         // re-checks mRunningCount each time the completion signal fires.
+         android_atomic_dec(&rsc->mWorkers.mRunningCount);
+         rsc->mWorkers.mCompleteSignal.set();
+     }
+     return NULL;
+}
+
+void Context::launchThreads(WorkerCallback_t cbk, void *data)
+{
+    mWorkers.mLaunchData = data;
+    mWorkers.mLaunchCallback = cbk;
+    mWorkers.mRunningCount = (int)mWorkers.mCount;
+    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
+        mWorkers.mLaunchSignals[ct].set();
+    }
+    while(mWorkers.mRunningCount) {
+        mWorkers.mCompleteSignal.wait();
+    }
+}
+
 void Context::setPriority(int32_t p)
 {
     // Note: If we put this in the proper "background" policy
@@ -371,7 +415,10 @@
         // success; reset the priority as well
     }
 #else
-        setpriority(PRIO_PROCESS, mNativeThreadId, p);
+    setpriority(PRIO_PROCESS, mNativeThreadId, p);
+    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
+        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], p);
+    }
 #endif
 }
 
@@ -421,10 +468,26 @@
     timerInit();
     timerSet(RS_TIMER_INTERNAL);
 
-    LOGV("RS Launching thread");
+    LOGV("RS Launching thread(s)");
+    mWorkers.mCount = 2;
+    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
+    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
+    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
+    mWorkers.mLaunchCallback = NULL;
     status = pthread_create(&mThreadId, &threadAttr, threadProc, this);
     if (status) {
         LOGE("Failed to start rs context thread.");
+        return;
+    }
+    mWorkers.mRunningCount = 0;
+    mWorkers.mLaunchCount = 0;
+    for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
+        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
+        if (status) {
+            mWorkers.mCount = ct;
+            LOGE("Created fewer than expected number of RS threads.");
+            break;
+        }
     }
 
     while(!mRunning) {
diff --git a/libs/rs/rsContext.h b/libs/rs/rsContext.h
index 06433a1..98ad3a4 100644
--- a/libs/rs/rsContext.h
+++ b/libs/rs/rsContext.h
@@ -65,6 +65,7 @@
         Script * mScript;
     };
 
+    typedef void (*WorkerCallback_t)(void *usr, uint32_t idx);
 
     //StructuredAllocationContext mStateAllocation;
     ElementState mStateElement;
@@ -172,6 +173,8 @@
 
     bool ext_OES_texture_npot() const {return mGL.OES_texture_npot;}
 
+    void launchThreads(WorkerCallback_t cbk, void *data);
+
 protected:
     Device *mDev;
 
@@ -222,6 +225,20 @@
     pthread_t mThreadId;
     pid_t mNativeThreadId;
 
+    struct Workers {
+        volatile int mRunningCount;
+        volatile int mLaunchCount;
+        uint32_t mCount;
+        pthread_t *mThreadId;
+        pid_t *mNativeThreadId;
+        Signal mCompleteSignal;
+
+        Signal *mLaunchSignals;
+        WorkerCallback_t mLaunchCallback;
+        void *mLaunchData;
+    };
+    Workers mWorkers;
+
     ObjectBaseRef<Script> mRootScript;
     ObjectBaseRef<ProgramFragment> mFragment;
     ObjectBaseRef<ProgramVertex> mVertex;
@@ -248,6 +265,7 @@
     uint32_t runRootScript();
 
     static void * threadProc(void *);
+    static void * helperThreadProc(void *);
 
     ANativeWindow *mWndSurface;
 
diff --git a/libs/rs/rsScriptC.cpp b/libs/rs/rsScriptC.cpp
index b87ac28..9693b16e 100644
--- a/libs/rs/rsScriptC.cpp
+++ b/libs/rs/rsScriptC.cpp
@@ -137,72 +137,155 @@
 }
 
 
+typedef struct {
+    Context *rsc;
+    ScriptC *script;
+    const Allocation * ain;
+    Allocation * aout;
+    const void * usr;
+
+    uint32_t mSliceSize;
+    volatile int mSliceNum;
+
+    const uint8_t *ptrIn;
+    uint32_t eStrideIn;
+    uint8_t *ptrOut;
+    uint32_t eStrideOut;
+
+    uint32_t xStart;
+    uint32_t xEnd;
+    uint32_t yStart;
+    uint32_t yEnd;
+    uint32_t zStart;
+    uint32_t zEnd;
+    uint32_t arrayStart;
+    uint32_t arrayEnd;
+
+    uint32_t dimX;
+    uint32_t dimY;
+    uint32_t dimZ;
+    uint32_t dimArray;
+} MTLaunchStruct;
+typedef int (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
+
+// Worker callback for a 2D launch: repeatedly claims the next slice of
+// mSliceSize rows from the shared mSliceNum counter and runs the script root
+// on every cell of those rows; returns once a claimed slice starts at/after yEnd.
+static void wc_xy(void *usr, uint32_t idx)
+{
+    MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
+    LOGV("usr %p, idx %u", usr, idx);
+
+    while (1) {
+        uint32_t slice = (uint32_t)android_atomic_inc(&mtls->mSliceNum);
+        uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
+        uint32_t yEnd = yStart + mtls->mSliceSize;
+        yEnd = rsMin(yEnd, mtls->yEnd);
+        if (yEnd <= yStart) {
+            return;
+        }
+
+        for (uint32_t y = yStart; y < yEnd; y++) {
+            // Start at column xStart so the pointers match the x passed to root.
+            uint32_t offset = mtls->dimX * y + mtls->xStart;
+            uint8_t *xPtrOut = mtls->ptrOut + (mtls->eStrideOut * offset);
+            const uint8_t *xPtrIn = mtls->ptrIn + (mtls->eStrideIn * offset);
+            for (uint32_t x = mtls->xStart; x < mtls->xEnd; x++) {
+                ((rs_t)mtls->script->mProgram.mRoot) (xPtrIn, xPtrOut, mtls->usr, x, y, 0, 0);
+                xPtrIn += mtls->eStrideIn;
+                xPtrOut += mtls->eStrideOut;
+            }
+        }
+    }
+}
+
 void ScriptC::runForEach(Context *rsc,
                          const Allocation * ain,
                          Allocation * aout,
                          const void * usr,
                          const RsScriptCall *sc)
 {
-    uint32_t dimX = ain->getType()->getDimX();
-    uint32_t dimY = ain->getType()->getDimY();
-    uint32_t dimZ = ain->getType()->getDimZ();
-    uint32_t dimA = 0;//ain->getType()->getDimArray();
+    MTLaunchStruct mtls;
+    memset(&mtls, 0, sizeof(mtls));
 
-    uint32_t xStart = 0;
-    uint32_t xEnd = 0;
-    uint32_t yStart = 0;
-    uint32_t yEnd = 0;
-    uint32_t zStart = 0;
-    uint32_t zEnd = 0;
-    uint32_t arrayStart = 0;
-    uint32_t arrayEnd = 0;
+    if (ain) {
+        mtls.dimX = ain->getType()->getDimX();
+        mtls.dimY = ain->getType()->getDimY();
+        mtls.dimZ = ain->getType()->getDimZ();
+        //mtls.dimArray = ain->getType()->getDimArray();
+    } else if (aout) {
+        mtls.dimX = aout->getType()->getDimX();
+        mtls.dimY = aout->getType()->getDimY();
+        mtls.dimZ = aout->getType()->getDimZ();
+        //mtls.dimArray = aout->getType()->getDimArray();
+    } else {
+        rsc->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
 
     if (!sc || (sc->xEnd == 0)) {
-        xStart = 0;
-        xEnd = ain->getType()->getDimX();
+        mtls.xEnd = mtls.dimX;
     } else {
-        rsAssert(xStart < dimX);
-        rsAssert(xEnd <= dimX);
+        rsAssert(sc->xStart < mtls.dimX);
+        rsAssert(sc->xEnd <= mtls.dimX);
         rsAssert(sc->xStart < sc->xEnd);
-        xStart = rsMin(dimX, sc->xStart);
-        xEnd = rsMin(dimX, sc->xEnd);
-        if (xStart >= xEnd) return;
+        mtls.xStart = rsMin(mtls.dimX, sc->xStart);
+        mtls.xEnd = rsMin(mtls.dimX, sc->xEnd);
+        if (mtls.xStart >= mtls.xEnd) return;
     }
 
     if (!sc || (sc->yEnd == 0)) {
-        yStart = 0;
-        yEnd = ain->getType()->getDimY();
+        mtls.yEnd = mtls.dimY;
     } else {
-        rsAssert(yStart < dimY);
-        rsAssert(yEnd <= dimY);
+        rsAssert(sc->yStart < mtls.dimY);
+        rsAssert(sc->yEnd <= mtls.dimY);
         rsAssert(sc->yStart < sc->yEnd);
-        yStart = rsMin(dimY, sc->yStart);
-        yEnd = rsMin(dimY, sc->yEnd);
-        if (yStart >= yEnd) return;
+        mtls.yStart = rsMin(mtls.dimY, sc->yStart);
+        mtls.yEnd = rsMin(mtls.dimY, sc->yEnd);
+        if (mtls.yStart >= mtls.yEnd) return;
     }
 
-    xEnd = rsMax((uint32_t)1, xEnd);
-    yEnd = rsMax((uint32_t)1, yEnd);
-    zEnd = rsMax((uint32_t)1, zEnd);
-    arrayEnd = rsMax((uint32_t)1, arrayEnd);
+    mtls.xEnd = rsMax((uint32_t)1, mtls.xEnd);
+    mtls.yEnd = rsMax((uint32_t)1, mtls.yEnd);
+    mtls.zEnd = rsMax((uint32_t)1, mtls.zEnd);
+    mtls.arrayEnd = rsMax((uint32_t)1, mtls.arrayEnd);
 
     rsAssert(ain->getType()->getDimZ() == 0);
 
     setupScript(rsc);
     Script * oldTLS = setTLS(this);
 
-    typedef int (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
 
-    const uint8_t *ptrIn = (const uint8_t *)ain->getPtr();
-    uint32_t eStrideIn = ain->getType()->getElementSizeBytes();
+    mtls.rsc = rsc;
+    mtls.ain = ain;
+    mtls.aout = aout;
+    mtls.script = this;
+    mtls.usr = usr;
+    mtls.mSliceSize = 10;
+    mtls.mSliceNum = 0;
 
-    uint8_t *ptrOut = NULL;
-    uint32_t eStrideOut = 0;
-    if (aout) {
-        ptrOut = (uint8_t *)aout->getPtr();
-        eStrideOut = aout->getType()->getElementSizeBytes();
+    mtls.ptrIn = NULL;
+    mtls.eStrideIn = 0;
+    if (ain) {
+        mtls.ptrIn = (const uint8_t *)ain->getPtr();
+        mtls.eStrideIn = ain->getType()->getElementSizeBytes();
     }
 
+    mtls.ptrOut = NULL;
+    mtls.eStrideOut = 0;
+    if (aout) {
+        mtls.ptrOut = (uint8_t *)aout->getPtr();
+        mtls.eStrideOut = aout->getType()->getElementSizeBytes();
+    }
+
+
+    {
+        LOGE("launch 1");
+        rsc->launchThreads(wc_xy, &mtls);
+        LOGE("launch 2");
+    }
+
+/*
     for (uint32_t ar = arrayStart; ar < arrayEnd; ar++) {
         for (uint32_t z = zStart; z < zEnd; z++) {
             for (uint32_t y = yStart; y < yEnd; y++) {
@@ -221,7 +304,7 @@
         }
 
     }
-
+*/
     setTLS(oldTLS);
 }
 
diff --git a/libs/rs/rsScriptC_Lib.cpp b/libs/rs/rsScriptC_Lib.cpp
index 8d9ca9f..9c29ca6 100644
--- a/libs/rs/rsScriptC_Lib.cpp
+++ b/libs/rs/rsScriptC_Lib.cpp
@@ -329,6 +329,29 @@
     return a->getType()->getDimFaces();
 }
 
+const void * SC_getElementAtX(RsAllocation va, uint32_t x)
+{
+    const Allocation *a = static_cast<const Allocation *>(va);
+    const Type *t = a->getType();
+    const uint8_t *p = (const uint8_t *)a->getPtr();
+    return &p[t->getElementSizeBytes() * x];
+}
+
+const void * SC_getElementAtXY(RsAllocation va, uint32_t x, uint32_t y)
+{
+    const Allocation *a = static_cast<const Allocation *>(va);
+    const Type *t = a->getType();
+    const uint8_t *p = (const uint8_t *)a->getPtr();
+    return &p[t->getElementSizeBytes() * (x + y*t->getDimX())];
+}
+
+const void * SC_getElementAtXYZ(RsAllocation va, uint32_t x, uint32_t y, uint32_t z)
+{
+    const Allocation *a = static_cast<const Allocation *>(va);
+    const Type *t = a->getType();
+    // Slice z starts dimX*dimY elements in (assumes packed layout - confirm); z was previously ignored.
+    return (const uint8_t *)a->getPtr() + t->getElementSizeBytes() * (x + t->getDimX() * (y + t->getDimY() * z));
+}
 
 
 static void SC_debugF(const char *s, float f) {
@@ -350,6 +373,10 @@
     LOGE("%s %i  0x%x", s, i, i);
 }
 
+static void SC_debugP(const char *s, const void *p) {
+    LOGE("%s %p", s, p);
+}
+
 static uint32_t SC_toClient(void *data, int cmdID, int len, int waitForSpace)
 {
     GET_TLS();
@@ -433,12 +460,18 @@
     { "rsAllocationGetDimFaces", (void *)&SC_allocGetDimFaces },
     { "rsGetAllocation", (void *)&SC_getAllocation },
 
+    { "_Z14rsGetElementAt13rs_allocationj", (void *)&SC_getElementAtX },
+    { "_Z14rsGetElementAt13rs_allocationjj", (void *)&SC_getElementAtXY },
+    { "_Z14rsGetElementAt13rs_allocationjjj", (void *)&SC_getElementAtXYZ },
+
+
     // Debug
     { "_Z7rsDebugPKcf", (void *)&SC_debugF },
     { "_Z7rsDebugPKcDv2_f", (void *)&SC_debugFv2 },
     { "_Z7rsDebugPKcDv3_f", (void *)&SC_debugFv3 },
     { "_Z7rsDebugPKcDv4_f", (void *)&SC_debugFv4 },
     { "_Z7rsDebugPKci", (void *)&SC_debugI32 },
+    { "_Z7rsDebugPKcPKv", (void *)&SC_debugP },
     //extern void __attribute__((overloadable))rsDebug(const char *, const void *);
 
 
diff --git a/libs/rs/rsUtils.h b/libs/rs/rsUtils.h
index 0a37a5b..17feb22 100644
--- a/libs/rs/rsUtils.h
+++ b/libs/rs/rsUtils.h
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include <pthread.h>
 #include <time.h>
+#include <cutils/atomic.h>
 
 #ifndef ANDROID_RS_BUILD_FOR_HOST
 #include <EGL/egl.h>
diff --git a/libs/rs/scriptc/rs_math.rsh b/libs/rs/scriptc/rs_math.rsh
index e11c832..bd6e5a9 100644
--- a/libs/rs/scriptc/rs_math.rsh
+++ b/libs/rs/scriptc/rs_math.rsh
@@ -14,6 +14,12 @@
 extern uint32_t rsAllocationGetDimLOD(rs_allocation);
 extern uint32_t rsAllocationGetDimFaces(rs_allocation);
 
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation, uint32_t x);
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation, uint32_t x, uint32_t y);
+extern const void * __attribute__((overloadable))
+    rsGetElementAt(rs_allocation, uint32_t x, uint32_t y, uint32_t z);
 
 
 // Debugging