Adds support for multi-input kernels to Frameworks/RS.

This patch modifies Frameworks/RS in the following ways:
* Adjusts the data-layout of the C/C++ version of RsForEachStubParamStruct to
  accommodate a pointer to an array of input allocations and a pointer to an
  array of stride sizes for each of these allocations.
* Adds a new code path for Java code to pass multiple allocations to a RS
  kernel.
* Packs base pointers and step values for multi-input kernels into the new
  RsForEachStubParamStruct members.

Change-Id: I46d2834c37075b2a2407fd8b010546818a4540d1
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index 9755b9a..499f890 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -479,6 +479,109 @@
     }
 }
 
+void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                                        const RsScriptCall* sc, MTLaunchStruct* mtls) {
+    // Launches mtls->kernel over the x/y/z/array range recorded in mtls for a
+    // kernel with inLen inputs. Every input gets its own base pointer and
+    // element stride, passed to the kernel through p.ins and p.eStrideIns.
+    // ains, aout and sc are not read here; all state comes from mtls.
+
+    // forEachMtlsSetup returns early on error without allocating the
+    // per-input arrays. Nothing can be launched in that state.
+    if (inLen > 0 && (mtls->fep.ptrIns == NULL || mtls->fep.inStrides == NULL)) {
+        return;
+    }
+
+    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
+        const size_t targetByteChunk = 16 * 1024;
+        const bool sliceRows = mtls->fep.dimY > 1;
+        mInForEach = true;
+
+        // Bytes covered by one slice unit: a row when slicing in Y, an
+        // element when slicing in X. Prefer the output allocation. Fall back
+        // to the first input's StridePair, because the multi-input setup
+        // never fills fep.yStrideIn / fep.eStrideIn and dividing by them
+        // would be a division by zero.
+        uint32_t unitStride = sliceRows ? mtls->fep.yStrideOut : mtls->fep.eStrideOut;
+        if (unitStride == 0 && inLen > 0) {
+            const StridePair &first = mtls->fep.inStrides[0];
+            unitStride = sliceRows ? first.yStride : first.eStride;
+        }
+
+        const uint32_t dim = sliceRows ? mtls->fep.dimY : mtls->fep.dimX;
+        uint32_t slice = dim / ((mWorkers.mCount + 1) * 4);
+
+        // This chooses our slice size to rate limit atomic ops to
+        // one per 16k bytes of reads/writes.
+        if (unitStride != 0) {
+            slice = rsMin(slice, (uint32_t)(targetByteChunk / unitStride));
+        }
+
+        if (slice < 1) {
+            slice = 1;
+        }
+        mtls->mSliceSize = slice;
+
+        if (sliceRows) {
+            launchThreads(wc_xy, mtls);
+        } else {
+            launchThreads(wc_x, mtls);
+        }
+        mInForEach = false;
+    } else {
+        RsForEachStubParamStruct p;
+        memcpy(&p, &mtls->fep, sizeof(p));
+
+        // Scratch arrays for this launch; released at the end of this branch.
+        p.ins        = new const void*[inLen];
+        p.eStrideIns = new uint32_t[inLen];
+
+        // The element strides do not change between rows, so fill them once.
+        for (uint32_t index = 0; index < inLen; index++) {
+            p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
+        }
+
+        outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
+
+        // Rows per array slice. Do not scale this by p.ar[0]: the loop below
+        // already multiplies by the array index, and p.ar[0] at this point is
+        // only the value copied out of mtls->fep.
+        const uint32_t offset_invariant = mtls->fep.dimY * mtls->fep.dimZ;
+
+        for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
+            const uint32_t offset_part = offset_invariant * p.ar[0];
+
+            for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
+                for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
+                    // Row index of (array, z, y) counted from the base pointer.
+                    const uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
+
+                    p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
+                            (mtls->fep.eStrideOut * mtls->xStart);
+
+                    // Point every input at the first cell of this row.
+                    for (uint32_t index = 0; index < inLen; index++) {
+                        const StridePair &strides = mtls->fep.inStrides[index];
+
+                        p.ins[index] = mtls->fep.ptrIns[index] +
+                                       (strides.yStride * offset) +
+                                       (strides.eStride * mtls->xStart);
+                    }
+
+                    // The fourth argument is zero here because multi-input
+                    // kernels get their stride information from a member of p
+                    // that points to an array.
+                    fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
+                }
+            }
+        }
+
+        // Free our arrays.
+        delete[] p.ins;
+        delete[] p.eStrideIns;
+    }
+}
+
 RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
     //ALOGE("setTls %p", sc);
     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
@@ -595,5 +698,3 @@
     }
     return sgi;
 }
-
-
diff --git a/cpu_ref/rsCpuCore.h b/cpu_ref/rsCpuCore.h
index d2b47fb..c54dca2 100644
--- a/cpu_ref/rsCpuCore.h
+++ b/cpu_ref/rsCpuCore.h
@@ -35,6 +35,11 @@
 namespace renderscript {
 
 typedef struct {
+  uint32_t eStride;  // Element size in bytes; step between cells along X.
+  uint32_t yStride;  // Row stride in bytes (lod[0].stride of the allocation).
+} StridePair;  // Per-input strides used by multi-input kernel launches.
+
+typedef struct {
     const void *in;
     void *out;
     const void *usr;
@@ -45,6 +50,10 @@
     uint32_t lod;
     RsAllocationCubemapFace face;
     uint32_t ar[16];
+
+    const void **ins;
+    uint32_t *eStrideIns;
+
     uint32_t lid;
 
     uint32_t dimX;
@@ -59,6 +68,9 @@
     uint32_t yStrideIn;
     uint32_t yStrideOut;
     uint32_t slot;
+
+    const uint8_t** ptrIns;
+    StridePair* inStrides;
 } RsForEachStubParamStruct;
 
 extern bool gArchUseSIMD;
@@ -99,6 +111,9 @@
     uint32_t zEnd;
     uint32_t arrayStart;
     uint32_t arrayEnd;
+
+    // Multi-input data.
+    const Allocation ** ains;
 } MTLaunchStruct;
 
 
@@ -126,6 +141,9 @@
     void launchThreads(const Allocation * ain, Allocation * aout,
                        const RsScriptCall *sc, MTLaunchStruct *mtls);
 
+    void launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
+                       const RsScriptCall* sc, MTLaunchStruct* mtls);
+
     virtual CpuScript * createScript(const ScriptC *s,
                                      char const *resName, char const *cacheDir,
                                      uint8_t const *bitcode, size_t bitcodeSize,
diff --git a/cpu_ref/rsCpuIntrinsic.cpp b/cpu_ref/rsCpuIntrinsic.cpp
index 7195714..d146b76 100644
--- a/cpu_ref/rsCpuIntrinsic.cpp
+++ b/cpu_ref/rsCpuIntrinsic.cpp
@@ -107,6 +107,35 @@
     postLaunch(slot, ain, aout, usr, usrLen, sc);
 }
 
+void RsdCpuScriptIntrinsic::invokeForEachMulti(uint32_t slot,
+                                               const Allocation ** ains,
+                                               size_t inLen,
+                                               Allocation * aout,
+                                               const void * usr,
+                                               uint32_t usrLen,
+                                               const RsScriptCall *sc) {
+    // Runs the intrinsic over several inputs, then releases the per-input
+    // arrays that forEachMtlsSetup allocates for the launch.
+    // NOTE(review): inLen is size_t here but uint32_t in the other
+    // invokeForEachMulti signatures; confirm this still overrides them.
+    MTLaunchStruct mtls;
+    // FIXME: Possibly create new preLaunch and postLaunch functions that take
+    //        all of the input allocation pointers.
+    const Allocation *ain0 = (ains != NULL && inLen > 0) ? ains[0] : NULL;
+    preLaunch(slot, ain0, aout, usr, usrLen, sc);
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    mtls.script = this;
+    mtls.fep.slot = slot;
+    mtls.kernel = (void (*)())mRootPtr;
+    mtls.fep.usr = this;
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    delete[] mtls.fep.ptrIns;    // NULL when setup failed or had no inputs
+    delete[] mtls.fep.inStrides;
+    postLaunch(slot, ain0, aout, usr, usrLen, sc);
+}
+
 void RsdCpuScriptIntrinsic::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
 
     mtls->script = this;
@@ -114,6 +143,3 @@
     mtls->kernel = (void (*)())mRootPtr;
     mtls->fep.usr = this;
 }
-
-
-
diff --git a/cpu_ref/rsCpuIntrinsic.h b/cpu_ref/rsCpuIntrinsic.h
index 1cf889c..85e2ddc 100644
--- a/cpu_ref/rsCpuIntrinsic.h
+++ b/cpu_ref/rsCpuIntrinsic.h
@@ -36,6 +36,15 @@
                        const void * usr,
                        uint32_t usrLen,
                        const RsScriptCall *sc);
+
+    virtual void invokeForEachMulti(uint32_t slot,
+                       const Allocation ** ain,
+                       size_t inLen,
+                       Allocation * aout,
+                       const void * usr,
+                       uint32_t usrLen,
+                       const RsScriptCall *sc);
+
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index cf7b377..b9f8aba 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -806,21 +806,34 @@
         return;
     }
 
-    if (ain) {
-        mtls->fep.dimX = ain->getType()->getDimX();
-        mtls->fep.dimY = ain->getType()->getDimY();
-        mtls->fep.dimZ = ain->getType()->getDimZ();
-        //mtls->dimArray = ain->getType()->getDimArray();
-    } else if (aout) {
-        mtls->fep.dimX = aout->getType()->getDimX();
-        mtls->fep.dimY = aout->getType()->getDimY();
-        mtls->fep.dimZ = aout->getType()->getDimZ();
-        //mtls->dimArray = aout->getType()->getDimArray();
+    if (ain != NULL) {
+        const Type *inType = ain->getType();
+
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+    } else if (aout != NULL) {
+        const Type *outType = aout->getType();
+
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+
     } else {
         mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
         return;
     }
 
+    if (ain != NULL && aout != NULL) {
+        if (!ain->hasSameDims(aout)) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+              "Failed to launch kernel; dimensions of input and output allocations do not match.");
+
+            return;
+        }
+    }
+
     if (!sc || (sc->xEnd == 0)) {
         mtls->xEnd = mtls->fep.dimX;
     } else {
@@ -888,6 +901,147 @@
     }
 }
 
+void RsdCpuScriptImpl::forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+                                        Allocation * aout,
+                                        const void * usr, uint32_t usrLen,
+                                        const RsScriptCall *sc,
+                                        MTLaunchStruct *mtls) {
+    // Prepares *mtls for a multi-input launch. On any error the context error
+    // is set and this returns with fep.ptrIns and fep.inStrides still NULL.
+    memset(mtls, 0, sizeof(MTLaunchStruct));
+
+    // An empty input list is handled like a missing one, so ains[0] is only
+    // read when it exists.
+    const bool hasIns = (ains != NULL) && (inLen > 0);
+
+    // possible for this to occur if IO_OUTPUT/IO_INPUT with no bound surface
+    if (hasIns) {
+        for (uint32_t index = 0; index < inLen; index++) {
+            const Allocation* ain = ains[index];
+
+            // A NULL entry is rejected too: every input is dereferenced below.
+            if (ain == NULL || (const uint8_t *)ain->mHal.drvState.lod[0].mallocPtr == NULL) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null in allocations");
+                return;
+            }
+        }
+    }
+
+    if (aout && (const uint8_t *)aout->mHal.drvState.lod[0].mallocPtr == NULL) {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null out allocations");
+        return;
+    }
+
+    if (hasIns) {
+        const Allocation *ain0   = ains[0];
+        const Type       *inType = ain0->getType();
+
+        mtls->fep.dimX = inType->getDimX();
+        mtls->fep.dimY = inType->getDimY();
+        mtls->fep.dimZ = inType->getDimZ();
+
+        for (uint32_t index = 1; index < inLen; index++) {
+            if (!ain0->hasSameDims(ains[index])) {
+                mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+                  "Failed to launch kernel; dimensions of input allocations do not match.");
+                return;
+            }
+        }
+
+        if (aout != NULL && !ain0->hasSameDims(aout)) {
+            mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT,
+              "Failed to launch kernel; dimensions of input and output allocations do not match.");
+            return;
+        }
+    } else if (aout != NULL) {
+        const Type *outType = aout->getType();
+
+        mtls->fep.dimX = outType->getDimX();
+        mtls->fep.dimY = outType->getDimY();
+        mtls->fep.dimZ = outType->getDimZ();
+    } else {
+        mCtx->getContext()->setError(RS_ERROR_BAD_SCRIPT, "rsForEach called with null allocations");
+        return;
+    }
+
+    if (!sc || (sc->xEnd == 0)) {
+        mtls->xEnd = mtls->fep.dimX;
+    } else {
+        rsAssert(sc->xStart < mtls->fep.dimX);
+        rsAssert(sc->xEnd <= mtls->fep.dimX);
+        rsAssert(sc->xStart < sc->xEnd);
+        mtls->xStart = rsMin(mtls->fep.dimX, sc->xStart);
+        mtls->xEnd = rsMin(mtls->fep.dimX, sc->xEnd);
+        if (mtls->xStart >= mtls->xEnd) return;
+    }
+
+    if (!sc || (sc->yEnd == 0)) {
+        mtls->yEnd = mtls->fep.dimY;
+    } else {
+        rsAssert(sc->yStart < mtls->fep.dimY);
+        rsAssert(sc->yEnd <= mtls->fep.dimY);
+        rsAssert(sc->yStart < sc->yEnd);
+        mtls->yStart = rsMin(mtls->fep.dimY, sc->yStart);
+        mtls->yEnd = rsMin(mtls->fep.dimY, sc->yEnd);
+        if (mtls->yStart >= mtls->yEnd) return;
+    }
+
+    if (!sc || (sc->zEnd == 0)) {
+        mtls->zEnd = mtls->fep.dimZ;
+    } else {
+        rsAssert(sc->zStart < mtls->fep.dimZ);
+        rsAssert(sc->zEnd <= mtls->fep.dimZ);
+        rsAssert(sc->zStart < sc->zEnd);
+        mtls->zStart = rsMin(mtls->fep.dimZ, sc->zStart);
+        mtls->zEnd = rsMin(mtls->fep.dimZ, sc->zEnd);
+        if (mtls->zStart >= mtls->zEnd) return;
+    }
+
+    mtls->xEnd     = rsMax((uint32_t)1, mtls->xEnd);
+    mtls->yEnd     = rsMax((uint32_t)1, mtls->yEnd);
+    mtls->zEnd     = rsMax((uint32_t)1, mtls->zEnd);
+    mtls->arrayEnd = rsMax((uint32_t)1, mtls->arrayEnd);
+
+    rsAssert(!hasIns || (ains[0]->getType()->getDimZ() == 0));
+
+    mtls->rsc        = mCtx;
+    mtls->ains       = ains;
+    mtls->aout       = aout;
+    mtls->fep.usr    = usr;
+    mtls->fep.usrLen = usrLen;
+    mtls->mSliceSize = 1;
+    mtls->mSliceNum  = 0;
+
+    mtls->fep.ptrIns    = NULL;
+    mtls->fep.eStrideIn = 0;
+    mtls->isThreadable  = mIsThreadable;
+
+    if (hasIns) {
+        mtls->fep.ptrIns    = new const uint8_t*[inLen];
+        mtls->fep.inStrides = new StridePair[inLen];
+
+        for (uint32_t index = 0; index < inLen; index++) {
+            const Allocation *ain = ains[index];
+
+            mtls->fep.ptrIns[index] =
+              (const uint8_t*)ain->mHal.drvState.lod[0].mallocPtr;
+
+            mtls->fep.inStrides[index].eStride =
+              ain->getType()->getElementSizeBytes();
+            mtls->fep.inStrides[index].yStride =
+              ain->mHal.drvState.lod[0].stride;
+        }
+    }
+
+    mtls->fep.ptrOut = NULL;
+    mtls->fep.eStrideOut = 0;
+    if (aout) {
+        mtls->fep.ptrOut     = (uint8_t *)aout->mHal.drvState.lod[0].mallocPtr;
+        mtls->fep.eStrideOut = aout->getType()->getElementSizeBytes();
+        mtls->fep.yStrideOut = aout->mHal.drvState.lod[0].stride;
+    }
+}
+
 
 void RsdCpuScriptImpl::invokeForEach(uint32_t slot,
                                      const Allocation * ain,
@@ -905,6 +1059,24 @@
     mCtx->setTLS(oldTLS);
 }
 
+void RsdCpuScriptImpl::invokeForEachMulti(uint32_t slot,
+                                          const Allocation ** ains,
+                                          uint32_t inLen,
+                                          Allocation * aout,
+                                          const void * usr,
+                                          uint32_t usrLen,
+                                          const RsScriptCall *sc) {
+    MTLaunchStruct mtls;
+    forEachMtlsSetup(ains, inLen, aout, usr, usrLen, sc, &mtls);
+    forEachKernelSetup(slot, &mtls);
+    RsdCpuScriptImpl * oldTLS = mCtx->setTLS(this);
+    mCtx->launchThreads(ains, inLen, aout, sc, &mtls);
+    mCtx->setTLS(oldTLS);
+    // Release the per-input arrays that forEachMtlsSetup allocated with new[].
+    delete[] mtls.fep.ptrIns;
+    delete[] mtls.fep.inStrides;
+}
+
 void RsdCpuScriptImpl::forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls) {
     mtls->script = this;
     mtls->fep.slot = slot;
diff --git a/cpu_ref/rsCpuScript.h b/cpu_ref/rsCpuScript.h
index 666379d..f4ca1ed 100644
--- a/cpu_ref/rsCpuScript.h
+++ b/cpu_ref/rsCpuScript.h
@@ -76,6 +76,14 @@
                        const void * usr,
                        uint32_t usrLen,
                        const RsScriptCall *sc);
+
+    virtual void invokeForEachMulti(uint32_t slot,
+                                     const Allocation** ains,
+                                     uint32_t inLen,
+                                     Allocation* aout,
+                                     const void* usr,
+                                     uint32_t usrLen,
+                                     const RsScriptCall* sc);
     virtual void invokeInit();
     virtual void invokeFreeChildren();
 
@@ -95,6 +103,11 @@
     void forEachMtlsSetup(const Allocation * ain, Allocation * aout,
                           const void * usr, uint32_t usrLen,
                           const RsScriptCall *sc, MTLaunchStruct *mtls);
+
+    void forEachMtlsSetup(const Allocation ** ains, uint32_t inLen,
+                          Allocation * aout, const void * usr, uint32_t usrLen,
+                          const RsScriptCall *sc, MTLaunchStruct *mtls);
+
     virtual void forEachKernelSetup(uint32_t slot, MTLaunchStruct *mtls);
 
 
diff --git a/cpu_ref/rsd_cpu.h b/cpu_ref/rsd_cpu.h
index 675bb97..0076cb9 100644
--- a/cpu_ref/rsd_cpu.h
+++ b/cpu_ref/rsd_cpu.h
@@ -75,6 +75,15 @@
                            const void * usr,
                            uint32_t usrLen,
                            const RsScriptCall *sc) = 0;
+                           
+        virtual void invokeForEachMulti(uint32_t slot,
+                                         const Allocation** ains,
+                                         uint32_t inLen,
+                                         Allocation * aout,
+                                         const void * usr,
+                                         uint32_t usrLen,
+                                         const RsScriptCall *sc) = 0;
+        
         virtual void invokeInit() = 0;
         virtual void invokeFreeChildren() = 0;
 
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp
index 60532df..b62709e 100644
--- a/driver/rsdBcc.cpp
+++ b/driver/rsdBcc.cpp
@@ -77,6 +77,20 @@
     cs->invokeForEach(slot, ain, aout, usr, usrLen, sc);
 }
 
+void rsdScriptInvokeForEachMulti(const Context *rsc,
+                                 Script *s,
+                                 uint32_t slot,
+                                 const Allocation ** ains,
+                                 size_t inLen,
+                                 Allocation * aout,
+                                 const void * usr,
+                                 size_t usrLen,
+                                 const RsScriptCall *sc) {
+    // Forwards a multi-input forEach request to the CPU script object that
+    // is stored in s->mHal.drv. rsc is not used.
+    ((RsdCpuReference::CpuScript *)s->mHal.drv)->invokeForEachMulti(slot, ains, inLen, aout, usr, usrLen, sc);
+}
+
 
 int rsdScriptInvokeRoot(const Context *dc, Script *s) {
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)s->mHal.drv;
@@ -145,4 +159,3 @@
     RsdCpuReference::CpuScript *cs = (RsdCpuReference::CpuScript *)sc->mHal.drv;
     return cs->getAllocationForPointer(ptr);
 }
-
diff --git a/driver/rsdBcc.h b/driver/rsdBcc.h
index 2917bce..3932aeb 100644
--- a/driver/rsdBcc.h
+++ b/driver/rsdBcc.h
@@ -43,6 +43,16 @@
                             size_t usrLen,
                             const RsScriptCall *sc);
 
+void rsdScriptInvokeForEachMulti(const android::renderscript::Context *rsc,
+                                 android::renderscript::Script *s,
+                                 uint32_t slot,
+                                 const android::renderscript::Allocation ** ains,
+                                 size_t inLen,
+                                 android::renderscript::Allocation * aout,
+                                 const void * usr,
+                                 size_t usrLen,
+                                 const RsScriptCall *sc);
+
 int rsdScriptInvokeRoot(const android::renderscript::Context *dc,
                         android::renderscript::Script *script);
 void rsdScriptInvokeInit(const android::renderscript::Context *dc,
diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp
index 43c2375..065597a 100644
--- a/driver/rsdCore.cpp
+++ b/driver/rsdCore.cpp
@@ -79,7 +79,8 @@
         rsdScriptSetGlobalVarWithElemDims,
         rsdScriptSetGlobalBind,
         rsdScriptSetGlobalObj,
-        rsdScriptDestroy
+        rsdScriptDestroy,
+        rsdScriptInvokeForEachMulti
     },
 
     {
diff --git a/java/tests/RsTest/Android.mk b/java/tests/RsTest/Android.mk
index 198693c..787e740 100644
--- a/java/tests/RsTest/Android.mk
+++ b/java/tests/RsTest/Android.mk
@@ -21,6 +21,8 @@
 
 LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
 
+LOCAL_RENDERSCRIPT_FLAGS := -target-api 0
+
 LOCAL_PACKAGE_NAME := RSTest
 
 include $(BUILD_PACKAGE)
diff --git a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
index e9d5268..3047a56 100644
--- a/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
+++ b/java/tests/RsTest/src/com/android/rs/test/RSTestCore.java
@@ -93,6 +93,7 @@
         /*unitTests.add(new UT_program_store(this, mRes, mCtx));
         unitTests.add(new UT_program_raster(this, mRes, mCtx));
         unitTests.add(new UT_mesh(this, mRes, mCtx));*/
+        unitTests.add(new UT_foreach_multi(this, mRes, mCtx));
         unitTests.add(new UT_fp_mad(this, mRes, mCtx));
 
         /*
diff --git a/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
new file mode 100644
index 0000000..e2095f6
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/UT_foreach_multi.java
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.test;
+
+import android.content.Context;
+import android.content.res.Resources;
+import android.renderscript.*;
+import android.util.Log;
+
+public class UT_foreach_multi extends UnitTest {
+    private Resources mRes;
+    private Allocation Ain0;
+    private Allocation Ain1;
+    private Allocation Ain2;
+    private Allocation Ain3;
+
+    private Allocation Out0;
+    private Allocation Out1;
+    private Allocation Out2;
+
+    protected UT_foreach_multi(RSTestCore rstc, Resources res, Context ctx) {
+        super(rstc, "Foreach Multi-input", ctx);
+        mRes = res;
+    }
+
+    // Creates the input and output allocations, binds each one to its script
+    // global, and fills the inputs by running the init kernels over them.
+    private void initializeGlobals(RenderScript rs, ScriptC_foreach_multi script) {
+        Type.Builder u32Type = new Type.Builder(rs, Element.U32(rs));
+        Type.Builder u16Type = new Type.Builder(rs, Element.U16(rs));
+
+        final int width = 5;
+        script.set_dimX(width);
+        u32Type.setX(width);
+        u16Type.setX(width);
+
+        // 32-bit input allocations
+
+        Ain0 = Allocation.createTyped(rs, u32Type.create());
+        script.set_ain0(Ain0);
+        script.forEach_init_uint32_alloc(Ain0);
+
+        Ain1 = Allocation.createTyped(rs, u32Type.create());
+        script.set_ain1(Ain1);
+        script.forEach_init_uint32_alloc(Ain1);
+
+        Ain2 = Allocation.createTyped(rs, u32Type.create());
+        script.set_ain2(Ain2);
+        script.forEach_init_uint32_alloc(Ain2);
+
+        // 16-bit input allocation
+
+        Ain3 = Allocation.createTyped(rs, u16Type.create());
+        script.set_ain3(Ain3);
+        script.forEach_init_uint16_alloc(Ain3);
+
+        // 32-bit output allocations
+
+        Out0 = Allocation.createTyped(rs, u32Type.create());
+        script.set_aout0(Out0);
+
+        Out1 = Allocation.createTyped(rs, u32Type.create());
+        script.set_aout1(Out1);
+
+        Out2 = Allocation.createTyped(rs, u32Type.create());
+        script.set_aout2(Out2);
+    }
+
+    public void run() {
+        RenderScript rs = RenderScript.create(mCtx);
+        ScriptC_foreach_multi script = new ScriptC_foreach_multi(rs);
+
+        rs.setMessageHandler(mRsMessage);
+
+        initializeGlobals(rs, script);
+        // Launch each multi-input kernel once.
+        script.forEach_sum2(Ain0, Ain1, Out0);
+        script.forEach_sum3(Ain0, Ain1, Ain2, Out1);
+        script.forEach_sum_mixed(Ain0, Ain3, Out2);
+        // Verify the outputs inside the script and report the verdict.
+        script.invoke_test_outputs();
+        script.invoke_check_test_results();
+
+        rs.finish();
+        waitForMessage();
+        rs.destroy();
+    }
+}
diff --git a/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
new file mode 100644
index 0000000..025d19d
--- /dev/null
+++ b/java/tests/RsTest/src/com/android/rs/test/foreach_multi.rs
@@ -0,0 +1,116 @@
+#include "shared.rsh"
+
+rs_allocation ain0, ain1, ain2;
+rs_allocation ain3;
+
+rs_allocation aout0, aout1, aout2;
+
+uint32_t dimX;
+
+static bool failed = false;
+
+uint32_t RS_KERNEL init_uint32_alloc(uint32_t x) {
+    return x;  // the output cell receives the x argument unchanged
+}
+
+uint16_t RS_KERNEL init_uint16_alloc(uint32_t x) {
+    return x;  // x is narrowed from uint32_t to the uint16_t return type
+}
+
+uint32_t RS_KERNEL sum2(uint32_t in0, uint32_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    // The output cell receives the sum of the two input cells.
+    return in0 + in1;
+}
+
+uint32_t RS_KERNEL sum3(uint32_t in0, uint32_t in1, uint32_t in2, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    _RS_ASSERT(in2 == x);
+    // The output cell receives the sum of the three input cells.
+    return in0 + in1 + in2;
+}
+
+
+uint32_t RS_KERNEL sum_mixed(uint32_t in0, uint16_t in1, uint32_t x) {
+    _RS_ASSERT(in0 == x);
+    _RS_ASSERT(in1 == x);
+    // Inputs of different element sizes: the 16-bit value is added to the 32-bit one.
+    return in0 + in1;
+}
+
+static bool test_sum2_output() {
+    // NOTE(review): _RS_ASSERT presumably sets `failed`; this local shadows
+    // the file-level flag of the same name. Confirm against shared.rsh.
+    bool failed = false;
+
+    // aout0[i] must equal ain0[i] + ain1[i] for every cell.
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout0, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i)));
+    }
+
+    if (!failed) {
+        rsDebug("test_sum2_output PASSED", 0);
+    } else {
+        rsDebug("test_sum2_output FAILED", 0);
+    }
+    return failed;
+}
+
+static bool test_sum3_output() {
+    // NOTE(review): _RS_ASSERT presumably sets `failed`; this local shadows
+    // the file-level flag of the same name. Confirm against shared.rsh.
+    bool failed = false;
+
+    // aout1[i] must equal ain0[i] + ain1[i] + ain2[i] for every cell.
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout1, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_uint(ain1, i) +
+                    rsGetElementAt_uint(ain2, i)));
+    }
+
+    if (!failed) {
+        rsDebug("test_sum3_output PASSED", 0);
+    } else {
+        rsDebug("test_sum3_output FAILED", 0);
+    }
+    return failed;
+}
+
+static bool test_sum_mixed_output() {
+    // NOTE(review): _RS_ASSERT presumably sets `failed`; this local shadows
+    // the file-level flag of the same name. Confirm against shared.rsh.
+    bool failed = false;
+
+    // aout2[i] must equal ain0[i] (32-bit) + ain3[i] (16-bit) for every cell.
+    for (uint32_t i = 0; i < dimX; i++) {
+        _RS_ASSERT(rsGetElementAt_uint(aout2, i) ==
+                   (rsGetElementAt_uint(ain0, i) +
+                    rsGetElementAt_ushort(ain3, i)));
+    }
+
+    if (!failed) {
+        rsDebug("test_sum_mixed_output PASSED", 0);
+    } else {
+        rsDebug("test_sum_mixed_output FAILED", 0);
+    }
+    return failed;
+}
+
+void test_outputs() {
+    failed |= test_sum2_output();  // each checker returns true on failure
+    failed |= test_sum3_output();  // |= keeps an earlier failure sticky
+    failed |= test_sum_mixed_output();
+}
+
+void check_test_results() {
+    // Send the verdict accumulated in the file-level `failed` flag to the
+    // client. The blocking variant of the send is used for both outcomes,
+    // exactly one message is sent per call.
+    rsSendToClientBlocking(failed ? RS_MSG_TEST_FAILED
+                                  : RS_MSG_TEST_PASSED);
+}
diff --git a/rs.spec b/rs.spec
index 24c06ed..18ece8c 100644
--- a/rs.spec
+++ b/rs.spec
@@ -295,6 +295,15 @@
     param const RsScriptCall * sc
 }
 
+ScriptForEachMulti {
+    param RsScript s
+    param uint32_t slot
+    param RsAllocation * ains
+    param RsAllocation aout
+    param const void * usr
+    param const RsScriptCall * sc
+}
+
 ScriptSetVarI {
     param RsScript s
     param uint32_t slot
@@ -408,4 +417,3 @@
 AllocationIoReceive {
     param RsAllocation alloc
     }
-
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 3fbbfff..afa3a8a 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -531,6 +531,19 @@
 #endif
 }
 
+bool Allocation::hasSameDims(const Allocation *other) const {
+    // True when both types agree on cell count, LOD, faces, YUV and X/Y/Z.
+    const Type *mine   = getType();
+    const Type *theirs = other->getType();
+    if (mine->getCellCount() != theirs->getCellCount()) return false;
+    if (mine->getDimLOD()    != theirs->getDimLOD())    return false;
+    if (mine->getDimFaces()  != theirs->getDimFaces())  return false;
+    if (mine->getDimYuv()    != theirs->getDimYuv())    return false;
+    if (mine->getDimX()      != theirs->getDimX())      return false;
+    if (mine->getDimY()      != theirs->getDimY())      return false;
+    return mine->getDimZ() == theirs->getDimZ();
+}
+
 
 /////////////////
 //
diff --git a/rsAllocation.h b/rsAllocation.h
index b997f9a..1b842b8 100644
--- a/rsAllocation.h
+++ b/rsAllocation.h
@@ -166,6 +166,8 @@
     void * getPointer(const Context *rsc, uint32_t lod, RsAllocationCubemapFace face,
                       uint32_t z, uint32_t array, size_t *stride);
 
+    bool hasSameDims(const Allocation *Other) const;
+
 protected:
     Vector<const Program *> mToDirtyList;
     ObjectBaseRef<const Type> mType;
@@ -202,4 +204,3 @@
 }
 }
 #endif
-
diff --git a/rsScript.cpp b/rsScript.cpp
index 27fe020..5918c59 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -198,6 +198,28 @@
 
 }
 
+void rsi_ScriptForEachMulti(Context *rsc, RsScript vs, uint32_t slot,
+                            RsAllocation *vains, size_t inLen,
+                            RsAllocation vaout, const void *params,
+                            size_t paramLen, const RsScriptCall *sc,
+                            size_t scLen) {
+    // Multi-input forEach: unwrap the opaque handles and dispatch to the
+    // script's runForEach overload that takes an array of inputs.
+    //
+    // The rs.spec generated code does not handle the absence of an actual
+    // input for sc. Instead, it retains an existing pointer value (the prior
+    // field in the packed data object). Drivers must not see that bogus
+    // data, so a zero-length sc is replaced by NULL.
+    const RsScriptCall *call = (scLen != 0) ? sc : NULL;
+
+    Script      *script = static_cast<Script *>(vs);
+    Allocation  *out    = static_cast<Allocation *>(vaout);
+    Allocation **ins    = (Allocation**)(vains);
+
+    script->runForEach(rsc, slot, const_cast<const Allocation **>(ins), inLen,
+                       out, params, paramLen, call);
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     s->Invoke(rsc, slot, NULL, 0);
@@ -260,4 +282,3 @@
 
 }
 }
-
diff --git a/rsScript.h b/rsScript.h
index 8030ab0..06e52de 100644
--- a/rsScript.h
+++ b/rsScript.h
@@ -116,6 +116,15 @@
                             size_t usrBytes,
                             const RsScriptCall *sc = NULL) = 0;
 
+    virtual void runForEach(Context* rsc,
+                            uint32_t slot,
+                            const Allocation** ains,
+                            size_t inLen,
+                            Allocation* aout,
+                            const void* usr,
+                            size_t usrBytes,
+                            const RsScriptCall *sc = NULL) = 0;
+
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) = 0;
     virtual void setupScript(Context *rsc) = 0;
     virtual uint32_t run(Context *) = 0;
@@ -135,4 +144,3 @@
 }
 }
 #endif
-
diff --git a/rsScriptC.cpp b/rsScriptC.cpp
index ab263c8..e7ff8c7 100644
--- a/rsScriptC.cpp
+++ b/rsScriptC.cpp
@@ -184,6 +184,38 @@
         delete AString;
 }
 
+// Multi-input variant of runForEach: launches the forEach kernel in |slot|
+// on the inputs in |ains| by handing the request to the driver's
+// invokeForEachMulti hook, after the GL state and script have been set up.
+void ScriptC::runForEach(Context *rsc,
+                         uint32_t slot,
+                         const Allocation ** ains,
+                         size_t inLen,
+                         Allocation * aout,
+                         const void * usr,
+                         size_t usrBytes,
+                         const RsScriptCall *sc) {
+    // Label the trace section using this slot's exported forEach entry. The
+    // label is only built when tracing is on, so nothing is allocated otherwise.
+    const char *traceName = "";
+    String8 *traceLabel = NULL;
+    if (ATRACE_ENABLED()) {
+        traceLabel = new String8("runForEach_");
+        traceLabel->append(mHal.info.exportedForeachFuncList[slot].first);
+        traceName = traceLabel->string();
+    }
+    ATRACE_NAME(traceName);
+    (void)traceName;  // presumably avoids an unused warning if ATRACE_NAME is empty
+
+    Context::PushState ps(rsc);
+    setupGLState(rsc);
+    setupScript(rsc);
+    // NOTE(review): the hook is called unchecked - confirm every driver sets it.
+    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot, ains, inLen,
+                                              aout, usr, usrBytes, sc);
+    delete traceLabel;  // Deleting NULL is a no-op, so no guard is needed.
+}
+
 void ScriptC::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
     ATRACE_CALL();
 
diff --git a/rsScriptC.h b/rsScriptC.h
index d26cde0..d3d9d51 100644
--- a/rsScriptC.h
+++ b/rsScriptC.h
@@ -50,6 +50,15 @@
                             size_t usrBytes,
                             const RsScriptCall *sc = NULL);
 
+    virtual void runForEach(Context *rsc,  // multi-input overload of runForEach; rsc: context of the launch
+                            uint32_t slot,  // index into the exported forEach function list
+                            const Allocation ** ains,  // array of input allocations
+                            size_t inLen,  // presumably the number of entries in ains - TODO confirm
+                            Allocation * aout,  // output allocation
+                            const void * usr,  // caller-supplied data, forwarded to the driver
+                            size_t usrBytes,  // size of usr, in bytes
+                            const RsScriptCall *sc = NULL);  // optional RsScriptCall; NULL when none is given
+
     virtual void serialize(Context *rsc, OStream *stream) const {    }
     virtual RsA3DClassID getClassId() const { return RS_A3D_CLASS_ID_SCRIPT_C; }
     static Type *createFromStream(Context *rsc, IStream *stream) { return NULL; }
diff --git a/rsScriptIntrinsic.cpp b/rsScriptIntrinsic.cpp
index ab439e6..86f1c50 100644
--- a/rsScriptIntrinsic.cpp
+++ b/rsScriptIntrinsic.cpp
@@ -67,6 +67,18 @@
     rsc->mHal.funcs.script.invokeForEach(rsc, this, slot, ain, aout, usr, usrBytes, sc);
 }
 
+// Multi-input variant of runForEach for intrinsics. It performs no work of
+// its own: every argument is passed through unchanged to the driver's
+// invokeForEachMulti hook, with this script as the target.
+void ScriptIntrinsic::runForEach(Context* rsc, uint32_t slot,
+                                 const Allocation** ains, size_t inLen,
+                                 Allocation* aout,
+                                 const void* usr, size_t usrBytes,
+                                 const RsScriptCall* sc) {
+    rsc->mHal.funcs.script.invokeForEachMulti(rsc, this, slot,
+                                              ains, inLen, aout, usr, usrBytes, sc);
+}
+
 void ScriptIntrinsic::Invoke(Context *rsc, uint32_t slot, const void *data, size_t len) {
 }
 
diff --git a/rsScriptIntrinsic.h b/rsScriptIntrinsic.h
index 696f2db..66b6031 100644
--- a/rsScriptIntrinsic.h
+++ b/rsScriptIntrinsic.h
@@ -48,6 +48,15 @@
                             size_t usrBytes,
                             const RsScriptCall *sc = NULL);
 
+    virtual void runForEach(Context* rsc,  // multi-input overload of runForEach; rsc: context of the launch
+                            uint32_t slot,  // kernel slot, passed through to the driver
+                            const Allocation** ains,  // array of input allocations
+                            size_t inLen,  // presumably the number of entries in ains - TODO confirm
+                            Allocation* aout,  // output allocation
+                            const void* usr,  // caller-supplied data, forwarded to the driver
+                            size_t usrBytes,  // size of usr, in bytes
+                            const RsScriptCall* sc = NULL);  // optional RsScriptCall; NULL when none is given
+
     virtual void Invoke(Context *rsc, uint32_t slot, const void *data, size_t len);
     virtual void setupScript(Context *rsc);
     virtual uint32_t run(Context *);
diff --git a/rs_hal.h b/rs_hal.h
index e96e606..467c815 100644
--- a/rs_hal.h
+++ b/rs_hal.h
@@ -113,6 +113,15 @@
                              ObjectBase *data);
 
         void (*destroy)(const Context *rsc, Script *s);
+        void (*invokeForEachMulti)(const Context *rsc,  // driver hook for multi-input forEach launches
+                                   Script *s,  // script being launched (callers pass `this`)
+                                   uint32_t slot,  // selects which forEach kernel to run
+                                   const Allocation ** ains,  // array of input allocations
+                                   size_t inLen,  // presumably the number of entries in ains - TODO confirm
+                                   Allocation * aout,  // output allocation
+                                   const void * usr,  // caller-supplied data blob
+                                   size_t usrLen,  // size of usr (callers pass their usrBytes value)
+                                   const RsScriptCall *sc);  // may be NULL when no RsScriptCall was supplied
     } script;
 
     struct {
@@ -278,4 +287,3 @@
 #endif
 
 #endif
-