Merge "Moving renderscript GL code into the HAL This change affects  - shaders  - meshes  - fonts  - quad rendering"
diff --git a/rs.spec b/rs.spec
index 998296b..0c4e1ed 100644
--- a/rs.spec
+++ b/rs.spec
@@ -66,7 +66,7 @@
     direct
 }
 
-aTypeCreate {
+TypeCreate {
     direct
     param RsElement e
     param uint32_t dimX
@@ -77,7 +77,7 @@
     ret RsType
 }
 
-aAllocationCreateTyped {
+AllocationCreateTyped {
     direct
     param RsType vtype
     param RsAllocationMipmapControl mips
@@ -85,7 +85,7 @@
     ret RsAllocation
 }
 
-aAllocationCreateFromBitmap {
+AllocationCreateFromBitmap {
     direct
     param RsType vtype
     param RsAllocationMipmapControl mips
@@ -94,7 +94,7 @@
     ret RsAllocation
 }
 
-aAllocationCubeCreateFromBitmap {
+AllocationCubeCreateFromBitmap {
     direct
     param RsType vtype
     param RsAllocationMipmapControl mips
@@ -188,7 +188,6 @@
 
 
 Allocation1DData {
-	handcodeApi
 	param RsAllocation va
 	param uint32_t xoff
 	param uint32_t lod
@@ -197,7 +196,6 @@
 	}
 
 Allocation1DElementData {
-	handcodeApi
 	param RsAllocation va
 	param uint32_t x
 	param uint32_t lod
@@ -290,12 +288,19 @@
 	}
 
 ScriptInvokeV {
-	handcodeApi
 	param RsScript s
 	param uint32_t slot
 	param const void * data
 	}
 
+ScriptForEach { 
+    param RsScript s
+    param uint32_t slot
+    param RsAllocation ain
+    param RsAllocation aout
+    param const void * usr
+}
+
 ScriptSetVarI {
 	param RsScript s
 	param uint32_t slot
@@ -327,7 +332,6 @@
 	}
 
 ScriptSetVarV {
-	handcodeApi
 	param RsScript s
 	param uint32_t slot
 	param const void * data
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index 743b2c4..b5f6f56 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -649,11 +649,12 @@
 //
 #ifndef ANDROID_RS_SERIALIZE
 
-static void rsaAllocationGenerateScriptMips(RsContext con, RsAllocation va);
 
 namespace android {
 namespace renderscript {
 
+static void AllocationGenerateScriptMips(RsContext con, RsAllocation va);
+
 void rsi_AllocationUploadToTexture(Context *rsc, RsAllocation va, bool genmip, uint32_t baseMipLevel) {
     Allocation *alloc = static_cast<Allocation *>(va);
     alloc->deferredUploadToTexture(rsc);
@@ -740,7 +741,7 @@
 
 void rsi_AllocationGenerateMipmaps(Context *rsc, RsAllocation va) {
     Allocation *texAlloc = static_cast<Allocation *>(va);
-    rsaAllocationGenerateScriptMips(rsc, texAlloc);
+    AllocationGenerateScriptMips(rsc, texAlloc);
 }
 
 void rsi_AllocationCopyToBitmap(Context *rsc, RsAllocation va, void *data, size_t dataLen) {
@@ -795,10 +796,7 @@
     a->resize2D(rsc, dimX, dimY);
 }
 
-}
-}
-
-static void rsaAllocationGenerateScriptMips(RsContext con, RsAllocation va) {
+static void AllocationGenerateScriptMips(RsContext con, RsAllocation va) {
     Context *rsc = static_cast<Context *>(con);
     Allocation *texAlloc = static_cast<Allocation *>(va);
     uint32_t numFaces = texAlloc->getType()->getDimFaces() ? 6 : 1;
@@ -815,29 +813,20 @@
     }
 }
 
-const void * rsaAllocationGetType(RsContext con, RsAllocation va) {
-    Allocation *a = static_cast<Allocation *>(va);
-    a->getType()->incUserRef();
-
-    return a->getType();
-}
-
-RsAllocation rsaAllocationCreateTyped(RsContext con, RsType vtype,
-                                      RsAllocationMipmapControl mips,
-                                      uint32_t usages) {
-    Context *rsc = static_cast<Context *>(con);
+RsAllocation rsi_AllocationCreateTyped(Context *rsc, RsType vtype,
+                                       RsAllocationMipmapControl mips,
+                                       uint32_t usages) {
     Allocation * alloc = new Allocation(rsc, static_cast<Type *>(vtype), usages, mips);
     alloc->incUserRef();
     return alloc;
 }
 
-RsAllocation rsaAllocationCreateFromBitmap(RsContext con, RsType vtype,
-                                           RsAllocationMipmapControl mips,
-                                           const void *data, size_t data_length, uint32_t usages) {
-    Context *rsc = static_cast<Context *>(con);
+RsAllocation rsi_AllocationCreateFromBitmap(Context *rsc, RsType vtype,
+                                            RsAllocationMipmapControl mips,
+                                            const void *data, size_t data_length, uint32_t usages) {
     Type *t = static_cast<Type *>(vtype);
 
-    RsAllocation vTexAlloc = rsaAllocationCreateTyped(rsc, vtype, mips, usages);
+    RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, vtype, mips, usages);
     Allocation *texAlloc = static_cast<Allocation *>(vTexAlloc);
     if (texAlloc == NULL) {
         LOGE("Memory allocation failure");
@@ -846,23 +835,22 @@
 
     memcpy(texAlloc->getPtr(), data, t->getDimX() * t->getDimY() * t->getElementSizeBytes());
     if (mips == RS_ALLOCATION_MIPMAP_FULL) {
-        rsaAllocationGenerateScriptMips(rsc, texAlloc);
+        AllocationGenerateScriptMips(rsc, texAlloc);
     }
 
     texAlloc->deferredUploadToTexture(rsc);
     return texAlloc;
 }
 
-RsAllocation rsaAllocationCubeCreateFromBitmap(RsContext con, RsType vtype,
-                                               RsAllocationMipmapControl mips,
-                                               const void *data, size_t data_length, uint32_t usages) {
-    Context *rsc = static_cast<Context *>(con);
+RsAllocation rsi_AllocationCubeCreateFromBitmap(Context *rsc, RsType vtype,
+                                                RsAllocationMipmapControl mips,
+                                                const void *data, size_t data_length, uint32_t usages) {
     Type *t = static_cast<Type *>(vtype);
 
     // Cubemap allocation's faces should be Width by Width each.
     // Source data should have 6 * Width by Width pixels
     // Error checking is done in the java layer
-    RsAllocation vTexAlloc = rsaAllocationCreateTyped(rsc, t, mips, usages);
+    RsAllocation vTexAlloc = rsi_AllocationCreateTyped(rsc, vtype, mips, usages);
     Allocation *texAlloc = static_cast<Allocation *>(vTexAlloc);
     if (texAlloc == NULL) {
         LOGE("Memory allocation failure");
@@ -887,11 +875,21 @@
     }
 
     if (mips == RS_ALLOCATION_MIPMAP_FULL) {
-        rsaAllocationGenerateScriptMips(rsc, texAlloc);
+        AllocationGenerateScriptMips(rsc, texAlloc);
     }
 
     texAlloc->deferredUploadToTexture(rsc);
     return texAlloc;
 }
 
+}
+}
+
+const void * rsaAllocationGetType(RsContext con, RsAllocation va) {
+    Allocation *a = static_cast<Allocation *>(va);
+    a->getType()->incUserRef();
+    // A user reference was just taken on the returned Type; con is not used here.
+    return a->getType();
+}
+
 #endif //ANDROID_RS_SERIALIZE
diff --git a/rsContext.cpp b/rsContext.cpp
index a07ded3..6d63f67 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -592,7 +592,7 @@
     *subID = d[0];
 
     //LOGE("getMessageToClient  %i %i", commandID, *subID);
-    if (bufferLen >= bytesData) {
+    if (bufferLen >= (*receiveLen)) {
         memcpy(data, d+1, *receiveLen);
         mIO.mToClient.next();
         return (RsMessageToClientType)commandID;
@@ -734,25 +734,21 @@
     rsc->destroyWorkerThreadResources();;
 }
 
-}
-}
-
-void rsContextDestroy(RsContext vcon) {
-    LOGV("rsContextDestroy %p", vcon);
-    Context *rsc = static_cast<Context *>(vcon);
+void rsi_ContextDestroy(Context *rsc) {
+    LOGV("rsContextDestroy %p", rsc);
     rsContextDestroyWorker(rsc);
     delete rsc;
-    LOGV("rsContextDestroy 2 %p", vcon);
+    LOGV("rsContextDestroy 2 %p", rsc);
 }
 
-RsContext rsContextCreate(RsDevice vdev, uint32_t version) {
+RsContext rsi_ContextCreate(RsDevice vdev, uint32_t version) {
     LOGV("rsContextCreate %p", vdev);
     Device * dev = static_cast<Device *>(vdev);
     Context *rsc = Context::createContext(dev, NULL);
     return rsc;
 }
 
-RsContext rsContextCreateGL(RsDevice vdev, uint32_t version,
+RsContext rsi_ContextCreateGL(RsDevice vdev, uint32_t version,
                             RsSurfaceConfig sc, uint32_t dpi) {
     LOGV("rsContextCreateGL %p", vdev);
     Device * dev = static_cast<Device *>(vdev);
@@ -762,32 +758,31 @@
     return rsc;
 }
 
-RsMessageToClientType rsContextPeekMessage(RsContext vrsc,
+RsMessageToClientType rsi_ContextPeekMessage(Context *rsc,
                                            size_t * receiveLen, size_t receiveLen_length,
                                            uint32_t * subID, size_t subID_length, bool wait) {
-    Context * rsc = static_cast<Context *>(vrsc);
     return rsc->peekMessageToClient(receiveLen, subID, wait);
 }
 
-RsMessageToClientType rsContextGetMessage(RsContext vrsc, void * data, size_t data_length,
+RsMessageToClientType rsi_ContextGetMessage(Context *rsc, void * data, size_t data_length,
                                           size_t * receiveLen, size_t receiveLen_length,
                                           uint32_t * subID, size_t subID_length, bool wait) {
-    Context * rsc = static_cast<Context *>(vrsc);
     rsAssert(subID_length == sizeof(uint32_t));
     rsAssert(receiveLen_length == sizeof(size_t));
     return rsc->getMessageToClient(data, receiveLen, subID, data_length, wait);
 }
 
-void rsContextInitToClient(RsContext vrsc) {
-    Context * rsc = static_cast<Context *>(vrsc);
+void rsi_ContextInitToClient(Context *rsc) {
     rsc->initToClient();
 }
 
-void rsContextDeinitToClient(RsContext vrsc) {
-    Context * rsc = static_cast<Context *>(vrsc);
+void rsi_ContextDeinitToClient(Context *rsc) {
     rsc->deinitToClient();
 }
 
+}
+}
+
 // Only to be called at a3d load time, before object is visible to user
 // not thread safe
 void rsaGetName(RsContext con, void * obj, const char **name) {
diff --git a/rsDevice.cpp b/rsDevice.cpp
index d7d03f6..849fd98 100644
--- a/rsDevice.cpp
+++ b/rsDevice.cpp
@@ -40,17 +40,20 @@
     }
 }
 
-RsDevice rsDeviceCreate() {
+namespace android {
+namespace renderscript {
+
+RsDevice rsi_DeviceCreate() {
     Device * d = new Device();
     return d;
 }
 
-void rsDeviceDestroy(RsDevice dev) {
+void rsi_DeviceDestroy(RsDevice dev) {
     Device * d = static_cast<Device *>(dev);
     delete d;
 }
 
-void rsDeviceSetConfig(RsDevice dev, RsDeviceParam p, int32_t value) {
+void rsi_DeviceSetConfig(RsDevice dev, RsDeviceParam p, int32_t value) {
     Device * d = static_cast<Device *>(dev);
     if (p == RS_DEVICE_PARAM_FORCE_SOFTWARE_GL) {
         d->mForceSW = value != 0;
@@ -59,3 +62,5 @@
     rsAssert(0);
 }
 
+}
+}
diff --git a/rsHandcode.h b/rsHandcode.h
index da51d95..e6b722c 100644
--- a/rsHandcode.h
+++ b/rsHandcode.h
@@ -7,90 +7,3 @@
     io->mToCore.commitSync(RS_CMD_ID_ContextFinish, size);
 }
 
-static inline void rsHCAPI_ScriptInvokeV (RsContext rsc, RsScript va, uint32_t slot, const void * data, size_t sizeBytes) {
-    ThreadIO *io = &((Context *)rsc)->mIO;
-    uint32_t size = sizeof(RS_CMD_ScriptInvokeV);
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        size += (sizeBytes + 3) & ~3;
-    }
-    RS_CMD_ScriptInvokeV *cmd = static_cast<RS_CMD_ScriptInvokeV *>(io->mToCore.reserve(size));
-    cmd->s = va;
-    cmd->slot = slot;
-    cmd->data_length = sizeBytes;
-    cmd->data = data;
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        cmd->data = (void *)(cmd+1);
-        memcpy(cmd+1, data, sizeBytes);
-        io->mToCore.commit(RS_CMD_ID_ScriptInvokeV, size);
-    } else {
-        io->mToCore.commitSync(RS_CMD_ID_ScriptInvokeV, size);
-    }
-}
-
-
-static inline void rsHCAPI_ScriptSetVarV (RsContext rsc, RsScript va, uint32_t slot, const void * data, size_t sizeBytes) {
-    ThreadIO *io = &((Context *)rsc)->mIO;
-    uint32_t size = sizeof(RS_CMD_ScriptSetVarV);
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        size += (sizeBytes + 3) & ~3;
-    }
-    RS_CMD_ScriptSetVarV *cmd = static_cast<RS_CMD_ScriptSetVarV *>(io->mToCore.reserve(size));
-    cmd->s = va;
-    cmd->slot = slot;
-    cmd->data_length = sizeBytes;
-    cmd->data = data;
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        cmd->data = (void *)(cmd+1);
-        memcpy(cmd+1, data, sizeBytes);
-        io->mToCore.commit(RS_CMD_ID_ScriptSetVarV, size);
-    } else {
-        io->mToCore.commitSync(RS_CMD_ID_ScriptSetVarV, size);
-    }
-}
-
-static inline void rsHCAPI_Allocation1DData (RsContext rsc, RsAllocation va, uint32_t xoff, uint32_t lod,
-                                             uint32_t count, const void * data, size_t sizeBytes) {
-    ThreadIO *io = &((Context *)rsc)->mIO;
-    uint32_t size = sizeof(RS_CMD_Allocation1DData);
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        size += (sizeBytes + 3) & ~3;
-    }
-    RS_CMD_Allocation1DData *cmd = static_cast<RS_CMD_Allocation1DData *>(io->mToCore.reserve(size));
-    cmd->va = va;
-    cmd->xoff = xoff;
-    cmd->lod = lod;
-    cmd->count = count;
-    cmd->data = data;
-    cmd->data_length = sizeBytes;
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        cmd->data = (void *)(cmd+1);
-        memcpy(cmd+1, data, sizeBytes);
-        io->mToCore.commit(RS_CMD_ID_Allocation1DData, size);
-    } else {
-        io->mToCore.commitSync(RS_CMD_ID_Allocation1DData, size);
-    }
-}
-
-static inline void rsHCAPI_Allocation1DElementData (RsContext rsc, RsAllocation va, uint32_t x, uint32_t lod,
-                                                    const void * data, size_t sizeBytes, uint32_t comp_offset) {
-    ThreadIO *io = &((Context *)rsc)->mIO;
-    uint32_t size = sizeof(RS_CMD_Allocation1DElementData);
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        size += (sizeBytes + 3) & ~3;
-    }
-    RS_CMD_Allocation1DElementData *cmd = static_cast<RS_CMD_Allocation1DElementData *>(io->mToCore.reserve(size));
-    cmd->va = va;
-    cmd->x = x;
-    cmd->lod = lod;
-    cmd->data = data;
-    cmd->comp_offset = comp_offset;
-    cmd->data_length = sizeBytes;
-    if (sizeBytes < DATA_SYNC_SIZE) {
-        cmd->data = (void *)(cmd+1);
-        memcpy(cmd+1, data, sizeBytes);
-        io->mToCore.commit(RS_CMD_ID_Allocation1DElementData, size);
-    } else {
-        io->mToCore.commitSync(RS_CMD_ID_Allocation1DElementData, size);
-    }
-}
-
diff --git a/rsScript.cpp b/rsScript.cpp
index b84014f..7641cab 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -87,6 +87,16 @@
     s->mEnviroment.mTimeZone = timeZone;
 }
 
+void rsi_ScriptForEach(Context *rsc, RsScript vs, uint32_t slot,
+                       RsAllocation vain, RsAllocation vaout,
+                       const void *params, size_t paramLen) {
+    // paramLen is size_t like the other pointer-length arguments; slot is not forwarded.
+    Script *s = static_cast<Script *>(vs);
+    s->runForEach(rsc,
+                  static_cast<const Allocation *>(vain), static_cast<Allocation *>(vaout),
+                  params, paramLen);
+}
+
 void rsi_ScriptInvoke(Context *rsc, RsScript vs, uint32_t slot) {
     Script *s = static_cast<Script *>(vs);
     s->Invoke(rsc, slot, NULL, 0);
diff --git a/rsThreadIO.cpp b/rsThreadIO.cpp
index 6cf07de..6e959a7 100644
--- a/rsThreadIO.cpp
+++ b/rsThreadIO.cpp
@@ -58,7 +58,7 @@
             LOGE("playCoreCommands error con %p, cmd %i", con, cmdID);
             mToCore.printDebugData();
         }
-        gPlaybackFuncs[cmdID](con, data);
+        gPlaybackFuncs[cmdID](con, data, cmdSize << 2);
         mToCore.next();
     }
     return ret;
diff --git a/rsType.cpp b/rsType.cpp
index cd2be94..10e3182 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -274,17 +274,16 @@
 namespace android {
 namespace renderscript {
 
-}
-}
-
-RsType rsaTypeCreate(RsContext con, RsElement _e, uint32_t dimX,
+RsType rsi_TypeCreate(Context *rsc, RsElement _e, uint32_t dimX,
                      uint32_t dimY, uint32_t dimZ, bool mips, bool faces) {
-    Context *rsc = static_cast<Context *>(con);
     Element *e = static_cast<Element *>(_e);
 
     return Type::getType(rsc, e, dimX, dimY, dimZ, mips, faces);
 }
 
+}
+}
+
 void rsaTypeGetNativeData(RsContext con, RsType type, uint32_t *typeData, uint32_t typeDataSize) {
     rsAssert(typeDataSize == 6);
     // Pack the data in the follofing way mDimX; mDimY; mDimZ;
diff --git a/rsg_generator.c b/rsg_generator.c
index 14b380a..0e914fc 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -53,6 +53,10 @@
             fprintf(f, "*");
         }
     }
+}
+
+void printVarTypeAndName(FILE *f, const VarType *vt) {
+    printVarType(f, vt);
 
     if (vt->name[0]) {
         fprintf(f, " %s", vt->name);
@@ -65,7 +69,7 @@
         if (ct || assumePrevious) {
             fprintf(f, ", ");
         }
-        printVarType(f, &api->params[ct]);
+        printVarTypeAndName(f, &api->params[ct]);
     }
 }
 
@@ -86,16 +90,27 @@
 
         for (ct2=0; ct2 < api->paramCount; ct2++) {
             fprintf(f, "    ");
-            printVarType(f, &api->params[ct2]);
+            printVarTypeAndName(f, &api->params[ct2]);
             fprintf(f, ";\n");
         }
         fprintf(f, "};\n\n");
     }
 }
 
-void printFuncDecl(FILE *f, const ApiEntry *api, const char *prefix, int addContext) {
-    printVarType(f, &api->ret);
-    fprintf(f, " %s%s (", prefix, api->name);
+void printFuncDecl(FILE *f, const ApiEntry *api, const char *prefix, int addContext, int isFnPtr) {
+    printVarTypeAndName(f, &api->ret);
+    if (isFnPtr) {
+        char t[1024];
+        strcpy(t, api->name);
+        if (strlen(prefix) == 0) {
+            if (t[0] > 'A' && t[0] < 'Z') {
+                t[0] -= 'A' - 'a';
+            }
+        }
+        fprintf(f, " (* %s%s) (", prefix, api->name);
+    } else {
+        fprintf(f, " %s%s (", prefix, api->name);
+    }
     if (!api->nocontext) {
         if (addContext) {
             fprintf(f, "Context *");
@@ -110,12 +125,24 @@
 void printFuncDecls(FILE *f, const char *prefix, int addContext) {
     int ct;
     for (ct=0; ct < apiCount; ct++) {
-        printFuncDecl(f, &apis[ct], prefix, addContext);
+        printFuncDecl(f, &apis[ct], prefix, addContext, 0);
         fprintf(f, ";\n");
     }
     fprintf(f, "\n\n");
 }
 
+void printFuncPointers(FILE *f, int addContext) {
+    const ApiEntry *entry;  // cursor over apis; one struct member is emitted per entry
+    fprintf(f, "\n");
+    fprintf(f, "typedef struct RsApiEntrypoints {\n");
+    for (entry = apis; entry < apis + apiCount; entry++) {
+        fprintf(f, "    ");
+        printFuncDecl(f, entry, "", addContext, 1);  // last argument selects the "(* name)" form
+        fprintf(f, ";\n");
+    }
+    fprintf(f, "} RsApiEntrypoints_t;\n\n");
+}
+
 void printPlaybackFuncs(FILE *f, const char *prefix) {
     int ct;
     for (ct=0; ct < apiCount; ct++) {
@@ -127,6 +154,33 @@
     }
 }
 
+// Returns 1 when the call's pointer arguments could be copied inline after the
+// command struct: every pointer param is a single-level const pointer and at
+// least one exists. Currently always returns 0 (see the forced guard below).
+static int hasInlineDataPointers(const ApiEntry * api) {
+    int sawConstPointer = 0;
+    int idx;
+    // Temporarily disable inbanding while we sort through the bugs: the
+    // leading "1 ||" makes this guard unconditional. Sync calls and calls
+    // with a return value are excluded regardless.
+    if (1 || api->sync || api->ret.typeName[0]) {
+        return 0;
+    }
+    for (idx = 0; idx < api->paramCount; idx++) {
+        const VarType *param = &api->params[idx];
+        if (!param->ptrLevel) {
+            continue;   // non-pointer params never affect the answer
+        }
+        // Non-const pointers cannot be inlined; multi-level pointers are
+        // not handled yet.
+        if (!param->isConst || param->ptrLevel > 1) {
+            return 0;
+        }
+        sawConstPointer = 1;   // single-level const pointer: inlinable
+    }
+    return sawConstPointer;
+}
+
 void printApiCpp(FILE *f) {
     int ct;
     int ct2;
@@ -142,52 +196,139 @@
     fprintf(f, "#include \"rsHandcode.h\"\n");
     fprintf(f, "\n");
 
+    printFuncPointers(f, 0);
+
+    // Generate RS funcs for local fifo
     for (ct=0; ct < apiCount; ct++) {
         int needFlush = 0;
         const ApiEntry * api = &apis[ct];
 
-        if (api->direct) {
-            continue;
-        }
-
-        printFuncDecl(f, api, "rs", 0);
+        fprintf(f, "static ");
+        printFuncDecl(f, api, "LF_", 0, 0);
         fprintf(f, "\n{\n");
-        if (api->handcodeApi) {
-            fprintf(f, "    rsHCAPI_%s(rsc", api->name);
+        if (api->handcodeApi || api->direct) {
+            if (api->handcodeApi) {
+                fprintf(f, "    rsHCAPI_%s(rsc", api->name);
+            } else {
+                fprintf(f, "    ");
+                if (api->ret.typeName[0]) {
+                    fprintf(f, "return ");
+                }
+                fprintf(f, "rsi_%s(", api->name);
+                if (!api->nocontext) {
+                    fprintf(f, "(Context *)rsc");
+                }
+            }
             for (ct2=0; ct2 < api->paramCount; ct2++) {
                 const VarType *vt = &api->params[ct2];
-                fprintf(f, ", %s", vt->name);
+                if (ct2 > 0 || !api->nocontext) {
+                    fprintf(f, ", ");
+                }
+                fprintf(f, "%s", vt->name);
             }
             fprintf(f, ");\n");
         } else {
             fprintf(f, "    ThreadIO *io = &((Context *)rsc)->mIO;\n");
-            //fprintf(f, "    LOGE(\"add command %s\\n\");\n", api->name);
-            fprintf(f, "    RS_CMD_%s *cmd = static_cast<RS_CMD_%s *>(io->mToCore.reserve(sizeof(RS_CMD_%s)));\n", api->name, api->name, api->name);
             fprintf(f, "    uint32_t size = sizeof(RS_CMD_%s);\n", api->name);
+            if (hasInlineDataPointers(api)) {
+                fprintf(f, "    uint32_t dataSize = 0;\n");
+                for (ct2=0; ct2 < api->paramCount; ct2++) {
+                    const VarType *vt = &api->params[ct2];
+                    if (vt->isConst && vt->ptrLevel) {
+                        fprintf(f, "    dataSize += %s_length;\n", vt->name);
+                    }
+                }
+            }
+
+            //fprintf(f, "    LOGE(\"add command %s\\n\");\n", api->name);
+            if (hasInlineDataPointers(api)) {
+                fprintf(f, "    RS_CMD_%s *cmd = static_cast<RS_CMD_%s *>(io->mToCore.reserve(dataSize + sizeof(RS_CMD_%s)));\n", api->name, api->name, api->name);
+                fprintf(f, "    uint8_t *payload = (uint8_t *)&cmd[1];\n");
+            } else {
+                fprintf(f, "    RS_CMD_%s *cmd = static_cast<RS_CMD_%s *>(io->mToCore.reserve(sizeof(RS_CMD_%s)));\n", api->name, api->name, api->name);
+            }
 
             for (ct2=0; ct2 < api->paramCount; ct2++) {
                 const VarType *vt = &api->params[ct2];
                 needFlush += vt->ptrLevel;
-                fprintf(f, "    cmd->%s = %s;\n", vt->name, vt->name);
+                if (vt->ptrLevel && hasInlineDataPointers(api)) {
+                    fprintf(f, "    if (dataSize < 1024) {\n");
+                    fprintf(f, "        memcpy(payload, %s, %s_length);\n", vt->name, vt->name);
+                    fprintf(f, "        cmd->%s = (", vt->name);
+                    printVarType(f, vt);
+                    fprintf(f, ")payload;\n");
+                    fprintf(f, "        payload += %s_length;\n", vt->name);
+                    fprintf(f, "    } else {\n");
+                    fprintf(f, "        cmd->%s = %s;\n", vt->name, vt->name);
+                    fprintf(f, "    }\n");
+
+                } else {
+                    fprintf(f, "    cmd->%s = %s;\n", vt->name, vt->name);
+                }
             }
             if (api->ret.typeName[0]) {
                 needFlush = 1;
             }
 
-            fprintf(f, "    io->mToCore.commit");
-            if (needFlush) {
-                fprintf(f, "Sync");
+            if (hasInlineDataPointers(api)) {
+                fprintf(f, "    if (dataSize < 1024) {\n");
+                fprintf(f, "        io->mToCore.commit(RS_CMD_ID_%s, size + dataSize);\n", api->name);
+                fprintf(f, "    } else {\n");
+                fprintf(f, "        io->mToCore.commitSync(RS_CMD_ID_%s, size);\n", api->name);
+                fprintf(f, "    }\n");
+            } else {
+                fprintf(f, "    io->mToCore.commit");
+                if (needFlush) {
+                    fprintf(f, "Sync");
+                }
+                fprintf(f, "(RS_CMD_ID_%s, size);\n", api->name);
             }
-            fprintf(f, "(RS_CMD_ID_%s, size);\n", api->name);
 
             if (api->ret.typeName[0]) {
                 fprintf(f, "    return reinterpret_cast<");
-                printVarType(f, &api->ret);
+                printVarTypeAndName(f, &api->ret);
                 fprintf(f, ">(io->mToCoreRet);\n");
             }
         }
         fprintf(f, "};\n\n");
     }
+
+    fprintf(f, "\n");
+    fprintf(f, "static RsApiEntrypoints_t s_LocalTable = {\n");
+    for (ct=0; ct < apiCount; ct++) {
+        fprintf(f, "    LF_%s,\n", apis[ct].name);
+    }
+    fprintf(f, "};\n");
+
+    fprintf(f, "static RsApiEntrypoints_t *s_CurrentTable = &s_LocalTable;\n\n");
+
+    for (ct=0; ct < apiCount; ct++) {
+        int needFlush = 0;
+        const ApiEntry * api = &apis[ct];
+
+        printFuncDecl(f, api, "rs", 0, 0);
+        fprintf(f, "\n{\n");
+        fprintf(f, "    ");
+        if (api->ret.typeName[0]) {
+            fprintf(f, "return ");
+        }
+        fprintf(f, "s_CurrentTable->%s(", api->name);
+
+        if (!api->nocontext) {
+            fprintf(f, "(Context *)rsc");
+        }
+
+        for (ct2=0; ct2 < api->paramCount; ct2++) {
+            const VarType *vt = &api->params[ct2];
+            if (ct2 > 0 || !api->nocontext) {
+                fprintf(f, ", ");
+            }
+            fprintf(f, "%s", vt->name);
+        }
+        fprintf(f, ");\n");
+        fprintf(f, "}\n\n");
+    }
+
 }
 
 void printPlaybackCpp(FILE *f) {
@@ -212,11 +353,12 @@
             continue;
         }
 
-        fprintf(f, "void rsp_%s(Context *con, const void *vp)\n", api->name);
+        fprintf(f, "void rsp_%s(Context *con, const void *vp, size_t cmdSizeBytes)\n", api->name);
         fprintf(f, "{\n");
 
         //fprintf(f, "    LOGE(\"play command %s\\n\");\n", api->name);
         fprintf(f, "    const RS_CMD_%s *cmd = static_cast<const RS_CMD_%s *>(vp);\n", api->name, api->name);
+
         fprintf(f, "    ");
         if (api->ret.typeName[0]) {
             fprintf(f, "con->mIO.mToCoreRet = (intptr_t)");
@@ -246,6 +388,8 @@
     fprintf(f, "};\n");
 }
 
+void yylex();
+
 int main(int argc, char **argv) {
     if (argc != 3) {
         fprintf(stderr, "usage: %s commandFile outFile\n", argv[0]);
@@ -280,7 +424,7 @@
             printStructures(f);
             printFuncDecls(f, "rsi_", 1);
             printPlaybackFuncs(f, "rsp_");
-            fprintf(f, "\n\ntypedef void (*RsPlaybackFunc)(Context *, const void *);\n");
+            fprintf(f, "\n\ntypedef void (*RsPlaybackFunc)(Context *, const void *, size_t sizeBytes);\n");
             fprintf(f, "extern RsPlaybackFunc gPlaybackFuncs[%i];\n", apiCount + 1);
 
             fprintf(f, "}\n");
@@ -306,6 +450,21 @@
             printPlaybackCpp(f);
         }
         break;
+
+        // Each output kind ends with its own break so one run emits a single file body.
+        case '4': // rsgApiStream.cpp
+        {
+            printFileHeader(f);
+            printPlaybackCpp(f);
+        }
+        break;
+
+        case '5': // rsgApiStreamReplay.cpp
+        {
+            printFileHeader(f);
+            printPlaybackCpp(f);
+        }
+        break;
     }
     fclose(f);
     return 0;