From 3aef90a6bce39a617e1f7c2abff27a8dc0799db4 Mon Sep 17 00:00:00 2001
From: marauder2k7
Date: Mon, 22 Dec 2025 10:29:01 +0000
Subject: [PATCH] Update GFXTextureManager and GBitmap

GBitmap Changes:
Added all of the other formats we support to GBitmap.
GBitmap now supports cubemaps.
Added converters for all of these other formats.
Added stb_image_resize for extrudeMips so we can extrude mipmaps for all of the other formats.

GFXTextureManager Changes:
Can now directly make cubemaps and texture arrays based on the GFXTextureProfile.
API implementations for all of the functions that cubemaps and arrays needed.
---
 Engine/source/T3D/lighting/IBLUtilities.cpp   |   24 +-
 Engine/source/T3D/lighting/IBLUtilities.h     |   12 +-
 .../source/T3D/lighting/reflectionProbe.cpp   |   41 +-
 Engine/source/T3D/lighting/reflectionProbe.h  |    6 +-
 Engine/source/assets/assetManager.cpp         |   15 +
 Engine/source/environment/scatterSky.cpp      |    2 +-
 Engine/source/environment/waterObject.cpp     |    4 +-
 Engine/source/gfx/D3D11/gfxD3D11Target.cpp    |  116 +-
 Engine/source/gfx/D3D11/gfxD3D11Target.h      |    4 +-
 .../gfx/D3D11/gfxD3D11TextureManager.cpp      |  280 +-
 .../source/gfx/D3D11/gfxD3D11TextureManager.h |    3 +-
 .../gfx/D3D11/gfxD3D11TextureObject.cpp       |  404 +--
 .../source/gfx/D3D11/gfxD3D11TextureObject.h  |   74 +-
 Engine/source/gfx/Null/gfxNullDevice.cpp      |   15 +-
 Engine/source/gfx/bitmap/bitmapUtils.cpp      |  520 +++-
 Engine/source/gfx/bitmap/bitmapUtils.h        |  137 +-
 Engine/source/gfx/bitmap/cubemapSaver.cpp     |   49 +-
 Engine/source/gfx/bitmap/cubemapSaver.h       |    6 +-
 Engine/source/gfx/bitmap/ddsFile.cpp          |  106 +-
 Engine/source/gfx/bitmap/ddsFile.h            |    2 +
 Engine/source/gfx/bitmap/gBitmap.cpp          |  806 +++---
 Engine/source/gfx/bitmap/gBitmap.h            |   53 +-
 .../source/gfx/bitmap/loaders/bitmapSTB.cpp   |  245 +-
 .../source/gfx/bitmap/loaders/stb/stb_image.h |  357 +--
 .../bitmap/loaders/stb/stb_image_resize2.h    | 2260 ++++++++++-------
 Engine/source/gfx/gfxAPI.cpp                  |   34 +
 Engine/source/gfx/gfxAPI.h                    |    2 +
 Engine/source/gfx/gfxShader.cpp               |    8 +-
 Engine/source/gfx/gfxShader.h                 |    4 +-
 Engine/source/gfx/gfxTarget.h                 |    2 +-
 Engine/source/gfx/gfxTextureHandle.cpp        |   12 +-
 Engine/source/gfx/gfxTextureHandle.h          |    7 +-
 Engine/source/gfx/gfxTextureManager.cpp       |   13 +-
 Engine/source/gfx/gfxTextureManager.h         |    9 +-
 Engine/source/gfx/gfxTextureObject.cpp        |    2 +
 Engine/source/gfx/gfxTextureObject.h          |   14 +-
 Engine/source/gfx/gfxTextureProfile.cpp       |   15 +
 Engine/source/gfx/gfxTextureProfile.h         |   22 +-
 Engine/source/gfx/gl/gfxGLTextureManager.cpp  |  381 ++-
 Engine/source/gfx/gl/gfxGLTextureManager.h    |    5 +-
 Engine/source/gfx/gl/gfxGLTextureObject.cpp   |  241 +-
 Engine/source/gfx/gl/gfxGLTextureObject.h     |   12 +-
 Engine/source/gfx/gl/gfxGLTextureTarget.cpp   |  229 +-
 Engine/source/gfx/gl/gfxGLTextureTarget.h     |   22 +-
 Engine/source/gfx/gl/sdl/gfxGLDevice.sdl.cpp  |   25 +-
 Engine/source/gfx/sim/cubemapData.cpp         |   32 +-
 Engine/source/gfx/sim/cubemapData.h           |    2 +-
 .../lighting/shadowMap/cubeLightShadowMap.cpp |   46 +-
 .../lighting/shadowMap/cubeLightShadowMap.h   |    9 -
 .../lighting/shadowMap/lightShadowMap.cpp     |    7 +
 .../lighting/shadowMap/lightShadowMap.h       |    1 +
 Engine/source/materials/materialDefinition.h  |    6 +-
 .../materials/processedCustomMaterial.cpp     |    4 +-
 Engine/source/materials/processedMaterial.h   |    2 +-
 .../materials/processedShaderMaterial.cpp     |    6 +-
 Engine/source/materials/sceneData.h           |    2 +-
 Engine/source/platformWin32/winAsmBlit.cpp    |    2 +-
 .../renderInstance/renderDeferredMgr.cpp      |    2 +-
 .../source/renderInstance/renderMeshMgr.cpp   |    2 +-
 .../source/renderInstance/renderPassManager.h |    2 +-
 .../source/renderInstance/renderProbeMgr.cpp  |   49 +-
Engine/source/renderInstance/renderProbeMgr.h | 10 +- .../renderInstance/renderTranslucentMgr.cpp | 2 +- Engine/source/scene/reflector.cpp | 27 +- Engine/source/scene/reflector.h | 6 +- Engine/source/ts/tsRenderState.h | 6 +- 66 files changed, 4235 insertions(+), 2590 deletions(-) diff --git a/Engine/source/T3D/lighting/IBLUtilities.cpp b/Engine/source/T3D/lighting/IBLUtilities.cpp index 27c0827f3..15793a392 100644 --- a/Engine/source/T3D/lighting/IBLUtilities.cpp +++ b/Engine/source/T3D/lighting/IBLUtilities.cpp @@ -31,7 +31,7 @@ namespace IBLUtilities { - void GenerateIrradianceMap(GFXTextureTargetRef renderTarget, GFXCubemapHandle cubemap, GFXCubemapHandle &cubemapOut) + void GenerateIrradianceMap(GFXTextureTargetRef renderTarget, GFXTexHandle cubemap, GFXTexHandle &cubemapOut) { GFXTransformSaver saver; @@ -65,11 +65,11 @@ namespace IBLUtilities GFX->setShaderConstBuffer(irrConsts); GFX->setStateBlock(irrStateBlock); GFX->setVertexBuffer(NULL); - GFX->setCubeTexture(0, cubemap); + GFX->setTexture(0, cubemap); for (U32 i = 0; i < 6; i++) { - renderTarget->attachTexture(GFXTextureTarget::Color0, cubemapOut, i); + renderTarget->attachTexture(GFXTextureTarget::Color0, cubemapOut, 0,0, i); irrConsts->setSafe(irrFaceSC, (S32)i); GFX->setActiveRenderTarget(renderTarget); GFX->clear(GFXClearTarget, LinearColorF::BLACK, 1.0f, 0); @@ -80,7 +80,7 @@ namespace IBLUtilities GFX->popActiveRenderTarget(); } - void GenerateAndSaveIrradianceMap(String outputPath, S32 resolution, GFXCubemapHandle cubemap, GFXCubemapHandle &cubemapOut) + void GenerateAndSaveIrradianceMap(String outputPath, S32 resolution, GFXTexHandle cubemap, GFXTexHandle &cubemapOut) { if (outputPath.isEmpty()) { @@ -101,7 +101,7 @@ namespace IBLUtilities } } - void SaveCubeMap(String outputPath, GFXCubemapHandle &cubemap) + void SaveCubeMap(String outputPath, GFXTexHandle &cubemap) { if (outputPath.isEmpty()) { @@ -118,7 +118,7 @@ namespace IBLUtilities } } - void GeneratePrefilterMap(GFXTextureTargetRef renderTarget, GFXCubemapHandle cubemap, U32 mipLevels, GFXCubemapHandle &cubemapOut) + void GeneratePrefilterMap(GFXTextureTargetRef renderTarget, GFXTexHandle cubemap, U32 mipLevels, GFXTexHandle &cubemapOut) { GFXTransformSaver saver; @@ -153,9 +153,9 @@ namespace IBLUtilities GFX->pushActiveRenderTarget(); GFX->setShader(prefilterShader); GFX->setShaderConstBuffer(prefilterConsts); - GFX->setCubeTexture(0, cubemap); + GFX->setTexture(0, cubemap); - U32 prefilterSize = cubemapOut->getSize(); + U32 prefilterSize = cubemapOut->getWidth(); U32 resolutionSize = prefilterSize; @@ -171,7 +171,7 @@ namespace IBLUtilities prefilterConsts->setSafe(prefilterRoughnessSC, roughness); prefilterConsts->setSafe(prefilterMipSizeSC, mipSize); U32 size = prefilterSize * mPow(0.5f, mip); - renderTarget->attachTexture(GFXTextureTarget::Color0, cubemapOut, face); + renderTarget->attachTexture(GFXTextureTarget::Color0, cubemapOut, 0,0 ,face); GFX->setActiveRenderTarget(renderTarget, false);//we set the viewport ourselves GFX->setViewport(RectI(0, 0, size, size)); GFX->clear(GFXClearTarget, LinearColorF::BLACK, 1.0f, 0); @@ -183,7 +183,7 @@ namespace IBLUtilities GFX->popActiveRenderTarget(); } - void GenerateAndSavePrefilterMap(String outputPath, S32 resolution, GFXCubemapHandle cubemap, U32 mipLevels, GFXCubemapHandle &cubemapOut) + void GenerateAndSavePrefilterMap(String outputPath, S32 resolution, GFXTexHandle cubemap, U32 mipLevels, GFXTexHandle &cubemapOut) { if (outputPath.isEmpty()) { @@ -504,7 +504,7 @@ namespace IBLUtilities //SH Calculations // 
From http://sunandblackcat.com/tipFullView.php?l=eng&topicid=32&topic=Spherical-Harmonics-From-Cube-Texture // With shader decode logic from https://github.com/nicknikolov/cubemap-sh - void calculateSHTerms(GFXCubemapHandle cubemap, LinearColorF SHTerms[9], F32 SHConstants[5]) + void calculateSHTerms(GFXTexHandle cubemap, LinearColorF SHTerms[9], F32 SHConstants[5]) { if (!cubemap) return; @@ -525,7 +525,7 @@ namespace IBLUtilities VectorF(0.0f, 0.0f, -1.0f), }; - U32 cubemapResolution = cubemap->getSize(); + U32 cubemapResolution = cubemap->getWidth(); GBitmap* cubeFaceBitmaps[6]; diff --git a/Engine/source/T3D/lighting/IBLUtilities.h b/Engine/source/T3D/lighting/IBLUtilities.h index 1a9c69337..12f4e4ace 100644 --- a/Engine/source/T3D/lighting/IBLUtilities.h +++ b/Engine/source/T3D/lighting/IBLUtilities.h @@ -38,13 +38,13 @@ namespace IBLUtilities { - void GenerateIrradianceMap(GFXTextureTargetRef renderTarget, GFXCubemapHandle cubemap, GFXCubemapHandle &cubemapOut); - void GenerateAndSaveIrradianceMap(String outputPath, S32 resolution, GFXCubemapHandle cubemap, GFXCubemapHandle &cubemapOut); + void GenerateIrradianceMap(GFXTextureTargetRef renderTarget, GFXTexHandle cubemap, GFXTexHandle& cubemapOut); + void GenerateAndSaveIrradianceMap(String outputPath, S32 resolution, GFXTexHandle cubemap, GFXTexHandle& cubemapOut); - void GeneratePrefilterMap(GFXTextureTargetRef renderTarget, GFXCubemapHandle cubemap, U32 mipLevels, GFXCubemapHandle &cubemapOut); - void GenerateAndSavePrefilterMap(String outputPath, S32 resolution, GFXCubemapHandle cubemap, U32 mipLevels, GFXCubemapHandle &cubemapOut); + void GeneratePrefilterMap(GFXTextureTargetRef renderTarget, GFXTexHandle cubemap, U32 mipLevels, GFXTexHandle &cubemapOut); + void GenerateAndSavePrefilterMap(String outputPath, S32 resolution, GFXTexHandle cubemap, U32 mipLevels, GFXTexHandle &cubemapOut); - void SaveCubeMap(String outputPath, GFXCubemapHandle &cubemap); + void SaveCubeMap(String outputPath, GFXTexHandle &cubemap); void bakeReflection(String outputPath, S32 resolution); @@ -60,7 +60,7 @@ namespace IBLUtilities //SH Calculations // From http://sunandblackcat.com/tipFullView.php?l=eng&topicid=32&topic=Spherical-Harmonics-From-Cube-Texture // With shader decode logic from https://github.com/nicknikolov/cubemap-sh - void calculateSHTerms(GFXCubemapHandle cubemap, LinearColorF SHTerms[9], F32 SHConstants[5]); + void calculateSHTerms(GFXTexHandle cubemap, LinearColorF SHTerms[9], F32 SHConstants[5]); F32 texelSolidAngle(F32 aU, F32 aV, U32 width, U32 height); diff --git a/Engine/source/T3D/lighting/reflectionProbe.cpp b/Engine/source/T3D/lighting/reflectionProbe.cpp index 6a272c10f..aedf93bc8 100644 --- a/Engine/source/T3D/lighting/reflectionProbe.cpp +++ b/Engine/source/T3D/lighting/reflectionProbe.cpp @@ -86,6 +86,13 @@ ImplementEnumType(ReflectionModeEnum, //{ ReflectionProbe::DynamicCubemap, "Dynamic Cubemap", "Uses a cubemap baked from the probe's current position, updated at a set rate" }, EndImplementEnumType; +void ReflectionProbe::ProbeInfo::clear() +{ + mPrefilterCubemap.free(); + mIrradianceCubemap.free(); +} + + //----------------------------------------------------------------------------- // Object setup and teardown //----------------------------------------------------------------------------- @@ -146,11 +153,18 @@ ReflectionProbe::~ReflectionProbe() if (mReflectionModeType == StaticCubemap && mStaticCubemap) mStaticCubemap->deleteObject(); - if (mIrridianceMap) - mIrridianceMap->deleteObject(); + mProbeInfo.clear(); + + 
if (mIrridianceMap) { + if (mIrridianceMap->isProperlyAdded() && !mIrridianceMap->isRemoved()) + mIrridianceMap->deleteObject(); + } if (mPrefilterMap) - mPrefilterMap->deleteObject(); + { + if (mPrefilterMap->isProperlyAdded() && !mPrefilterMap->isRemoved()) + mPrefilterMap->deleteObject(); + } } //----------------------------------------------------------------------------- @@ -603,7 +617,7 @@ void ReflectionProbe::processBakedCubemap() return; String irrPath = getIrradianceMapPath(); - if (Platform::isFile(irrPath)) + if ((mIrridianceMap == nullptr || mIrridianceMap->mCubemap.isNull()) && Platform::isFile(irrPath)) { mIrridianceMap->setCubemapFile(FileName(irrPath)); mIrridianceMap->updateFaces(); @@ -616,7 +630,7 @@ void ReflectionProbe::processBakedCubemap() } String prefilPath = getPrefilterMapPath(); - if (Platform::isFile(prefilPath)) + if ((mPrefilterMap == nullptr || mPrefilterMap->mCubemap.isNull()) && Platform::isFile(prefilPath)) { mPrefilterMap->setCubemapFile(FileName(prefilPath)); mPrefilterMap->updateFaces(); @@ -631,7 +645,7 @@ void ReflectionProbe::processBakedCubemap() mProbeInfo.mPrefilterCubemap = mPrefilterMap->mCubemap; mProbeInfo.mIrradianceCubemap = mIrridianceMap->mCubemap; - if (mEnabled && mProbeInfo.mPrefilterCubemap->isInitialized() && mProbeInfo.mIrradianceCubemap->isInitialized()) + if (mEnabled && !mProbeInfo.mPrefilterCubemap.isNull() && !mProbeInfo.mIrradianceCubemap.isNull()) { //mProbeInfo.mIsEnabled = true; @@ -698,7 +712,7 @@ void ReflectionProbe::processStaticCubemap() return; } - if (mStaticCubemap->mCubemap == nullptr) + if (mStaticCubemap->mCubemap.isNull()) { mStaticCubemap->createMap(); mStaticCubemap->updateFaces(); @@ -706,13 +720,13 @@ void ReflectionProbe::processStaticCubemap() if (mUseHDRCaptures) { - mIrridianceMap->mCubemap->initDynamic(mPrefilterSize, GFXFormatR16G16B16A16F); - mPrefilterMap->mCubemap->initDynamic(mPrefilterSize, GFXFormatR16G16B16A16F); + mIrridianceMap->mCubemap.set(mPrefilterSize, mPrefilterSize, GFXFormatR16G16B16A16F, &GFXCubemapRenderTargetProfile, "ReflectionProbe::mIrridianceMap_HDR"); + mPrefilterMap->mCubemap.set(mPrefilterSize, mPrefilterSize, GFXFormatR16G16B16A16F, &GFXCubemapRenderTargetProfile, "ReflectionProbe::mPrefilterMap_HDR"); } else { - mIrridianceMap->mCubemap->initDynamic(mPrefilterSize, GFXFormatR8G8B8A8); - mPrefilterMap->mCubemap->initDynamic(mPrefilterSize, GFXFormatR8G8B8A8); + mIrridianceMap->mCubemap.set(mPrefilterSize, mPrefilterSize, GFXFormatR8G8B8A8, &GFXCubemapRenderTargetProfile, "ReflectionProbe::mIrridianceMap"); + mPrefilterMap->mCubemap.set(mPrefilterSize, mPrefilterSize, GFXFormatR8G8B8A8, &GFXCubemapRenderTargetProfile, "ReflectionProbe::mPrefilterMap"); } GFXTextureTargetRef renderTarget = GFX->allocRenderToTextureTarget(false); @@ -730,7 +744,7 @@ void ReflectionProbe::processStaticCubemap() mProbeInfo.mIrradianceCubemap = mIrridianceMap->mCubemap; } - if (mEnabled && mProbeInfo.mPrefilterCubemap->isInitialized() && mProbeInfo.mIrradianceCubemap->isInitialized()) + if (mEnabled && mProbeInfo.mPrefilterCubemap.isValid() && mProbeInfo.mIrradianceCubemap.isValid()) { mProbeInfo.mIsEnabled = true; @@ -1009,7 +1023,7 @@ void ReflectionProbe::setPreviewMatParameters(SceneRenderState* renderState, Bas GFX->setTexture(0, deferredTexObject); //Set the cubemap - GFX->setCubeTexture(1, mPrefilterMap->mCubemap); + GFX->setTexture(1, mPrefilterMap->mCubemap); //Set the invViewMat MatrixSet &matrixSet = renderState->getRenderPass()->getMatrixSet(); @@ -1036,3 +1050,4 @@ 
DefineEngineMethod(ReflectionProbe, Bake, void, (), , clientProbe->bake(); } } + diff --git a/Engine/source/T3D/lighting/reflectionProbe.h b/Engine/source/T3D/lighting/reflectionProbe.h index e6759d20f..da5dc8341 100644 --- a/Engine/source/T3D/lighting/reflectionProbe.h +++ b/Engine/source/T3D/lighting/reflectionProbe.h @@ -94,8 +94,8 @@ public: F32 mScore; - GFXCubemapHandle mPrefilterCubemap; - GFXCubemapHandle mIrradianceCubemap; + GFXTexHandle mPrefilterCubemap; + GFXTexHandle mIrradianceCubemap; /// The priority of this light used for /// light and shadow scoring. @@ -233,7 +233,7 @@ protected: /// StringTableEntry mCubemapName; CubemapData *mStaticCubemap; - GFXCubemapHandle mDynamicCubemap; + GFXTexHandle mDynamicCubemap; //String cubeDescName; //U32 cubeDescId; diff --git a/Engine/source/assets/assetManager.cpp b/Engine/source/assets/assetManager.cpp index 45ddb0c38..aab6cdc94 100644 --- a/Engine/source/assets/assetManager.cpp +++ b/Engine/source/assets/assetManager.cpp @@ -112,6 +112,21 @@ bool AssetManager::onAdd() void AssetManager::onRemove() { + // Remove all private assets explicitly before purge. + Vector assetDefinitions; + + // at this point all module assets should have been unloaded. + for (typeDeclaredAssetsHash::iterator assetItr = mDeclaredAssets.begin(); assetItr != mDeclaredAssets.end(); ++assetItr) + { + assetDefinitions.push_back(assetItr->value); + } + + for (Vector::iterator assetItr = assetDefinitions.begin(); assetItr != assetDefinitions.end(); ++assetItr) + { + AssetDefinition* pAssetDefinition = *assetItr; + unloadAsset(pAssetDefinition); + } + // Do we have an asset tags manifest? if ( !mAssetTagsManifest.isNull() ) { diff --git a/Engine/source/environment/scatterSky.cpp b/Engine/source/environment/scatterSky.cpp index a42560e33..40d0e5137 100644 --- a/Engine/source/environment/scatterSky.cpp +++ b/Engine/source/environment/scatterSky.cpp @@ -1090,7 +1090,7 @@ void ScatterSky::_render( ObjectRenderInst *ri, SceneRenderState *state, BaseMat if ( !mNightCubemap->mCubemap ) mNightCubemap->createMap(); - GFX->setCubeTexture( 0, mNightCubemap->mCubemap ); + GFX->setTexture( 0, mNightCubemap->mCubemap ); } else { diff --git a/Engine/source/environment/waterObject.cpp b/Engine/source/environment/waterObject.cpp index 66113f43d..0155717db 100644 --- a/Engine/source/environment/waterObject.cpp +++ b/Engine/source/environment/waterObject.cpp @@ -776,9 +776,9 @@ void WaterObject::setCustomTextures( S32 matIdx, U32 pass, const WaterMatParams } if ( ( matIdx == WaterMat || matIdx == BasicWaterMat ) && mCubemap ) - GFX->setCubeTexture( paramHandles.mCubemapSamplerSC->getSamplerRegister(pass), mCubemap->mCubemap ); + GFX->setTexture( paramHandles.mCubemapSamplerSC->getSamplerRegister(pass), mCubemap->mCubemap ); else if(paramHandles.mCubemapSamplerSC->getSamplerRegister(pass) != -1 ) - GFX->setCubeTexture( paramHandles.mCubemapSamplerSC->getSamplerRegister(pass), NULL ); + GFX->setTexture( paramHandles.mCubemapSamplerSC->getSamplerRegister(pass), NULL ); } void WaterObject::drawUnderwaterFilter( SceneRenderState *state ) diff --git a/Engine/source/gfx/D3D11/gfxD3D11Target.cpp b/Engine/source/gfx/D3D11/gfxD3D11Target.cpp index a9cfcd789..fb9438a6e 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11Target.cpp +++ b/Engine/source/gfx/D3D11/gfxD3D11Target.cpp @@ -38,6 +38,7 @@ GFXD3D11TextureTarget::GFXD3D11TextureTarget(bool genMips) mResolveTargets[i] = NULL; mTargetViews[i] = NULL; mTargetSRViews[i] = NULL; + mTargetArrayIdx[i] = 0; } mGenMips = genMips; @@ -57,9 +58,9 @@ 
GFXD3D11TextureTarget::~GFXD3D11TextureTarget() zombify(); } -void GFXD3D11TextureTarget::attachTexture( RenderSlot slot, GFXTextureObject *tex, U32 mipLevel/*=0*/, U32 zOffset /*= 0*/ ) +void GFXD3D11TextureTarget::attachTexture(RenderSlot slot, GFXTextureObject* tex, U32 mipLevel /*= 0*/, U32 zOffset /*= 0*/, U32 faceIndex /*= 0*/) { - GFXDEBUGEVENT_SCOPE( GFXPCD3D11TextureTarget_attachTexture, ColorI::RED ); + GFXDEBUGEVENT_SCOPE(GFXPCD3D11TextureTarget_attachTexture, ColorI::RED); AssertFatal(slot < MaxRenderSlotId, "GFXD3D11TextureTarget::attachTexture - out of range slot."); @@ -76,17 +77,17 @@ void GFXD3D11TextureTarget::attachTexture( RenderSlot slot, GFXTextureObject *te SAFE_RELEASE(mTargetViews[slot]); SAFE_RELEASE(mTargets[slot]); SAFE_RELEASE(mTargetSRViews[slot]); - mResolveTargets[slot] = NULL; + mTargetArrayIdx[slot] = 0; - if(slot == Color0) + if (slot == Color0) { mTargetSize = Point2I::Zero; mTargetFormat = GFXFormatR8G8B8A8; } // Are we clearing? - if(!tex) + if (!tex) { // Yup - just exit, it'll stay NULL. return; @@ -96,7 +97,7 @@ void GFXD3D11TextureTarget::attachTexture( RenderSlot slot, GFXTextureObject *te mTargetSRViews[slot] = NULL; // Take care of default targets - if( tex == GFXTextureTarget::sDefaultDepthStencil ) + if (tex == GFXTextureTarget::sDefaultDepthStencil) { mTargets[slot] = D3D11->mDeviceDepthStencil; mTargetViews[slot] = D3D11->mDeviceDepthStencilView; @@ -108,81 +109,100 @@ void GFXD3D11TextureTarget::attachTexture( RenderSlot slot, GFXTextureObject *te // Cast the texture object to D3D... AssertFatal(dynamic_cast(tex), "GFXD3D11TextureTarget::attachTexture - invalid texture object."); - GFXD3D11TextureObject *d3dto = dynamic_cast(tex); + GFXD3D11TextureObject* d3dto = dynamic_cast(tex); + bool isCube = d3dto->isCubeMap(); // Grab the surface level. - if( slot == DepthStencil ) - { + if (slot == DepthStencil) + { mTargets[slot] = d3dto->getSurface(); - if ( mTargets[slot] ) + if (mTargets[slot]) mTargets[slot]->AddRef(); mTargetViews[slot] = d3dto->getDSView(); - if( mTargetViews[slot]) - mTargetViews[slot]->AddRef(); + if (mTargetViews[slot]) + mTargetViews[slot]->AddRef(); } else - { - // getSurface will almost always return NULL. It will only return non-NULL - // if the surface that it needs to render to is different than the mip level - // in the actual texture. This will happen with MSAA. 
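// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: roughly how the new faceIndex
// path is driven by the callers earlier in this diff (IBLUtilities.cpp and
// reflectionProbe.cpp). A cubemap is now an ordinary GFXTexHandle created
// against GFXCubemapRenderTargetProfile (a profile this patch relies on), and
// each face is selected through the extra faceIndex argument of
// attachTexture(). The size, format and name below are placeholders.
GFXTexHandle exampleCube;
exampleCube.set(256, 256, GFXFormatR8G8B8A8, &GFXCubemapRenderTargetProfile, "example cubemap");

GFXTextureTargetRef exampleTarget = GFX->allocRenderToTextureTarget(false);
GFX->pushActiveRenderTarget();
for (U32 face = 0; face < 6; ++face)
{
   // slot, texture, mipLevel, zOffset, faceIndex
   exampleTarget->attachTexture(GFXTextureTarget::Color0, exampleCube, 0, 0, face);
   GFX->setActiveRenderTarget(exampleTarget);
   GFX->clear(GFXClearTarget, LinearColorF::BLACK, 1.0f, 0);
   // ... draw this face ...
}
GFX->popActiveRenderTarget();
// ---------------------------------------------------------------------------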
- if( d3dto->getSurface() == NULL ) + { + if (!isCube) { - - mTargets[slot] = d3dto->get2DTex(); - mTargets[slot]->AddRef(); - mTargetViews[slot] = d3dto->getRTView(); - mTargetViews[slot]->AddRef(); - } - else - { - mTargets[slot] = d3dto->getSurface(); - mTargets[slot]->AddRef(); - mTargetViews[slot]->AddRef(); - mResolveTargets[slot] = d3dto; - - if ( tex && slot == Color0 ) + if (d3dto->getSurface() == NULL) { - mTargetSize.set( tex->getSize().x, tex->getSize().y ); - mTargetFormat = tex->getFormat(); + mTargets[slot] = d3dto->get2DTex(); + mTargets[slot]->AddRef(); + mTargetViews[slot] = d3dto->getRTView(); + mTargetViews[slot]->AddRef(); + } + else + { + mTargets[slot] = d3dto->getSurface(); + mTargets[slot]->AddRef(); + mTargetViews[slot] = d3dto->getRTView(); + mTargetViews[slot]->AddRef(); + mResolveTargets[slot] = d3dto; + } + } + else + { + // Cubemap render target face + mGenMips = false; + AssertFatal(faceIndex < 6, "Invalid cubemap face index!"); + ID3D11RenderTargetView* faceRTV = d3dto->getCubeFaceRTView(faceIndex); + AssertFatal(faceRTV, "Cubemap face RTV is null!"); + + mTargetArrayIdx[slot] = faceIndex; + + if (d3dto->getSurface() == NULL) + { + mTargets[slot] = d3dto->get2DTex(); + mTargets[slot]->AddRef(); + mTargetViews[slot] = faceRTV; + mTargetViews[slot]->AddRef(); + } + else + { + mTargets[slot] = d3dto->getSurface(); + mTargets[slot]->AddRef(); + mTargetViews[slot] = faceRTV; + mTargetViews[slot]->AddRef(); + mResolveTargets[slot] = d3dto; } } + // For mip generation if (mGenMips) { mTargetSRViews[slot] = d3dto->getSRView(); - mTargetSRViews[slot]->AddRef(); + if (mTargetSRViews[slot]) + mTargetSRViews[slot]->AddRef(); } } - // Update surface size - if(slot == Color0) + // Update color target info + if (slot == Color0) { - ID3D11Texture2D *surface = mTargets[Color0]; - if ( surface ) + ID3D11Texture2D* surface = mTargets[Color0]; + if (surface) { D3D11_TEXTURE2D_DESC sd; surface->GetDesc(&sd); mTargetSize = Point2I(sd.Width, sd.Height); S32 format = sd.Format; - if (format == DXGI_FORMAT_R8G8B8A8_TYPELESS || format == DXGI_FORMAT_B8G8R8A8_TYPELESS) - { mTargetFormat = GFXFormatR8G8B8A8; - return; + else + { + GFXREVERSE_LOOKUP(GFXD3D11TextureFormat, GFXFormat, format); + mTargetFormat = (GFXFormat)format; } - - GFXREVERSE_LOOKUP( GFXD3D11TextureFormat, GFXFormat, format ); - mTargetFormat = (GFXFormat)format; } } } - } - void GFXD3D11TextureTarget::attachTexture( RenderSlot slot, GFXCubemap *tex, U32 face, U32 mipLevel/*=0*/ ) { GFXDEBUGEVENT_SCOPE( GFXPCD3D11TextureTarget_attachTexture_Cubemap, ColorI::RED ); @@ -316,7 +336,9 @@ void GFXD3D11TextureTarget::resolveTo( GFXTextureObject *tex ) D3D11_TEXTURE2D_DESC desc; mTargets[Color0]->GetDesc(&desc); - D3D11DEVICECONTEXT->CopySubresourceRegion(((GFXD3D11TextureObject*)(tex))->get2DTex(), 0, 0, 0, 0, mTargets[Color0], 0, NULL); + UINT mipLevels = desc.MipLevels ? 
desc.MipLevels : 1; + UINT subResource = D3D11CalcSubresource(0, mTargetArrayIdx[Color0], mipLevels); + D3D11DEVICECONTEXT->CopySubresourceRegion(((GFXD3D11TextureObject*)(tex))->get2DTex(), 0, 0, 0, 0, mTargets[Color0], subResource, NULL); } diff --git a/Engine/source/gfx/D3D11/gfxD3D11Target.h b/Engine/source/gfx/D3D11/gfxD3D11Target.h index 1173a4ee4..7d96ef34a 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11Target.h +++ b/Engine/source/gfx/D3D11/gfxD3D11Target.h @@ -49,6 +49,8 @@ class GFXD3D11TextureTarget : public GFXTextureTarget GFXFormat mTargetFormat; + U32 mTargetArrayIdx[MaxRenderSlotId]; + public: GFXD3D11TextureTarget(bool genMips); @@ -57,7 +59,7 @@ public: // Public interface. const Point2I getSize() override { return mTargetSize; } GFXFormat getFormat() override { return mTargetFormat; } - void attachTexture(RenderSlot slot, GFXTextureObject *tex, U32 mipLevel=0, U32 zOffset = 0) override; + void attachTexture(RenderSlot slot, GFXTextureObject* tex, U32 mipLevel = 0, U32 zOffset = 0, U32 faceIndex = 0) override; void attachTexture(RenderSlot slot, GFXCubemap *tex, U32 face, U32 mipLevel=0) override; void resolve() override; diff --git a/Engine/source/gfx/D3D11/gfxD3D11TextureManager.cpp b/Engine/source/gfx/D3D11/gfxD3D11TextureManager.cpp index e24a6f5dd..74ff91351 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11TextureManager.cpp +++ b/Engine/source/gfx/D3D11/gfxD3D11TextureManager.cpp @@ -52,7 +52,8 @@ void GFXD3D11TextureManager::_innerCreateTexture( GFXD3D11TextureObject *retTex, GFXTextureProfile *profile, U32 numMipLevels, bool forceMips, - S32 antialiasLevel) + S32 antialiasLevel, + U32 arraySize) { U32 usage = 0; U32 bindFlags = 0; @@ -67,6 +68,9 @@ void GFXD3D11TextureManager::_innerCreateTexture( GFXD3D11TextureObject *retTex, retTex->isManaged = false; DXGI_FORMAT d3dTextureFormat = GFXD3D11TextureFormat[format]; + if (retTex->isCubeMap()) + miscFlags |= D3D11_RESOURCE_MISC_TEXTURECUBE; + if( retTex->mProfile->isDynamic() ) { usage = D3D11_USAGE_DYNAMIC; @@ -199,7 +203,7 @@ void GFXD3D11TextureManager::_innerCreateTexture( GFXD3D11TextureObject *retTex, D3D11_TEXTURE2D_DESC desc; ZeroMemory(&desc, sizeof(D3D11_TEXTURE2D_DESC)); - desc.ArraySize = 1; + desc.ArraySize = arraySize * (retTex->isCubeMap() ? 6 : 1); desc.BindFlags = bindFlags; desc.CPUAccessFlags = cpuFlags; desc.Format = d3dTextureFormat; @@ -219,6 +223,7 @@ void GFXD3D11TextureManager::_innerCreateTexture( GFXD3D11TextureObject *retTex, retTex->get2DTex()->GetDesc(&desc); retTex->mMipLevels = desc.MipLevels; + retTex->mArraySize = arraySize; } // start creating the resource views... @@ -267,6 +272,7 @@ GFXTextureObject *GFXD3D11TextureManager::_createTextureObject( U32 height, U32 numMipLevels, bool forceMips, S32 antialiasLevel, + U32 arraySize, GFXTextureObject *inTex ) { GFXD3D11TextureObject *retTex; @@ -278,11 +284,11 @@ GFXTextureObject *GFXD3D11TextureManager::_createTextureObject( U32 height, } else { - retTex = new GFXD3D11TextureObject(GFX, profile); + retTex = new GFXD3D11TextureObject(GFX, profile, arraySize); retTex->registerResourceWithDevice(GFX); } - _innerCreateTexture(retTex, height, width, depth, format, profile, numMipLevels, forceMips, antialiasLevel); + _innerCreateTexture(retTex, height, width, depth, format, profile, numMipLevels, forceMips, antialiasLevel, arraySize); return retTex; } @@ -295,7 +301,9 @@ bool GFXD3D11TextureManager::_loadTexture(GFXTextureObject *aTexture, GBitmap *p // Check with profiler to see if we can do automatic mipmap generation. 
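// Note, not part of the patch: since the texture is now created as an array
// resource (ArraySize = 6 for a cubemap, arraySize * 6 for a cubemap array),
// the upload loop below has to address every face's mip chain separately.
// D3D11 orders subresources as
//    subresource = mipSlice + arraySlice * mipLevels
// which is what D3D11CalcSubresource(mip, face, mMipLevels) computes; e.g.
// mip 1 of face 2 in a 5-mip cubemap is subresource 1 + 2 * 5 = 11.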
const bool supportsAutoMips = GFX->getCardProfiler()->queryProfile("autoMipMapLevel", true); - + + const bool isCube = texture->isCubeMap() && pDL->getNumFaces() > 1; + const U32 numFaces = isCube ? 6 : 1; // Helper bool const bool isCompressedTexFmt = ImageUtil::isCompressedFormat(aTexture->mFormat); @@ -312,98 +320,101 @@ bool GFXD3D11TextureManager::_loadTexture(GFXTextureObject *aTexture, GBitmap *p bool isDynamic = texture->mProfile->isDynamic(); // Fill the texture... - for( U32 i = 0; i < maxDownloadMip; i++ ) + for (U32 face = 0; face < numFaces; ++face) { - U32 subResource = D3D11CalcSubresource(i, 0, aTexture->mMipLevels); + for (U32 i = 0; i < maxDownloadMip; i++) + { + U32 subResource = D3D11CalcSubresource(i, face, aTexture->mMipLevels); - if(!isDynamic) - { - U8* copyBuffer = NULL; + if (!isDynamic) + { + U8* copyBuffer = NULL; - switch(texture->mFormat) - { + switch (texture->mFormat) + { case GFXFormatR8G8B8: case GFXFormatR8G8B8_SRGB: - { - PROFILE_SCOPE(Swizzle24_Upload); + { + PROFILE_SCOPE(Swizzle24_Upload); - U8* Bits = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; - dMemcpy(Bits, pDL->getBits(i), pDL->getWidth(i) * pDL->getHeight(i) * 3); - bitmapConvertRGB_to_RGBX(&Bits, pDL->getWidth(i) * pDL->getHeight(i)); - copyBuffer = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; - - dev->getDeviceSwizzle32()->ToBuffer(copyBuffer, Bits, pDL->getWidth(i) * pDL->getHeight(i) * 4); - dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, copyBuffer, pDL->getWidth() * 4, pDL->getHeight() *4); + U8* Bits = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; + dMemcpy(Bits, pDL->getBits(i, face), pDL->getWidth(i) * pDL->getHeight(i) * 3); + bitmapConvertRGB_to_RGBX(&Bits, pDL->getWidth(i) * pDL->getHeight(i)); + copyBuffer = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; + + dev->getDeviceSwizzle32()->ToBuffer(copyBuffer, Bits, pDL->getWidth(i) * pDL->getHeight(i) * 4); + dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, copyBuffer, pDL->getWidth() * 4, pDL->getHeight() * 4); SAFE_DELETE_ARRAY(Bits); - break; - } - - case GFXFormatR8G8B8A8: - case GFXFormatR8G8B8X8: - case GFXFormatR8G8B8A8_SRGB: - { - PROFILE_SCOPE(Swizzle32_Upload); - copyBuffer = new U8[pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()]; - dev->getDeviceSwizzle32()->ToBuffer(copyBuffer, pDL->getBits(i), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); - dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, copyBuffer, pDL->getWidth() * pDL->getBytesPerPixel(), pDL->getHeight() *pDL->getBytesPerPixel()); - break; - } - - default: - { - // Just copy the bits in no swizzle or padding - PROFILE_SCOPE(SwizzleNull_Upload); - AssertFatal( pDL->getFormat() == texture->mFormat, "Format mismatch"); - dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, pDL->getBits(i), pDL->getWidth() *pDL->getBytesPerPixel(), pDL->getHeight() *pDL->getBytesPerPixel()); - } - } - - SAFE_DELETE_ARRAY(copyBuffer); - } - - else - { - D3D11_MAPPED_SUBRESOURCE mapping; - HRESULT res = dev->getDeviceContext()->Map(texture->get2DTex(), subResource, D3D11_MAP_WRITE, 0, &mapping); - - AssertFatal(res, "tex2d map call failure"); - - switch( texture->mFormat ) - { - case GFXFormatR8G8B8: - case GFXFormatR8G8B8_SRGB: - { - PROFILE_SCOPE(Swizzle24_Upload); - - U8* Bits = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; - dMemcpy(Bits, pDL->getBits(i), pDL->getWidth(i) * pDL->getHeight(i) * 3); - 
bitmapConvertRGB_to_RGBX(&Bits, pDL->getWidth(i) * pDL->getHeight(i)); - - dev->getDeviceSwizzle32()->ToBuffer(mapping.pData, Bits, pDL->getWidth(i) * pDL->getHeight(i) * 4); - SAFE_DELETE_ARRAY(Bits); - } - break; + break; + } case GFXFormatR8G8B8A8: case GFXFormatR8G8B8X8: case GFXFormatR8G8B8A8_SRGB: { PROFILE_SCOPE(Swizzle32_Upload); - dev->getDeviceSwizzle32()->ToBuffer(mapping.pData, pDL->getBits(i), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); + copyBuffer = new U8[pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()]; + dev->getDeviceSwizzle32()->ToBuffer(copyBuffer, pDL->getBits(i, face), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); + dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, copyBuffer, pDL->getWidth() * pDL->getBytesPerPixel(), pDL->getHeight() * pDL->getBytesPerPixel()); + break; } - break; - default: - { + default: + { // Just copy the bits in no swizzle or padding PROFILE_SCOPE(SwizzleNull_Upload); - AssertFatal( pDL->getFormat() == texture->mFormat, "Format mismatch"); - dMemcpy(mapping.pData, pDL->getBits(i), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); - } - } + AssertFatal(pDL->getFormat() == texture->mFormat, "Format mismatch"); + dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subResource, NULL, pDL->getBits(i, face), pDL->getWidth() * pDL->getBytesPerPixel(), pDL->getHeight() * pDL->getBytesPerPixel()); + } + } - dev->getDeviceContext()->Unmap(texture->get2DTex(), subResource); - } + SAFE_DELETE_ARRAY(copyBuffer); + } + + else + { + D3D11_MAPPED_SUBRESOURCE mapping; + HRESULT res = dev->getDeviceContext()->Map(texture->get2DTex(), subResource, D3D11_MAP_WRITE, 0, &mapping); + + AssertFatal(res, "tex2d map call failure"); + + switch (texture->mFormat) + { + case GFXFormatR8G8B8: + case GFXFormatR8G8B8_SRGB: + { + PROFILE_SCOPE(Swizzle24_Upload); + + U8* Bits = new U8[pDL->getWidth(i) * pDL->getHeight(i) * 4]; + dMemcpy(Bits, pDL->getBits(i, face), pDL->getWidth(i) * pDL->getHeight(i) * 3); + bitmapConvertRGB_to_RGBX(&Bits, pDL->getWidth(i) * pDL->getHeight(i)); + + dev->getDeviceSwizzle32()->ToBuffer(mapping.pData, Bits, pDL->getWidth(i) * pDL->getHeight(i) * 4); + SAFE_DELETE_ARRAY(Bits); + } + break; + + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8X8: + case GFXFormatR8G8B8A8_SRGB: + { + PROFILE_SCOPE(Swizzle32_Upload); + dev->getDeviceSwizzle32()->ToBuffer(mapping.pData, pDL->getBits(i, face), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); + } + break; + + default: + { + // Just copy the bits in no swizzle or padding + PROFILE_SCOPE(SwizzleNull_Upload); + AssertFatal(pDL->getFormat() == texture->mFormat, "Format mismatch"); + dMemcpy(mapping.pData, pDL->getBits(i, face), pDL->getWidth(i) * pDL->getHeight(i) * pDL->getBytesPerPixel()); + } + } + + dev->getDeviceContext()->Unmap(texture->get2DTex(), subResource); + } + } } D3D11_TEXTURE2D_DESC desc; @@ -487,7 +498,7 @@ bool GFXD3D11TextureManager::_refreshTexture(GFXTextureObject *texture) if(texture->mProfile->isRenderTarget() || texture->mProfile->isDynamic() || texture->mProfile->isZTarget()) { realTex->release(); - _innerCreateTexture(realTex, texture->getHeight(), texture->getWidth(), texture->getDepth(), texture->mFormat, texture->mProfile, texture->mMipLevels, false, texture->mAntialiasLevel); + _innerCreateTexture(realTex, texture->getHeight(), texture->getWidth(), texture->getDepth(), texture->mFormat, texture->mProfile, texture->mMipLevels, false, 
texture->mAntialiasLevel, texture->mArraySize); usedStrategies++; } @@ -519,14 +530,31 @@ bool GFXD3D11TextureManager::_loadTexture(GFXTextureObject *aTexture, DDSFile *d GFXD3D11TextureObject *texture = static_cast(aTexture); GFXD3D11Device* dev = static_cast(GFX); // Fill the texture... - for( U32 i = 0; i < aTexture->mMipLevels; i++ ) + const bool isCube = texture->isCubeMap() && dds->isCubemap(); + const U32 numFaces = isCube ? 6 : 1; + + // Loop over faces and mips + for (U32 face = 0; face < numFaces; ++face) { - PROFILE_SCOPE(GFXD3DTexMan_loadSurface); + for (U32 mip = 0; mip < aTexture->mMipLevels; ++mip) + { + PROFILE_SCOPE(GFXD3DTexMan_loadSurface); - AssertFatal( dds->mSurfaces.size() > 0, "Assumption failed. DDSFile has no surfaces." ); + // DDSFile must have data for each face + AssertFatal(dds->mSurfaces.size() > face, "DDSFile missing cubemap face data."); + AssertFatal(dds->mSurfaces[face]->mMips.size() > mip, "DDSFile missing mip level."); - U32 subresource = D3D11CalcSubresource(i, 0, aTexture->mMipLevels); - dev->getDeviceContext()->UpdateSubresource(texture->get2DTex(), subresource, 0, dds->mSurfaces[0]->mMips[i], dds->getSurfacePitch(i), 0); + const U32 subresource = D3D11CalcSubresource(mip, face, aTexture->mMipLevels); + + dev->getDeviceContext()->UpdateSubresource( + texture->get2DTex(), // resource + subresource, // subresource index + nullptr, // box (nullptr for full subresource) + dds->mSurfaces[face]->mMips[mip], // source data pointer + dds->getSurfacePitch(mip), // row pitch + 0 // depth pitch + ); + } } D3D11_TEXTURE2D_DESC desc; @@ -541,14 +569,14 @@ bool GFXD3D11TextureManager::_loadTexture(GFXTextureObject *aTexture, DDSFile *d void GFXD3D11TextureManager::createResourceView(U32 height, U32 width, U32 depth, DXGI_FORMAT format, U32 numMipLevels,U32 usageFlags, GFXTextureObject *inTex) { GFXD3D11TextureObject *tex = static_cast(inTex); - ID3D11Resource* resource = NULL; - - if(tex->get2DTex()) - resource = tex->get2DTex(); - else if(tex->getSurface()) - resource = tex->getSurface(); - else - resource = tex->get3DTex(); + ID3D11Resource* resource; + + if (tex->get2DTex()) + resource = tex->get2DTex(); + else if (tex->getSurface()) + resource = tex->getSurface(); + else + resource = tex->get3DTex(); HRESULT hr; //TODO: add MSAA support later. 
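// Note, not part of the patch: the shader-resource-view dimension picked in
// the next hunk has to match the resource type declared on the HLSL side,
// roughly:
//    D3D11_SRV_DIMENSION_TEXTURE2D         <-> Texture2D
//    D3D11_SRV_DIMENSION_TEXTURE2DARRAY    <-> Texture2DArray
//    D3D11_SRV_DIMENSION_TEXTURECUBE       <-> TextureCube
//    D3D11_SRV_DIMENSION_TEXTURECUBEARRAY  <-> TextureCubeArray
// A mismatch (e.g. binding a cube-array SRV where the shader declares a plain
// TextureCube) shows up as a resource-dimension error from the debug layer.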
@@ -567,11 +595,40 @@ void GFXD3D11TextureManager::createResourceView(U32 height, U32 width, U32 depth desc.Texture3D.MipLevels = -1; desc.Texture3D.MostDetailedMip = 0; } - else + else if (tex->isCubeMap()) + { + if (tex->getArraySize() == 1) + { + desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + desc.TextureCube.MipLevels = -1; + desc.TextureCube.MostDetailedMip = 0; + } + else + { + desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBEARRAY; + desc.TextureCubeArray.MostDetailedMip = 0; + desc.TextureCubeArray.MipLevels = -1; + desc.TextureCubeArray.First2DArrayFace = 0; + desc.TextureCubeArray.NumCubes = tex->getArraySize(); + } + + } + else { - desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; - desc.Texture2D.MipLevels = -1; - desc.Texture2D.MostDetailedMip = 0; + if (tex->getArraySize() == 1) + { + desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + desc.Texture2D.MipLevels = -1; + desc.Texture2D.MostDetailedMip = 0; + } + else + { + desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY; + desc.Texture2DArray.MipLevels = -1; + desc.Texture2DArray.MostDetailedMip = 0; + desc.Texture2DArray.FirstArraySlice = 0; + desc.Texture2DArray.ArraySize = tex->getArraySize(); + } } hr = D3D11DEVICE->CreateShaderResourceView(resource,&desc, tex->getSRViewPtr()); @@ -580,12 +637,29 @@ void GFXD3D11TextureManager::createResourceView(U32 height, U32 width, U32 depth if(usageFlags & D3D11_BIND_RENDER_TARGET) { - D3D11_RENDER_TARGET_VIEW_DESC desc; - desc.Format = format; - desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; - desc.Texture2D.MipSlice = 0; - hr = D3D11DEVICE->CreateRenderTargetView(resource, &desc, tex->getRTViewPtr()); - AssertFatal(SUCCEEDED(hr), "CreateRenderTargetView:: failed to create view!"); + if (tex->isCubeMap()) + { + for (U32 face = 0; face < 6; face++) + { + D3D11_RENDER_TARGET_VIEW_DESC desc; + desc.Format = format; + desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2DARRAY; + desc.Texture2DArray.ArraySize = 1; + desc.Texture2DArray.FirstArraySlice = face; + desc.Texture2DArray.MipSlice = 0; + hr = D3D11DEVICE->CreateRenderTargetView(resource, &desc, tex->getCubeFaceRTViewPtr(face)); + AssertFatal(SUCCEEDED(hr), "CreateRenderTargetView:: failed to create view!"); + } + } + else + { + D3D11_RENDER_TARGET_VIEW_DESC desc; + desc.Format = format; + desc.ViewDimension = D3D11_RTV_DIMENSION_TEXTURE2D; + desc.Texture2D.MipSlice = 0; + hr = D3D11DEVICE->CreateRenderTargetView(resource, &desc, tex->getRTViewPtr()); + AssertFatal(SUCCEEDED(hr), "CreateRenderTargetView:: failed to create view!"); + } } if(usageFlags & D3D11_BIND_DEPTH_STENCIL) diff --git a/Engine/source/gfx/D3D11/gfxD3D11TextureManager.h b/Engine/source/gfx/D3D11/gfxD3D11TextureManager.h index b035bd6b7..f9d1a4f61 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11TextureManager.h +++ b/Engine/source/gfx/D3D11/gfxD3D11TextureManager.h @@ -45,6 +45,7 @@ protected: U32 numMipLevels, bool forceMips = false, S32 antialiasLevel = 0, + U32 arraySize = 1, GFXTextureObject *inTex = NULL ) override; bool _loadTexture(GFXTextureObject *texture, DDSFile *dds) override; @@ -56,7 +57,7 @@ protected: private: U32 mCurTexSet[GFX_TEXTURE_STAGE_COUNT]; - void _innerCreateTexture(GFXD3D11TextureObject *obj, U32 height, U32 width, U32 depth, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false, S32 antialiasLevel = 0); + void _innerCreateTexture(GFXD3D11TextureObject *obj, U32 height, U32 width, U32 depth, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false, S32 
antialiasLevel = 0, U32 arraySize = 1); }; #endif diff --git a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.cpp b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.cpp index 24af3e0ae..2694aa85b 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.cpp +++ b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.cpp @@ -33,26 +33,16 @@ U32 GFXD3D11TextureObject::mTexCount = 0; // GFXFormatR8G8B8 has now the same behaviour as GFXFormatR8G8B8X8. // This is because 24 bit format are now deprecated by microsoft, for data alignment reason there's no changes beetween 24 and 32 bit formats. // DirectX 10-11 both have 24 bit format no longer. - - -GFXD3D11TextureObject::GFXD3D11TextureObject( GFXDevice * d, GFXTextureProfile *profile) : GFXTextureObject( d, profile ) +GFXD3D11TextureObject::GFXD3D11TextureObject( GFXDevice * d, GFXTextureProfile *profile, const U32 arraySize) : GFXTextureObject( d, profile ) { #ifdef D3D11_DEBUG_SPEW mTexCount++; Con::printf("+ texMake %d %x", mTexCount, this); #endif - - mD3DTexture = NULL; - mLocked = false; - - mD3DSurface = NULL; + isManaged = false; dMemset(&mLockRect, 0, sizeof(mLockRect)); dMemset(&mLockBox, 0, sizeof(mLockBox)); - mLockedSubresource = 0; - mDSView = NULL; - mRTView = NULL; - mSRView = NULL; - isManaged = false; + mArraySize = arraySize; } GFXD3D11TextureObject::~GFXD3D11TextureObject() @@ -64,53 +54,76 @@ GFXD3D11TextureObject::~GFXD3D11TextureObject() #endif } -GFXLockedRect *GFXD3D11TextureObject::lock(U32 mipLevel /*= 0*/, RectI *inRect /*= NULL*/) +ID3D11Texture2D* GFXD3D11TextureObject::get2DTex() const { - AssertFatal( !mLocked, "GFXD3D11TextureObject::lock - The texture is already locked!" ); + ComPtr tex2D; + if (mD3DTexture) mD3DTexture.As(&tex2D); + return tex2D.Get(); +} - if( !mStagingTex || +ID3D11Texture3D* GFXD3D11TextureObject::get3DTex() const +{ + ComPtr tex3D; + if (mD3DTexture) mD3DTexture.As(&tex3D); + return tex3D.Get(); +} + +ID3D11Texture2D** GFXD3D11TextureObject::get2DTexPtr() +{ + return reinterpret_cast(mD3DTexture.GetAddressOf()); +} + +ID3D11Texture3D** GFXD3D11TextureObject::get3DTexPtr() +{ + return reinterpret_cast(mD3DTexture.GetAddressOf()); +} + +ID3D11RenderTargetView** GFXD3D11TextureObject::getCubeFaceRTViewPtr(U32 face) +{ + AssertFatal(isCubeMap(), "Not a cubemap texture!"); + AssertFatal(face < 6, "Invalid cubemap face index!"); + return mCubeRTV[face].GetAddressOf(); +} + +GFXLockedRect* GFXD3D11TextureObject::lock(U32 mipLevel /*= 0*/, RectI* inRect /*= NULL*/, U32 faceIndex /*= 0*/) +{ + AssertFatal(!mLocked, "GFXD3D11TextureObject::lock - Texture is already locked!"); + AssertFatal(faceIndex < 6 || !isCubeMap(), "Invalid cubemap face index!"); + + // Ensure staging texture exists and matches size + if (!mStagingTex.isValid() || mStagingTex->getWidth() != getWidth() || mStagingTex->getHeight() != getHeight() || mStagingTex->getDepth() != getDepth()) { - if (getDepth() != 0) - { - mStagingTex.set(getWidth(), getHeight(), getDepth(), mFormat, &GFXSystemMemTextureProfile, avar("%s() - mLockTex (line %d)", __FUNCTION__, __LINE__, 0)); - } - else - { - mStagingTex.set(getWidth(), getHeight(), mFormat, &GFXSystemMemTextureProfile, avar("%s() - mLockTex (line %d)", __FUNCTION__, __LINE__)); - } + mStagingTex.set(getWidth(), getHeight(), mFormat, &GFXSystemMemTextureProfile, + avar("%s() - stagingTex", __FUNCTION__)); } - ID3D11DeviceContext* pContext = D3D11DEVICECONTEXT; D3D11_MAPPED_SUBRESOURCE mapInfo; - U32 offset = 0; - mLockedSubresource = D3D11CalcSubresource(mipLevel, 0, getMipLevels()); - 
GFXD3D11TextureObject* pD3DStagingTex = (GFXD3D11TextureObject*)&(*mStagingTex); - //map staging texture - HRESULT hr = pContext->Map(pD3DStagingTex->getResource(), mLockedSubresource, D3D11_MAP_WRITE, 0, &mapInfo); + mLockedSubresource = D3D11CalcSubresource(mipLevel, faceIndex, getMipLevels()); + GFXD3D11TextureObject* staging = (GFXD3D11TextureObject*)&(*mStagingTex); + HRESULT hr = D3D11DEVICECONTEXT->Map(staging->getResource(), mLockedSubresource, D3D11_MAP_WRITE, 0, &mapInfo); if (FAILED(hr)) AssertFatal(false, "GFXD3D11TextureObject:lock - failed to map render target resource!"); - const bool is3D = mStagingTex->getDepth() != 0; const U32 width = mTextureSize.x >> mipLevel; const U32 height = mTextureSize.y >> mipLevel; const U32 depth = is3D ? mTextureSize.z >> mipLevel : 1; + U32 offset = 0; - //calculate locked box region and offset if (inRect) { - if ((inRect->point.x + inRect->extent.x > width) || (inRect->point.y + inRect->extent.y > height)) - AssertFatal(false, "GFXD3D11TextureObject::lock - Rectangle too big!"); + AssertFatal(inRect->point.x + inRect->extent.x <= width, "GFXD3D11TextureObject::lock - Invalid lock rect width!"); + AssertFatal(inRect->point.y + inRect->extent.y <= height, "GFXD3D11TextureObject::lock - Invalid lock rect height!"); mLockBox.top = inRect->point.y; mLockBox.left = inRect->point.x; - mLockBox.bottom = inRect->point.y + inRect->extent.y; mLockBox.right = inRect->point.x + inRect->extent.x; + mLockBox.bottom = inRect->point.y + inRect->extent.y; mLockBox.back = depth; mLockBox.front = 0; @@ -121,49 +134,57 @@ GFXLockedRect *GFXD3D11TextureObject::lock(U32 mipLevel /*= 0*/, RectI *inRect / { mLockBox.top = 0; mLockBox.left = 0; - mLockBox.bottom = height; mLockBox.right = width; + mLockBox.bottom = height; mLockBox.back = depth; mLockBox.front = 0; + } mLocked = true; mLockRect.pBits = static_cast(mapInfo.pData) + offset; mLockRect.Pitch = mapInfo.RowPitch; - return (GFXLockedRect*)&mLockRect; + return reinterpret_cast(&mLockRect); } -void GFXD3D11TextureObject::unlock(U32 mipLevel) +void GFXD3D11TextureObject::unlock(U32 mipLevel /*= 0*/, U32 faceIndex /*= 0*/) { - AssertFatal( mLocked, "GFXD3D11TextureObject::unlock - Attempting to unlock a surface that has not been locked" ); + AssertFatal(mLocked, "GFXD3D11TextureObject::unlock - Texture is not locked!"); + AssertFatal(faceIndex < 6 || !isCubeMap(), "Invalid cubemap face index!"); - //profile in the unlock function because all the heavy lifting is done here - PROFILE_START(GFXD3D11TextureObject_lockRT); + PROFILE_START(GFXD3D11TextureObject_unlock); - ID3D11DeviceContext* pContext = D3D11DEVICECONTEXT; - GFXD3D11TextureObject* pD3DStagingTex = (GFXD3D11TextureObject*)&(*mStagingTex); - ID3D11Resource* pStagingResource = pD3DStagingTex->getResource(); - const bool is3D = mStagingTex->getDepth() != 0; + GFXD3D11TextureObject* staging = (GFXD3D11TextureObject*)&(*mStagingTex); - //unmap staging texture - pContext->Unmap(pStagingResource, mLockedSubresource); - //copy lock box region from the staging texture to our regular texture - pContext->CopySubresourceRegion(mD3DTexture, mLockedSubresource, mLockBox.left, mLockBox.top, is3D ? 
mLockBox.back : 0, pStagingResource, mLockedSubresource, &mLockBox); + D3D11DEVICECONTEXT->Unmap(staging->getResource(), mLockedSubresource); - PROFILE_END(); + // Copy from staging back to GPU texture + D3D11DEVICECONTEXT->CopySubresourceRegion( + mD3DTexture.Get(), + mLockedSubresource, + 0, 0, 0, + staging->getResource(), + mLockedSubresource, + &mLockBox + ); mLockedSubresource = 0; mLocked = false; + + PROFILE_END(); } void GFXD3D11TextureObject::release() { - SAFE_RELEASE(mSRView); - SAFE_RELEASE(mRTView); - SAFE_RELEASE(mDSView); - SAFE_RELEASE(mD3DTexture); - SAFE_RELEASE(mD3DSurface); + mSRView.Reset(); + mRTView.Reset(); + mDSView.Reset(); + mD3DTexture.Reset(); + mD3DSurface.Reset(); + + for (auto& faceRTV : mCubeRTV) + faceRTV.Reset(); } void GFXD3D11TextureObject::zombify() @@ -189,149 +210,206 @@ bool GFXD3D11TextureObject::copyToBmp(GBitmap* bmp) if (!bmp) return false; - // check format limitations - // at the moment we only support RGBA for the source (other 4 byte formats should - // be easy to add though) - AssertFatal(mFormat == GFXFormatR16G16B16A16F || mFormat == GFXFormatR8G8B8A8 || mFormat == GFXFormatR8G8B8A8_LINEAR_FORCE || mFormat == GFXFormatR8G8B8A8_SRGB || mFormat == GFXFormatR8G8B8, "copyToBmp: invalid format"); - if (mFormat != GFXFormatR16G16B16A16F && mFormat != GFXFormatR8G8B8A8 && mFormat != GFXFormatR8G8B8A8_LINEAR_FORCE && mFormat != GFXFormatR8G8B8A8_SRGB && mFormat != GFXFormatR8G8B8) - return false; + AssertFatal(mFormat == GFXFormatR16G16B16A16F || mFormat == GFXFormatR8G8B8A8 || + mFormat == GFXFormatR8G8B8A8_LINEAR_FORCE || mFormat == GFXFormatR8G8B8A8_SRGB || + mFormat == GFXFormatR8G8B8, + "GFXD3D11TextureObject::copyToBmp - Unsupported source format."); PROFILE_START(GFXD3D11TextureObject_copyToBmp); - AssertFatal(bmp->getWidth() == getWidth(), avar("GFXGLTextureObject::copyToBmp - Width mismatch: %i vs %i", bmp->getWidth(), getWidth())); - AssertFatal(bmp->getHeight() == getHeight(), avar("GFXGLTextureObject::copyToBmp - Height mismatch: %i vs %i", bmp->getHeight(), getHeight())); - const U32 mipLevels = getMipLevels(); + AssertFatal(bmp->getWidth() == getWidth(), "Width mismatch between texture and bitmap."); + AssertFatal(bmp->getHeight() == getHeight(), "Height mismatch between texture and bitmap."); + const U32 mipLevels = getMipLevels(); bmp->setHasTransparency(mHasTransparency); - // set some constants - U32 sourceBytesPerPixel = 4; - U32 destBytesPerPixel = 0; + // Figure out bytes per pixel + const bool isFP16 = (bmp->getFormat() == GFXFormatR16G16B16A16F); + const U32 destBpp = (bmp->getFormat() == GFXFormatR8G8B8 ? 3 : + bmp->getFormat() == GFXFormatR16G16B16A16F ? 8 : 4); + const U32 srcBpp = (mFormat == GFXFormatR16G16B16A16F ? 8 : 4); - const GFXFormat fmt = bmp->getFormat(); - bool fp16 = false;//is rgba16f format? 
- if (fmt == GFXFormatR16G16B16A16F) - { - destBytesPerPixel = 8; - sourceBytesPerPixel = 8; - fp16 = true; - } - else if (fmt == GFXFormatR8G8B8A8 || fmt == GFXFormatR8G8B8A8_LINEAR_FORCE || fmt == GFXFormatR8G8B8A8_SRGB) - destBytesPerPixel = 4; - else if(bmp->getFormat() == GFXFormatR8G8B8) - destBytesPerPixel = 3; - else - // unsupported - AssertFatal(false, "GFXD3D11TextureObject::copyToBmp - unsupported bitmap format"); - - //create temp staging texture - D3D11_TEXTURE2D_DESC desc; - static_cast(mD3DTexture)->GetDesc(&desc); + // --- Create staging texture --- + D3D11_TEXTURE2D_DESC desc = {}; + reinterpret_cast(mD3DTexture.Get())->GetDesc(&desc); desc.BindFlags = 0; - desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; desc.Usage = D3D11_USAGE_STAGING; desc.MiscFlags = 0; - ID3D11Texture2D* pStagingTexture = NULL; - HRESULT hr = D3D11DEVICE->CreateTexture2D(&desc, NULL, &pStagingTexture); + ComPtr stagingTex; + HRESULT hr = D3D11DEVICE->CreateTexture2D(&desc, nullptr, stagingTex.GetAddressOf()); if (FAILED(hr)) { - Con::errorf("GFXD3D11TextureObject::copyToBmp - Failed to create staging texture"); + Con::errorf("GFXD3D11TextureObject::copyToBmp - Failed to create staging texture (0x%X)", hr); return false; } - //copy the classes texture to the staging texture - D3D11DEVICECONTEXT->CopyResource(pStagingTexture, mD3DTexture); + // --- Copy texture (handle cubemap or 2D) --- + const U32 faceCount = isCubeMap() && bmp->getNumFaces() == 6 ? 6 : 1; - for (U32 mip = 0; mip < mipLevels; mip++) + for (U32 face = 0; face < faceCount; ++face) { - const U32 width = bmp->getWidth(mip); - const U32 height = bmp->getHeight(mip); - //map the staging resource - D3D11_MAPPED_SUBRESOURCE mappedRes; - const U32 subResource = D3D11CalcSubresource(mip, 0, mipLevels); - hr = D3D11DEVICECONTEXT->Map(pStagingTexture, subResource, D3D11_MAP_READ, 0, &mappedRes); - if (FAILED(hr)) + for (U32 mip = 0; mip < mipLevels; ++mip) { - //cleanup - SAFE_RELEASE(pStagingTexture); - Con::errorf("GFXD3D11TextureObject::copyToBmp - Failed to map staging texture"); - return false; - } + const U32 srcSubRes = D3D11CalcSubresource(mip, face, mipLevels); + // Always map mip-level 0..mipLevels-1 on *slice 0* of the staging texture + const U32 dstSubRes = D3D11CalcSubresource(mip, face, mipLevels); - // set pointers - const U8* srcPtr = (U8*)mappedRes.pData; - U8* destPtr = bmp->getWritableBits(mip); + D3D11DEVICECONTEXT->CopySubresourceRegion( + stagingTex.Get(), dstSubRes, 0, 0, 0, + mD3DTexture.Get(), srcSubRes, nullptr); - // we will want to skip over any D3D cache data in the source texture - const S32 sourceCacheSize = mappedRes.RowPitch - width * sourceBytesPerPixel; - AssertFatal(sourceCacheSize >= 0, "GFXD3D11TextureObject::copyToBmp - cache size is less than zero?"); - - // copy data into bitmap - for (U32 row = 0; row < height; ++row) - { - for (U32 col = 0; col < width; ++col) + D3D11_MAPPED_SUBRESOURCE mapped = {}; + hr = D3D11DEVICECONTEXT->Map(stagingTex.Get(), dstSubRes, D3D11_MAP_READ, 0, &mapped); + if (FAILED(hr)) { - //we can just copy data straight in with RGBA16F format - if (fp16) - { - dMemcpy(destPtr, srcPtr, sizeof(U16) * 4); - } - else - { - destPtr[0] = srcPtr[2]; // red - destPtr[1] = srcPtr[1]; // green - destPtr[2] = srcPtr[0]; // blue - if (destBytesPerPixel == 4) - destPtr[3] = srcPtr[3]; // alpha - } - - // go to next pixel in src - srcPtr += sourceBytesPerPixel; - - // go to next pixel in dest - destPtr += destBytesPerPixel; + 
Con::errorf("GFXD3D11TextureObject::copyToBmp - Failed to map staging texture (0x%X)", hr); + return false; } - // skip past the cache data for this row (if any) - srcPtr += sourceCacheSize; + + const U8* src = static_cast(mapped.pData); + U8* dst = bmp->getWritableBits(mip, face); + + const U32 width = bmp->getWidth(mip); + const U32 height = bmp->getHeight(mip); + + for (U32 y = 0; y < height; ++y) + { + const U8* srcRow = src; + U8* dstRow = dst; + + for (U32 x = 0; x < width; ++x) + { + if (isFP16) + { + dMemcpy(dstRow, srcRow, sizeof(U16) * 4); + } + else + { + // Convert BGRA → RGB(A) + dstRow[0] = srcRow[2]; + dstRow[1] = srcRow[1]; + dstRow[2] = srcRow[0]; + if (destBpp == 4) + dstRow[3] = srcRow[3]; + } + srcRow += srcBpp; + dstRow += destBpp; + } + + src += mapped.RowPitch; + dst += width * destBpp; + } + + D3D11DEVICECONTEXT->Unmap(stagingTex.Get(), dstSubRes); } - - // assert if we stomped or underran memory - AssertFatal(U32(destPtr - bmp->getWritableBits(mip)) == width * height * destBytesPerPixel, "GFXD3D11TextureObject::copyToBmp - memory error"); - AssertFatal(U32(srcPtr - (U8*)mappedRes.pData) == height * mappedRes.RowPitch, "GFXD3D11TextureObject::copyToBmp - memory error"); - - D3D11DEVICECONTEXT->Unmap(pStagingTexture, subResource); } - SAFE_RELEASE(pStagingTexture); PROFILE_END(); - return true; } -ID3D11ShaderResourceView* GFXD3D11TextureObject::getSRView() +void GFXD3D11TextureObject::generateMipMaps() { - return mSRView; -} -ID3D11RenderTargetView* GFXD3D11TextureObject::getRTView() -{ - return mRTView; -} -ID3D11DepthStencilView* GFXD3D11TextureObject::getDSView() -{ - return mDSView; + //Generate mips + D3D11DEVICECONTEXT->GenerateMips(mSRView.Get()); + //get mip level count + D3D11_SHADER_RESOURCE_VIEW_DESC viewDesc; + mSRView->GetDesc(&viewDesc); + mMipLevels = viewDesc.TextureCube.MipLevels; } -ID3D11ShaderResourceView** GFXD3D11TextureObject::getSRViewPtr() +void GFXD3D11TextureObject::updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 faceIdx /*=-1*/) { - return &mSRView; -} -ID3D11RenderTargetView** GFXD3D11TextureObject::getRTViewPtr() -{ - return &mRTView; + AssertFatal(slot < getArraySize(), "updateTextureSlot - destination slot out of bounds"); + AssertFatal(mFormat == texHandle->getFormat(), "updateTextureSlot - format mismatch"); + AssertFatal(getMipLevels() == texHandle->getMipLevels(), "updateTextureSlot - mip level mismatch"); + + GFXD3D11TextureObject* srcTex = static_cast(texHandle.getPointer()); + + ID3D11Resource* dstRes = get2DTex(); + ID3D11Resource* srcRes = srcTex->get2DTex(); + + const UINT mipLevels = getMipLevels(); + + const bool dstIsCube = isCubeMap(); + const bool srcIsCube = srcTex->isCubeMap(); + + const UINT dstArraySize = getArraySize(); + const UINT srcArraySize = srcTex->getArraySize(); + + // Determine number of faces to copy + const UINT faceCount = srcIsCube ? 6 : 1; + const UINT startFace = (faceIdx >= 0) ? faceIdx : 0; + const UINT endFace = (faceIdx >= 0) ? faceIdx + 1 : faceCount; + + for (UINT face = startFace; face < endFace; ++face) + { + // Compute source slice + const UINT srcSlice = srcIsCube + ? (srcArraySize > 1 ? face + slot * 6 : face) // only add slot*6 if it's a cubemap array + : (srcArraySize > 1 ? face + slot : 0); // otherwise, single 2D texture or 2D array + + const UINT dstSlice = dstIsCube + ? (dstArraySize > 1 ? face + slot * 6 : face) // only add slot*6 if it's a cubemap array + : (dstArraySize > 1 ? 
face + slot : 0); // otherwise, single 2D texture or 2D array + + for (UINT mip = 0; mip < mipLevels; ++mip) + { + const UINT srcSubresource = D3D11CalcSubresource(mip, srcSlice, mipLevels); + const UINT dstSubresource = D3D11CalcSubresource(mip, dstSlice, mipLevels); + + D3D11DEVICECONTEXT->CopySubresourceRegion(dstRes, dstSubresource, 0, 0, 0, srcRes, srcSubresource, nullptr); + } + } } -ID3D11DepthStencilView** GFXD3D11TextureObject::getDSViewPtr() +void GFXD3D11TextureObject::copyTo(GFXTextureObject* dstTex) { - return &mDSView; + AssertFatal(dstTex, "GFXD3D11TextureObject::copyTo - destination is null"); + + GFXD3D11TextureObject* pDstTex = static_cast(dstTex); + + ID3D11Texture2D* srcTex = (ID3D11Texture2D*)mD3DTexture.Get(); + ID3D11Texture2D* dstTex2D = pDstTex->get2DTex(); + + D3D11_TEXTURE2D_DESC srcDesc, dstDesc; + srcTex->GetDesc(&srcDesc); + dstTex2D->GetDesc(&dstDesc); + + // Sanity check – sizes and formats must match for a full copy. + AssertFatal(srcDesc.Width == dstDesc.Width && srcDesc.Height == dstDesc.Height, + "GFXD3D11TextureObject::copyTo - Mismatched texture dimensions"); + AssertFatal(srcDesc.Format == dstDesc.Format, + "GFXD3D11TextureObject::copyTo - Mismatched formats"); + + UINT srcMipLevels = srcDesc.MipLevels ? srcDesc.MipLevels : 1; + UINT dstMipLevels = dstDesc.MipLevels ? dstDesc.MipLevels : 1; + UINT mipLevels = getMin(srcMipLevels, dstMipLevels); + + UINT srcArraySize = srcDesc.ArraySize; + UINT dstArraySize = dstDesc.ArraySize; + UINT arraySize = getMin(srcArraySize, dstArraySize); + + // Handle cube maps and cube map arrays + bool isCubeSrc = (srcDesc.MiscFlags & D3D11_RESOURCE_MISC_TEXTURECUBE) != 0; + + // In cubemaps, ArraySize is always 6 * numCubes + if (isCubeSrc) arraySize = srcArraySize; // 6 or 6*nCubes + + for (UINT arraySlice = 0; arraySlice < arraySize; ++arraySlice) + { + for (UINT mip = 0; mip < mipLevels; ++mip) + { + UINT srcSubresource = D3D11CalcSubresource(mip, arraySlice, srcMipLevels); + UINT dstSubresource = D3D11CalcSubresource(mip, arraySlice, dstMipLevels); + + D3D11DEVICECONTEXT->CopySubresourceRegion( + dstTex2D, dstSubresource, + 0, 0, 0, + srcTex, srcSubresource, + nullptr); + } + } } diff --git a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h index a99cb7e58..6849beba6 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h +++ b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h @@ -27,55 +27,77 @@ #include "gfx/gfxTextureHandle.h" #include "gfx/gfxTextureManager.h" +#include +using Microsoft::WRL::ComPtr; + class GFXD3D11TextureObject : public GFXTextureObject { protected: static U32 mTexCount; + GFXTexHandle mStagingTex; DXGI_MAPPED_RECT mLockRect; D3D11_BOX mLockBox; - bool mLocked; + bool mLocked = false; - U32 mLockedSubresource; - ID3D11Resource *mD3DTexture; + U32 mLockedSubresource = 0; - // used for z buffers... 
diff --git a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h index a99cb7e58..6849beba6 100644 --- a/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h +++ b/Engine/source/gfx/D3D11/gfxD3D11TextureObject.h @@ -27,55 +27,77 @@ #include "gfx/gfxTextureHandle.h" #include "gfx/gfxTextureManager.h" +#include <wrl/client.h> +using Microsoft::WRL::ComPtr; + class GFXD3D11TextureObject : public GFXTextureObject { protected: static U32 mTexCount; + GFXTexHandle mStagingTex; DXGI_MAPPED_RECT mLockRect; D3D11_BOX mLockBox; - bool mLocked; + bool mLocked = false; - U32 mLockedSubresource; - ID3D11Resource *mD3DTexture; + U32 mLockedSubresource = 0; - // used for z buffers... - ID3D11Texture2D *mD3DSurface; + // Main GPU texture resource (2D / 3D / Cubemap) + ComPtr<ID3D11Resource> mD3DTexture; - ID3D11ShaderResourceView* mSRView; // for shader resource input - ID3D11RenderTargetView* mRTView; // for render targets - ID3D11DepthStencilView* mDSView; //render target view for depth stencil + // Used for Z-targets + ComPtr<ID3D11Texture2D> mD3DSurface; + // Views + ComPtr<ID3D11ShaderResourceView> mSRView; // Shader resource + ComPtr<ID3D11RenderTargetView> mRTView; // Render target + ComPtr<ID3D11DepthStencilView> mDSView; // Depth stencil + + // Cubemap face render target views (optional) + ComPtr<ID3D11RenderTargetView> mCubeRTV[6]; public: - GFXD3D11TextureObject( GFXDevice * d, GFXTextureProfile *profile); + GFXD3D11TextureObject( GFXDevice * d, GFXTextureProfile *profile, const U32 arraySize = 1); ~GFXD3D11TextureObject(); - ID3D11Resource* getResource(){ return mD3DTexture; } - ID3D11Texture2D* get2DTex(){ return (ID3D11Texture2D*) mD3DTexture; } - ID3D11Texture2D** get2DTexPtr(){ return (ID3D11Texture2D**) &mD3DTexture; } - ID3D11Texture3D* get3DTex(){ return (ID3D11Texture3D*) mD3DTexture; } - ID3D11Texture3D** get3DTexPtr(){ return (ID3D11Texture3D**) &mD3DTexture; } - - ID3D11ShaderResourceView* getSRView(); - ID3D11RenderTargetView* getRTView(); - ID3D11DepthStencilView* getDSView(); + // Accessors + ID3D11Resource* getResource() const { return mD3DTexture.Get(); } + ID3D11Texture2D* get2DTex() const; + ID3D11Texture3D* get3DTex() const; + ID3D11Texture2D** get2DTexPtr(); + ID3D11Texture3D** get3DTexPtr(); - ID3D11ShaderResourceView** getSRViewPtr(); - ID3D11RenderTargetView** getRTViewPtr(); - ID3D11DepthStencilView** getDSViewPtr(); - + ID3D11ShaderResourceView* getSRView() const { return mSRView.Get(); } + ID3D11RenderTargetView* getRTView() const { return mRTView.Get(); } + ID3D11DepthStencilView* getDSView() const { return mDSView.Get(); } + + ID3D11ShaderResourceView** getSRViewPtr() { return mSRView.GetAddressOf(); } + ID3D11RenderTargetView** getRTViewPtr() { return mRTView.GetAddressOf(); } + ID3D11DepthStencilView** getDSViewPtr() { return mDSView.GetAddressOf(); } + + // Cubemap face RTV access (for render-to-cubemap) + ID3D11RenderTargetView* getCubeFaceRTView(U32 face) const + { + AssertFatal(isCubeMap(), "Not a cubemap texture!"); + AssertFatal(face < 6, "Invalid cubemap face index!"); + return mCubeRTV[face].Get(); + } + + ID3D11RenderTargetView** getCubeFaceRTViewPtr(U32 face); void release(); bool isManaged; //setting to true tells this texture not to be released from being zombify - GFXLockedRect * lock(U32 mipLevel = 0, RectI *inRect = NULL) override; - void unlock(U32 mipLevel = 0 ) override; + GFXLockedRect* lock(U32 mipLevel = 0, RectI* inRect = NULL, U32 faceIndex = 0) override; + void unlock(U32 mipLevel = 0, U32 faceIndex = 0) override; bool copyToBmp(GBitmap* bmp) override; - ID3D11Texture2D* getSurface() {return mD3DSurface;} - ID3D11Texture2D** getSurfacePtr() {return &mD3DSurface;} + void generateMipMaps() override; + void updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 face = -1) override; + void copyTo(GFXTextureObject* dstTex) override; + ID3D11Texture2D* getSurface() {return mD3DSurface.Get();} + ID3D11Texture2D** getSurfacePtr() {return mD3DSurface.GetAddressOf();} // GFXResource void zombify() override; diff --git a/Engine/source/gfx/Null/gfxNullDevice.cpp b/Engine/source/gfx/Null/gfxNullDevice.cpp index fc0a75259..c98ffeeac 100644 --- a/Engine/source/gfx/Null/gfxNullDevice.cpp +++ b/Engine/source/gfx/Null/gfxNullDevice.cpp @@ -72,10 +72,12 @@ public: void pureVirtualCrash() override {} #endif - GFXLockedRect * lock(
U32 mipLevel = 0, RectI *inRect = NULL ) override { return NULL; }; - void unlock( U32 mipLevel = 0) override {}; + GFXLockedRect * lock( U32 mipLevel = 0, RectI *inRect = NULL, U32 faceIndex = 0)override { return NULL; }; + void unlock( U32 mipLevel = 0, U32 faceIndex = 0)override {}; bool copyToBmp(GBitmap *) override { return false; }; - + void updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 face = -1) override {}; + void copyTo(GFXTextureObject* dstTex) override {}; + void generateMipMaps() override {}; void zombify() override {} void resurrect() override {} }; @@ -94,8 +96,8 @@ public: GFXTextureObject* createTexture(DDSFile* dds, GFXTextureProfile* profile, bool deleteDDS) override { return nullptr; } GFXTextureObject* createTexture(const Torque::Path& path, GFXTextureProfile* profile) override { return nullptr; } GFXTextureObject* createTexture(U32 width, U32 height, void* pixels, GFXFormat format, GFXTextureProfile* profile) override { return nullptr; } - GFXTextureObject* createTexture(U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, U32 numMipLevels = 1) override { return nullptr; } - GFXTextureObject* createTexture(U32 width, U32 height, GFXFormat format, GFXTextureProfile* profile, U32 numMipLevels, S32 antialiasLevel) override { return nullptr; } + GFXTextureObject* createTexture(U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, U32 numMipLevels = 1, U32 arraySize = 1) override { return nullptr; } + GFXTextureObject* createTexture(U32 width, U32 height, GFXFormat format, GFXTextureProfile* profile, U32 numMipLevels, S32 antialiasLevel, U32 arraySize = 1) override { return nullptr; } GFXTextureObject* createCompositeTexture(GBitmap* bmp[4], U32 inputKey[4], const String& resourceName, GFXTextureProfile* profile, bool deleteBmp) override { return nullptr; } protected: GFXTextureObject *_createTextureObject( U32 height, @@ -105,7 +107,8 @@ protected: GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false, - S32 antialiasLevel = 0, + S32 antialiasLevel = 0, + U32 arraySize = 1, GFXTextureObject *inTex = NULL ) override { GFXNullTextureObject *retTex; diff --git a/Engine/source/gfx/bitmap/bitmapUtils.cpp b/Engine/source/gfx/bitmap/bitmapUtils.cpp index 3dd638852..903fde81e 100644 --- a/Engine/source/gfx/bitmap/bitmapUtils.cpp +++ b/Engine/source/gfx/bitmap/bitmapUtils.cpp @@ -24,6 +24,11 @@ #include "platform/platform.h" +#ifndef STB_IMAGE_RESIZE2_IMPLEMENTATION +#define STB_IMAGE_RESIZE2_IMPLEMENTATION +#define STBIR_PROFILE +#include "gfx/bitmap/loaders/stb/stb_image_resize2.h" +#endif // !STB_IMAGE_RESIZE2_IMPLEMENTATION void bitmapExtrude5551_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) { @@ -67,7 +72,7 @@ void bitmapExtrude5551_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWi { U32 a = src[0]; U32 c = src[stride]; -#if defined(TORQUE_OS_MAC) +#if defined(TORQUE_BIG_ENDIAN) dst[y] = ((( (a >> 10) + (c >> 10)) >> 1) << 10) | ((( ((a >> 5) & 0x1F) + ((c >> 5) & 0x1f)) >> 1) << 5) | ((( ((a >> 0) & 0x1F) + ((c >> 0) & 0x1f)) >> 1) << 0); @@ -81,151 +86,434 @@ void bitmapExtrude5551_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWi } } - //-------------------------------------------------------------------------- -void bitmapExtrudeRGB_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) + +template +void bitmapExtrudeGeneric( + const T* src, T* dst, + U32 srcWidth, U32 srcHeight, + U32 channels, U32 bpp) { - const U8 *src = (const U8 *) 
srcMip; - U8 *dst = (U8 *) mip; - U32 stride = srcHeight != 1 ? (srcWidth) * 3 : 0; + U32 srcRowStride = srcHeight != 1 ? (srcWidth * bpp) / sizeof(T) : 0; + U32 dstWidth = srcWidth > 1 ? srcWidth / 2 : 1; + U32 dstHeight = srcHeight > 1 ? srcHeight / 2 : 1; + U32 dstRowStride = dstHeight != 1 ? (dstWidth * bpp) / sizeof(T) : 0; - U32 width = srcWidth >> 1; - U32 height = srcHeight >> 1; - if (width == 0) width = 1; - if (height == 0) height = 1; - - if (srcWidth != 1) + for (U32 y = 0; y < dstHeight; ++y) { - for(U32 y = 0; y < height; y++) + for (U32 x = 0; x < dstWidth; ++x) { - for(U32 x = 0; x < width; x++) + for (U32 c = 0; c < channels; ++c) { - *dst++ = (U32(*src) + U32(src[3]) + U32(src[stride]) + U32(src[stride+3]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[3]) + U32(src[stride]) + U32(src[stride+3]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[3]) + U32(src[stride]) + U32(src[stride+3]) + 2) >> 2; - src += 4; - } - src += stride; // skip - } - } - else - { - for(U32 y = 0; y < height; y++) - { - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src += 4; + U32 x0 = x * 2; + U32 y0 = y * 2; + U32 x1 = (x0 + 1 < srcWidth) ? x0 + 1 : x0; + U32 y1 = (y0 + 1 < srcHeight) ? y0 + 1 : y0; - src += stride; // skip + if constexpr (std::is_floating_point_v) + { + T sum = 0; + sum += src[y0 * srcRowStride + x0 * channels + c]; + sum += src[y0 * srcRowStride + x1 * channels + c]; + sum += src[y1 * srcRowStride + x0 * channels + c]; + sum += src[y1 * srcRowStride + x1 * channels + c]; + + dst[y * dstRowStride + x * channels + c] = sum * 0.25f; + } + else + { + U32 sum = 0; + sum += src[y0 * srcRowStride + x0 * channels + c]; + sum += src[y0 * srcRowStride + x1 * channels + c]; + sum += src[y1 * srcRowStride + x0 * channels + c]; + sum += src[y1 * srcRowStride + x1 * channels + c]; + dst[y * dstRowStride + x * channels + c] = T((sum + 2) >> 2); + } + } } } } -//-------------------------------------------------------------------------- -void bitmapExtrudeRGBA_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) +// 8-bit RGBA +auto bitmapExtrudeU8_RGBA = [](const void* src, void* dst, U32 h, U32 w, U32 bpp) { + bitmapExtrudeGeneric((const U8*)src, (U8*)dst, w, h, 4, bpp); +}; + +// 16-bit RGBA (U16 / F32 stored as U16) +auto bitmapExtrudeU16_RGBA = [](const void* src, void* dst, U32 h, U32 w, U32 bpp) { + bitmapExtrudeGeneric((const U16*)src, (U16*)dst, w, h, 4, bpp); +}; + +// 32-bit float RGBA +auto bitmapExtrudeF32_RGBA = [](const void* src, void* dst, U32 h, U32 w, U32 bpp) { + bitmapExtrudeGeneric((const F32*)src, (F32*)dst, w, h, 4, bpp); +}; + +// RGB U8 +auto bitmapExtrudeU8_RGB = [](const void* src, void* dst, U32 h, U32 w, U32 bpp) { + bitmapExtrudeGeneric((const U8*)src, (U8*)dst, w, h, 3, bpp); +}; + +void (*bitmapExtrude5551)(const void* srcMip, void* mip, U32 height, U32 width) = bitmapExtrude5551_c; +void (*bitmapExtrudeRGB)(const void* srcMip, void* mip, U32 srcHeight, U32 srcWidth, U32 bpp) = bitmapExtrudeU8_RGB; +void (*bitmapExtrudeRGBA)(const void* srcMip, void* mip, U32 srcHeight, U32 srcWidth, U32 bpp) = bitmapExtrudeU8_RGBA; +void (*bitmapExtrude16BitRGBA)(const void* srcMip, void* mip, U32 srcHeight, U32 srcWidth, U32 bpp) = bitmapExtrudeU16_RGBA; +void (*bitmapExtrudeFPRGBA)(const void* srcMip, void* mip, U32 srcHeight, U32 srcWidth, U32 bpp) = bitmapExtrudeU16_RGBA; +void (*bitmapExtrudeF32RGBA)(const void* srcMip, void* 
mip, U32 srcHeight, U32 srcWidth, U32 bpp) = bitmapExtrudeF32_RGBA; + +struct StbResizeDesc { - const U8 *src = (const U8 *) srcMip; - U8 *dst = (U8 *) mip; - U32 stride = srcHeight != 1 ? (srcWidth) * 4 : 0; + stbir_datatype datatype; + stbir_pixel_layout layout; + U32 bytesPerPixel; +}; - U32 width = srcWidth >> 1; - U32 height = srcHeight >> 1; - if (width == 0) width = 1; - if (height == 0) height = 1; - - if (srcWidth != 1) +inline bool getStbResizeDesc(GFXFormat fmt, StbResizeDesc& out) +{ + switch (fmt) { - for(U32 y = 0; y < height; y++) + // ---- 1 channel ---- + case GFXFormatA8: + case GFXFormatL8: + out = { STBIR_TYPE_UINT8, STBIR_1CHANNEL, 1 }; + return true; + + case GFXFormatL16: + out = { STBIR_TYPE_UINT16, STBIR_1CHANNEL, 2 }; + return true; + + case GFXFormatR16F: + out = { STBIR_TYPE_HALF_FLOAT, STBIR_1CHANNEL, 2 }; + return true; + + case GFXFormatR32F: + out = { STBIR_TYPE_FLOAT, STBIR_1CHANNEL, 4 }; + return true; + + // ---- 2 channel ---- + case GFXFormatA8L8: + out = { STBIR_TYPE_UINT8, STBIR_2CHANNEL, 2 }; + return true; + + case GFXFormatR16G16: + out = { STBIR_TYPE_UINT16, STBIR_2CHANNEL, 4 }; + return true; + + case GFXFormatR16G16F: + out = { STBIR_TYPE_HALF_FLOAT, STBIR_2CHANNEL, 4 }; + return true; + + // ---- RGB ---- + case GFXFormatR8G8B8: + out = { STBIR_TYPE_UINT8, STBIR_RGB, 3 }; + return true; + + case GFXFormatR8G8B8_SRGB: + out = { STBIR_TYPE_UINT8_SRGB, STBIR_RGB, 3 }; + return true; + + // ---- RGBA / RGBX ---- + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8X8: + out = { STBIR_TYPE_UINT8, STBIR_RGBA, 4 }; + return true; + + case GFXFormatR8G8B8A8_SRGB: + out = { STBIR_TYPE_UINT8_SRGB_ALPHA, STBIR_RGBA, 4 }; + return true; + + case GFXFormatB8G8R8A8: + out = { STBIR_TYPE_UINT8, STBIR_BGRA, 4 }; + return true; + + // ---- 16-bit RGBA ---- + case GFXFormatR16G16B16A16: + out = { STBIR_TYPE_UINT16, STBIR_RGBA, 8 }; + return true; + + case GFXFormatR16G16B16A16F: + out = { STBIR_TYPE_HALF_FLOAT, STBIR_RGBA, 8 }; + return true; + + // ---- 32-bit RGBA ---- + case GFXFormatR32G32B32A32F: + out = { STBIR_TYPE_FLOAT, STBIR_RGBA, 16 }; + return true; + + default: + return false; + } +} + +void bitmapStbResizeToOutput(const void* src, U32 srcHeight, U32 srcWidth, void* out, U32 outHeight, U32 outWidth, U32 bpp, GFXFormat format) +{ + StbResizeDesc desc; + if (!getStbResizeDesc(format, desc)) + { + return; + } + + const int srcStride = srcWidth * bpp; + const int dstStride = outWidth * bpp; + + stbir_resize( + src, + srcWidth, + srcHeight, + srcStride, + out, + outWidth, + outHeight, + dstStride, + desc.layout, + desc.datatype, + STBIR_EDGE_CLAMP, + STBIR_FILTER_MITCHELL); +} + +void(*bitmapResizeToOutput)(const void* src, U32 srcHeight, U32 srcWidth, void* out, U32 outHeight, U32 outWidth, U32 bpp, GFXFormat format) = bitmapStbResizeToOutput; + +//-------------------------------------------------------------------------------- +// Format description + +//-------------------------------------------------------------------------------- +// Channel semantics +enum ChannelSemantic : U8 +{ + CH_NONE, + CH_L, + CH_A, + CH_R, + CH_G, + CH_B +}; + +//-------------------------------------------------------------------------------- +// Bitmap format descriptor +struct GBitmapFormatDesc +{ + U8 channels; + ChannelSemantic semantic[4]; // per-channel meaning + stbir_datatype datatype; + bool srgb; + bool premultiplied; + bool isFloat; + U8 bytesPerChannel; + + bool is8() const { return !isFloat && bytesPerChannel == 1; } + bool is16() const { return !isFloat && 
bytesPerChannel == 2; } +}; + +//-------------------------------------------------------------------------------- +// Table mapping GFXFormat -> descriptor +GBitmapFormatDesc getFormatDesc(GFXFormat fmt) +{ + switch (fmt) + { + // 8-bit formats + case GFXFormatA8: + return { 1, {CH_A, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatL8: + return { 1, {CH_L, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatA4L4: + return { 2, {CH_L, CH_A, CH_NONE, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + + // 16-bit formats + case GFXFormatR5G6B5: + return { 3, {CH_R, CH_G, CH_B, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatR5G5B5A1: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatR5G5B5X1: + return { 4, {CH_R, CH_G, CH_B, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatA8L8: + return { 2, {CH_L, CH_A, CH_NONE, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatL16: + return { 1, {CH_L, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_UINT16, false, false, false, 2 }; + case GFXFormatR16F: + return { 1, {CH_R, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_HALF_FLOAT, false, false, true, 2 }; + case GFXFormatD16: + return { 1, {CH_L, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_UINT16, false, false, false, 2 }; + + // 24-bit formats + case GFXFormatR8G8B8: + return { 3, {CH_R, CH_G, CH_B, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatR8G8B8_SRGB: + return { 3, {CH_R, CH_G, CH_B, CH_NONE}, STBIR_TYPE_UINT8_SRGB, true, false, false, 1 }; + + // 32-bit formats + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8X8: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatB8G8R8A8: + return { 4, {CH_B, CH_G, CH_R, CH_A}, STBIR_TYPE_UINT8, false, false, false, 1 }; + case GFXFormatR8G8B8A8_SRGB: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_UINT8_SRGB_ALPHA, true, false, false, 1 }; + case GFXFormatR16G16: + return { 2, {CH_R, CH_G, CH_NONE, CH_NONE}, STBIR_TYPE_UINT16, false, false, false, 2 }; + case GFXFormatR16G16F: + return { 2, {CH_R, CH_G, CH_NONE, CH_NONE}, STBIR_TYPE_HALF_FLOAT, false, false, true, 2 }; + + // 64-bit formats + case GFXFormatR16G16B16A16: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_UINT16, false, false, false, 2 }; + case GFXFormatR16G16B16A16F: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_HALF_FLOAT, false, false, true, 2 }; + + // 128-bit formats + case GFXFormatR32G32B32A32F: + return { 4, {CH_R, CH_G, CH_B, CH_A}, STBIR_TYPE_FLOAT, false, false, true, 4 }; + + default: // fallback + return { 1, {CH_L, CH_NONE, CH_NONE, CH_NONE}, STBIR_TYPE_UINT8, false, false, false, 1 }; + } +} + +//-------------------------------------------------------------------------------- +// Conversion plan +struct ConversionPlan +{ + bool bitDepthChange; + bool channelRepack; + bool colorspaceChange; +}; + +ConversionPlan decideConversion(const GBitmapFormatDesc& src, const GBitmapFormatDesc& dst) +{ + ConversionPlan plan = {}; + plan.bitDepthChange = src.bytesPerChannel != dst.bytesPerChannel || src.isFloat != dst.isFloat; + plan.channelRepack = src.channels != dst.channels || dMemcmp(src.semantic, dst.semantic, sizeof(src.semantic)) != 0; + plan.colorspaceChange = src.srgb != dst.srgb; + return plan; +} + +//-------------------------------------------------------------------------------- +// Linear representation +struct LinearPixel +{ + 
float r = 0.f, g = 0.f, b = 0.f, a = 1.f; +}; + +inline float srgbToLinear(float c) +{ + return (c <= 0.04045f) ? c / 12.92f : powf((c + 0.055f) / 1.055f, 2.4f); +} + +inline float linearToSrgb(float c) +{ + return (c <= 0.0031308f) ? c * 12.92f : 1.055f * powf(c, 1.f / 2.4f) - 0.055f; +} + +//-------------------------------------------------------------------------------- +// Load a pixel from src format into LinearPixel +static inline LinearPixel loadPixel(const void* src, const GBitmapFormatDesc& fmt, U32 index) +{ + LinearPixel p; + const U8* base = (const U8*)src + index * fmt.channels * fmt.bytesPerChannel; + + for (U32 c = 0; c < fmt.channels; ++c) + { + float v = 0.f; + if (fmt.is8()) + v = float(base[c]) / 255.f; + else if (fmt.is16()) + v = float(convert16To8(*(const U16*)(base + c * 2))) / 255.f; + else if (fmt.isFloat) { - for(U32 x = 0; x < width; x++) - { - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride+4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride+4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride+4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride+4]) + 2) >> 2; - src += 5; - } - src += stride; // skip + if (fmt.bytesPerChannel == 2) // half float + v = convertHalfToFloat(*(const U16*)(base + c * 2)); + else // full float + v = *(const float*)(base + c * 4); + } + + if (fmt.srgb && fmt.semantic[c] != CH_A) + v = srgbToLinear(v); + + switch (fmt.semantic[c]) + { + case CH_R: p.r = v; break; + case CH_G: p.g = v; break; + case CH_B: p.b = v; break; + case CH_A: p.a = v; break; + case CH_L: p.r = p.g = p.b = v; break; + default: break; } } - else - { - for(U32 y = 0; y < height; y++) - { - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src += 5; + return p; +} - src += stride; // skip +//-------------------------------------------------------------------------------- +// Store a LinearPixel into dst format +static inline void storePixel(void* dst, const GBitmapFormatDesc& fmt, U32 index, const LinearPixel& p) +{ + U8* base = (U8*)dst + index * fmt.channels * fmt.bytesPerChannel; + for (U32 c = 0; c < fmt.channels; ++c) + { + float v = 0.f; + switch (fmt.semantic[c]) + { + case CH_R: v = p.r; break; + case CH_G: v = p.g; break; + case CH_B: v = p.b; break; + case CH_A: v = p.a; break; + case CH_L: v = (p.r + p.g + p.b) / 3.f; break; + default: break; + } + + if (fmt.srgb && fmt.semantic[c] != CH_A) + v = linearToSrgb(v); + + if (fmt.is8()) + base[c] = uint8_t(mClamp(v * 255.f, 0.f, 255.f)); + else if (fmt.is16()) + *(U16*)(base + c * 2) = convert8To16(uint8_t(mClamp(v * 255.f, 0.f, 255.f))); + else if (fmt.isFloat) + { + if (fmt.bytesPerChannel == 2) // half float + *(U16*)(base + c * 2) = convertFloatToHalf(v); + else + *(float*)(base + c * 4) = v; } } } -void bitmapExtrudeFPRGBA_c(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) +//-------------------------------------------------------------------------------- +// Main generalized converter +bool bitmapConvertFormat(U8** srcBuffer, U32 pixels, const GBitmapFormatDesc& srcFmt, const GBitmapFormatDesc& dstFmt) { - const U16 *src = (const U16 *)srcMip; - U16 *dst = (U16 *)mip; - U32 stride = srcHeight != 1 ? 
(srcWidth) * 8 : 0; + ConversionPlan plan = decideConversion(srcFmt, dstFmt); + if (!plan.bitDepthChange && !plan.channelRepack && !plan.colorspaceChange) + return true; // nothing to do - U32 width = srcWidth >> 1; - U32 height = srcHeight >> 1; - if (width == 0) width = 1; - if (height == 0) height = 1; + void* dstBuffer = *srcBuffer; - if (srcWidth != 1) + if (plan.bitDepthChange || plan.channelRepack) + dstBuffer = new U8[pixels * dstFmt.channels * dstFmt.bytesPerChannel]; + + for (U32 i = 0; i < pixels; ++i) { - for (U32 y = 0; y < height; y++) - { - for (U32 x = 0; x < width; x++) - { - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride + 4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride + 4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride + 4]) + 2) >> 2; - src++; - *dst++ = (U32(*src) + U32(src[4]) + U32(src[stride]) + U32(src[stride + 4]) + 2) >> 2; - src += 5; - } - src += stride; // skip - } + LinearPixel p = loadPixel(*srcBuffer, srcFmt, i); + storePixel(dstBuffer, dstFmt, i, p); } - else - { - for (U32 y = 0; y < height; y++) - { - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src++; - *dst++ = (U32(*src) + U32(src[stride]) + 1) >> 1; - src += 5; - src += stride; // skip - } + if (dstBuffer != *srcBuffer) + { + delete[](U8*)* srcBuffer; + *srcBuffer = (U8*)dstBuffer; } + + return true; } -void (*bitmapExtrude5551)(const void *srcMip, void *mip, U32 height, U32 width) = bitmapExtrude5551_c; -void (*bitmapExtrudeRGB)(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) = bitmapExtrudeRGB_c; -void (*bitmapExtrudeRGBA)(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) = bitmapExtrudeRGBA_c; -void (*bitmapExtrudeFPRGBA)(const void *srcMip, void *mip, U32 srcHeight, U32 srcWidth) = bitmapExtrudeFPRGBA_c; +//-------------------------------------------------------------------------------- +// Entry point for GBitmap::setFormat +bool bitmapALLConvertToOutput(U8** src, U32 pixels, GFXFormat srcFormat, GFXFormat dstFormat) +{ + const GBitmapFormatDesc& srcFmt = getFormatDesc(srcFormat); + const GBitmapFormatDesc& dstFmt = getFormatDesc(dstFormat); + return bitmapConvertFormat(src, pixels, srcFmt, dstFmt); +} +bool(*bitmapConvertToOutput)(U8** src, U32 pixels, GFXFormat srcFormat, GFXFormat dstFormat) = bitmapALLConvertToOutput; //-------------------------------------------------------------------------- @@ -238,7 +526,7 @@ void bitmapConvertRGB_to_1555_c(U8 *src, U32 pixels) U32 g = src[1] >> 3; U32 b = src[2] >> 3; -#if defined(TORQUE_OS_MAC) +#if defined(TORQUE_BIG_ENDIAN) *dst++ = 0x8000 | (b << 10) | (g << 5) | (r << 0); #else *dst++ = b | (g << 5) | (r << 10) | 0x8000; @@ -260,7 +548,7 @@ void bitmapConvertRGB_to_5551_c(U8 *src, U32 pixels) U32 g = src[1] >> 3; U32 b = src[2] >> 3; -#if defined(TORQUE_OS_MAC) +#if defined(TORQUE_BIG_ENDIAN) *dst++ = (1 << 15) | (b << 10) | (g << 5) | (r << 0); #else *dst++ = (b << 1) | (g << 6) | (r << 11) | 1; diff --git a/Engine/source/gfx/bitmap/bitmapUtils.h b/Engine/source/gfx/bitmap/bitmapUtils.h index 489a8f296..10053fdf7 100644 --- a/Engine/source/gfx/bitmap/bitmapUtils.h +++ b/Engine/source/gfx/bitmap/bitmapUtils.h @@ -22,21 +22,148 @@ #ifndef _BITMAPUTILS_H_ #define _BITMAPUTILS_H_ - +#ifndef _PLATFORM_H_ +#include "platform/platform.h" +#endif #ifndef _TORQUE_TYPES_H_ #include 
"platform/types.h" #endif +#ifndef _GFXENUMS_H_ +#include "gfx/gfxEnums.h" +#endif +#ifndef _MMATHFN_H_ +#include "math/mMathFn.h" +#endif extern void (*bitmapExtrude5551)(const void *srcMip, void *mip, U32 height, U32 width); -extern void (*bitmapExtrudeRGB)(const void *srcMip, void *mip, U32 height, U32 width); -extern void (*bitmapExtrudeRGBA)(const void *srcMip, void *mip, U32 height, U32 width); -extern void(*bitmapExtrudeFPRGBA)(const void *srcMip, void *mip, U32 height, U32 width); +extern void (*bitmapExtrudeRGB)(const void *srcMip, void *mip, U32 height, U32 width, U32 bpp); +extern void (*bitmapExtrudeRGBA)(const void *srcMip, void *mip, U32 height, U32 width, U32 bpp); +extern void (*bitmapExtrude16BitRGBA)(const void *srcMip, void *mip, U32 height, U32 width, U32 bpp); +extern void(*bitmapExtrudeFPRGBA)(const void *srcMip, void *mip, U32 height, U32 width, U32 bpp); +extern void(*bitmapExtrudeF32RGBA)(const void *srcMip, void *mip, U32 height, U32 width, U32 bpp); + +extern void(*bitmapResizeToOutput)(const void* src, U32 srcHeight, U32 srcWidth, void* out, U32 outHeight, U32 outWidth, U32 bpp, GFXFormat format); +extern bool(*bitmapConvertToOutput)(U8** src, U32 pixels, GFXFormat srcFormat, GFXFormat dstFormat); + extern void (*bitmapConvertRGB_to_5551)(U8 *src, U32 pixels); extern void (*bitmapConvertRGB_to_1555)(U8 *src, U32 pixels); extern void (*bitmapConvertRGB_to_RGBX)( U8 **src, U32 pixels ); extern void (*bitmapConvertRGBX_to_RGB)( U8 **src, U32 pixels ); extern void (*bitmapConvertA8_to_RGBA)( U8 **src, U32 pixels ); -void bitmapExtrudeRGB_c(const void *srcMip, void *mip, U32 height, U32 width); +//----------------------------------------------------------------------------- +// Half <-> Float Conversion Utilities +//----------------------------------------------------------------------------- + +inline F32 convertHalfToFloat(U16 h) +{ + U32 sign = (h >> 15) & 0x00000001; + U32 exp = (h >> 10) & 0x0000001F; + U32 mant = h & 0x000003FF; + + U32 outSign = sign << 31; + U32 outExp, outMant; + + if (exp == 0) + { + if (mant == 0) + { + // Zero + outExp = 0; + outMant = 0; + } + else + { + // Subnormal number -> normalize + exp = 1; + while ((mant & 0x00000400) == 0) + { + mant <<= 1; + exp -= 1; + } + mant &= 0x000003FF; + outExp = (exp + (127 - 15)) << 23; + outMant = mant << 13; + } + } + else if (exp == 31) + { + // Inf or NaN + outExp = 0xFF << 23; + outMant = mant ? 
(mant << 13) : 0; + } + else + { + // Normalized + outExp = (exp + (127 - 15)) << 23; + outMant = mant << 13; + } + + U32 out = outSign | outExp | outMant; + F32 result; + dMemcpy(&result, &out, sizeof(F32)); + return result; +} + +inline U16 convertFloatToHalf(F32 f) +{ + U32 bits; + dMemcpy(&bits, &f, sizeof(U32)); + + U32 sign = (bits >> 16) & 0x00008000; + U32 exp = ((bits >> 23) & 0x000000FF) - (127 - 15); + U32 mant = bits & 0x007FFFFF; + + if (exp <= 0) + { + if (exp < -10) + return (U16)sign; // Too small => 0 + mant = (mant | 0x00800000) >> (1 - exp); + return (U16)(sign | (mant >> 13)); + } + else if (exp == 0xFF - (127 - 15)) + { + if (mant == 0) + { + // Inf + return (U16)(sign | 0x7C00); + } + else + { + // NaN + mant >>= 13; + return (U16)(sign | 0x7C00 | mant | (mant == 0)); + } + } + else + { + if (exp > 30) + { + // Overflow => Inf + return (U16)(sign | 0x7C00); + } + return (U16)(sign | (exp << 10) | (mant >> 13)); + } +} + +// Convert a single 16-bit value (0..65535) to 8-bit (0..255) +inline U8 convert16To8(U16 v16) +{ + // Take the top 8 bits as approximation + return U8(v16 >> 8); +} + +// Convert a single 8-bit value (0..255) to 16-bit (0..65535) +inline U16 convert8To16(U8 v8) +{ + // Replicate into high and low byte: 0->0, 255->0xFFFF + return (U16(v8) << 8) | v8; +} + +inline U8 floatTo8(F32 v) +{ + return U8(mClamp(v * 255.f, 0.f, 255.f)); +} + #endif //_BITMAPUTILS_H_ diff --git a/Engine/source/gfx/bitmap/cubemapSaver.cpp b/Engine/source/gfx/bitmap/cubemapSaver.cpp index 734fe19cc..b180c1a9a 100644 --- a/Engine/source/gfx/bitmap/cubemapSaver.cpp +++ b/Engine/source/gfx/bitmap/cubemapSaver.cpp @@ -34,9 +34,7 @@ namespace CubemapSaver { - const U32 CubeFaces = 6; - - bool save(GFXCubemapHandle cubemap, const Torque::Path &path, GFXFormat compressionFormat) + bool save(GFXTexHandle cubemap, const Torque::Path &path, GFXFormat compressionFormat) { if (!cubemap.isValid()) { @@ -44,43 +42,24 @@ namespace CubemapSaver return false; } - - GFXCubemap *pCubemap = cubemap.getPointer(); - const U32 faceSize = pCubemap->getSize(); - const U32 mipLevels = pCubemap->getMipMapLevels(); - - GFXFormat targetFmt = pCubemap->getFormat(); - //setup render targets - GFXTexHandle pTextures[CubeFaces]; - - for (U32 face = 0; face < CubeFaces; face++) - { - pTextures[face].set(faceSize, faceSize, targetFmt, - &GFXTexturePersistentProfile, avar("%s() - (line %d)", __FUNCTION__, __LINE__), - mipLevels, GFXTextureManager::AA_MATCH_BACKBUFFER); - - // yep t3d has funky z up, need to change the face order - GFX->copyResource(pTextures[face], pCubemap, GFXCubemap::zUpFaceIndex(face) ); - } - - GBitmap *pBitmaps[CubeFaces]; bool error = false; const bool compressedFormat = ImageUtil::isCompressedFormat(compressionFormat); + const U32 faceSize = cubemap->getWidth(); + const U32 mipLevels = cubemap->getMipLevels(); + GFXFormat targetFmt = cubemap->getFormat(); const bool hasMips = mipLevels > 1 ? 
true : false; - for (U32 i = 0; i < CubeFaces; i++) + + GBitmap* temp = new GBitmap(faceSize, faceSize, hasMips, targetFmt, 6); + bool result = cubemap.copyToBmp(temp); + if (!result) { - pBitmaps[i] = new GBitmap(faceSize, faceSize, hasMips, targetFmt); - bool result = pTextures[i].copyToBmp(pBitmaps[i]); - if (!result) - { - Con::errorf("CubemapSaver: cubemap number %u failed to copy", i); - error = true; - } + Con::errorf("CubemapSaver: cubemap failed to copy"); + error = true; } if (!error) { - DDSFile *pDds = DDSFile::createDDSCubemapFileFromGBitmaps(pBitmaps); + DDSFile *pDds = DDSFile::createDDSFileFromGBitmap(temp); if (pDds) { // compressed and floating point don't need swizzling @@ -103,14 +82,12 @@ namespace CubemapSaver } //cleanup - for (U32 i = 0; i < CubeFaces; i++) - SAFE_DELETE(pBitmaps[i]); - + SAFE_DELETE(temp); return true; } - bool getBitmaps(GFXCubemapHandle cubemap, GFXFormat compressionFormat, GBitmap* faceBitmaps[6]) + bool getBitmaps(GFXTexHandle cubemap, GFXFormat compressionFormat, GBitmap* faceBitmaps[6]) { if (!cubemap.isValid()) { diff --git a/Engine/source/gfx/bitmap/cubemapSaver.h b/Engine/source/gfx/bitmap/cubemapSaver.h index 1361f75ba..8385eeba6 100644 --- a/Engine/source/gfx/bitmap/cubemapSaver.h +++ b/Engine/source/gfx/bitmap/cubemapSaver.h @@ -33,9 +33,9 @@ namespace CubemapSaver { // save cubemap handle to a dds cubemap with optional compression - bool save(GFXCubemapHandle cubemap, const Torque::Path &path, GFXFormat compressionFormat = GFXFormatR8G8B8A8); + bool save(GFXTexHandle cubemap, const Torque::Path &path, GFXFormat compressionFormat = GFXFormatR8G8B8A8); - bool getBitmaps(GFXCubemapHandle cubemap, GFXFormat compressionFormat, GBitmap* faceBitmaps[6]); + bool getBitmaps(GFXTexHandle cubemap, GFXFormat compressionFormat, GBitmap* faceBitmaps[6]); }; -#endif \ No newline at end of file +#endif diff --git a/Engine/source/gfx/bitmap/ddsFile.cpp b/Engine/source/gfx/bitmap/ddsFile.cpp index f20445058..b518af948 100644 --- a/Engine/source/gfx/bitmap/ddsFile.cpp +++ b/Engine/source/gfx/bitmap/ddsFile.cpp @@ -652,6 +652,12 @@ Resource DDSFile::load( const Torque::Path &path, U32 dropMipCount ) //------------------------------------------------------------------------------ +bool DDSFile::isCompressedFormat(GFXFormat fmt) +{ + return (fmt >= GFXFormatBC1 && fmt <= GFXFormatBC5) || + (fmt >= GFXFormatBC1_SRGB && fmt <= GFXFormatBC3_SRGB); +} + DDSFile *DDSFile::createDDSFileFromGBitmap( const GBitmap *gbmp ) { if( gbmp == NULL ) @@ -666,6 +672,11 @@ DDSFile *DDSFile::createDDSFileFromGBitmap( const GBitmap *gbmp ) ret->mDepth = 0; ret->mFormat = gbmp->getFormat(); ret->mFlags.set(RGBData); + if (gbmp->getNumFaces() == 6) + { + ret->mFlags.set(RGBData | CubeMapFlag | CubeMap_PosX_Flag | CubeMap_NegX_Flag | CubeMap_PosY_Flag | + CubeMap_NegY_Flag | CubeMap_PosZ_Flag | CubeMap_NegZ_Flag); + } ret->mBytesPerPixel = gbmp->getBytesPerPixel(); ret->mMipMapCount = gbmp->getNumMipLevels(); ret->mHasTransparency = gbmp->getHasTransparency(); @@ -685,36 +696,39 @@ DDSFile *DDSFile::createDDSFileFromGBitmap( const GBitmap *gbmp ) if( ret->mMipMapCount > 1 ) ret->mFlags.set(MipMapsFlag); - // One surface per GBitmap - ret->mSurfaces.push_back( new SurfaceData() ); - - // Load the mips - for( S32 i = 0; i < ret->mMipMapCount; i++ ) + for (U32 face = 0; face < gbmp->getNumFaces(); face++) { - const U32 mipSz = ret->getSurfaceSize(i); - ret->mSurfaces.last()->mMips.push_back( new U8[mipSz] ); + // One surface per GBitmap + ret->mSurfaces.push_back(new SurfaceData()); 
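A DDS cubemap stores one surface per face, each surface carrying its own full mip chain, conventionally in the +X, -X, +Y, -Y, +Z, -Z order, so a six-face GBitmap maps its face index straight onto the surface index pushed here; the loop below then fills each surface's mips from the matching face. A rough sketch of the correspondence this produces (illustration only):

   // after createDDSFileFromGBitmap: face f, mip m of the source bitmap lives at
   //    ret->mSurfaces[f]->mMips[m]   <->   gbmp->getBits(m, f)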
- U8 *mipMem = ret->mSurfaces.last()->mMips.last(); - - // If this is a straight copy, just do it, otherwise (ugh) - if( ret->mFormat == gbmp->getFormat() ) - dMemcpy( mipMem, gbmp->getBits(i), mipSz ); - else + // Load the mips + for (S32 i = 0; i < ret->mMipMapCount; i++) { - // Assumption: - AssertFatal( gbmp->getBytesPerPixel() + 1 == ret->mBytesPerPixel, "Assumption failed, not 24->32 bit straight convert." ); + const U32 mipSz = ret->getSurfaceSize(i); + ret->mSurfaces.last()->mMips.push_back(new U8[mipSz]); - for( S32 pxl = 0; pxl < gbmp->getWidth(i) * gbmp->getHeight(i); pxl++ ) + U8* mipMem = ret->mSurfaces.last()->mMips.last(); + + // If this is a straight copy, just do it, otherwise (ugh) + if (ret->mFormat == gbmp->getFormat()) + dMemcpy(mipMem, gbmp->getBits(i, face), mipSz); + else { - U8 *dst = &mipMem[pxl * ret->mBytesPerPixel]; - const U8 *src = &gbmp->getBits(i)[pxl * gbmp->getBytesPerPixel()]; - dMemcpy( dst, src, gbmp->getBytesPerPixel() * sizeof(U8) ); - dst[ret->mBytesPerPixel - 1] = 255; - } - } + // Assumption: + AssertFatal(gbmp->getBytesPerPixel() + 1 == ret->mBytesPerPixel, "Assumption failed, not 24->32 bit straight convert."); - // Uncomment to debug-dump each mip level - //ret->mSurfaces.last()->dumpImage( ret, i, avar( "%d_Gbmp_xmip%d", ret, i ) ); + for (S32 pxl = 0; pxl < gbmp->getWidth(i) * gbmp->getHeight(i); pxl++) + { + U8* dst = &mipMem[pxl * ret->mBytesPerPixel]; + const U8* src = &gbmp->getBits(i, face)[pxl * gbmp->getBytesPerPixel()]; + dMemcpy(dst, src, gbmp->getBytesPerPixel() * sizeof(U8)); + dst[ret->mBytesPerPixel - 1] = 255; + } + } + + // Uncomment to debug-dump each mip level + //ret->mSurfaces.last()->dumpImage( ret, i, avar( "%d_Gbmp_xmip%d", ret, i ) ); + } } return ret; @@ -777,22 +791,50 @@ DDSFile *DDSFile::createDDSCubemapFileFromGBitmaps(GBitmap **gbmps) bool DDSFile::decompressToGBitmap(GBitmap *dest) { + const bool isCube = isCubemap(); + const U32 numFaces = isCube ? 6 : 1; // TBD: do we support other formats? - if (mFormat != GFXFormatBC1 && mFormat != GFXFormatBC2 && mFormat != GFXFormatBC3) - return false; + if (!isCompressedFormat(mFormat)) + { + dest->allocateBitmapWithMips(getWidth(), getHeight(), getMipLevels(), mFormat, numFaces); + U32 numMips = getMipLevels(); - dest->allocateBitmapWithMips(getWidth(), getHeight(), getMipLevels(), GFXFormatR8G8B8A8); + for (U32 face = 0; face < numFaces; face++) + { + for (U32 i = 0; i < numMips; i++) + { + U8* addr = dest->getAddress(0, 0, i, face); + + const U8* mipBuffer = mSurfaces[face]->mMips[i]; + const U32 mipWidth = getWidth(i); + const U32 mipHeight = getHeight(i); + + const U32 bpp = dest->getBytesPerPixel(); + const U32 rowBytes = mipWidth * bpp; + + for (U32 y = 0; y < mipHeight; ++y) + { + dMemcpy(addr + y * rowBytes, mipBuffer + y * rowBytes, rowBytes); + } + } + } + return true; + } + + dest->allocateBitmapWithMips(getWidth(), getHeight(), getMipLevels(), GFXFormatR8G8B8A8, numFaces); // Decompress and copy mips... 
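Since decompressToGBitmap now fills one face per DDS surface in the destination bitmap, callers can pull individual faces out of the result afterwards; a hypothetical usage sketch (the local names are illustrative, and the face order is assumed to follow the DDS surface order):

   GBitmap faces;                            // allocated by decompressToGBitmap
   if (dds->decompressToGBitmap(&faces))
   {
      const U8* firstFace = faces.getBits(0, 0);                         // mip 0, face 0
      const U8* lastFace  = faces.getBits(0, faces.getNumFaces() - 1);   // mip 0, last face
   }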
U32 numMips = getMipLevels(); - - for (U32 i = 0; i < numMips; i++) + for (U32 face = 0; face < numFaces; face++) { - U8 *addr = dest->getAddress(0, 0, i); - const U8 *mipBuffer = mSurfaces[0]->mMips[i]; - ImageUtil::decompress(mipBuffer, addr, getWidth(i), getHeight(i), mFormat); + for (U32 i = 0; i < numMips; i++) + { + U8* addr = dest->getAddress(0, 0, i, face); + const U8* mipBuffer = mSurfaces[face]->mMips[i]; + ImageUtil::decompress(mipBuffer, addr, getWidth(i), getHeight(i), mFormat); + } } return true; diff --git a/Engine/source/gfx/bitmap/ddsFile.h b/Engine/source/gfx/bitmap/ddsFile.h index 84b515f86..5ad0a1baa 100644 --- a/Engine/source/gfx/bitmap/ddsFile.h +++ b/Engine/source/gfx/bitmap/ddsFile.h @@ -205,6 +205,8 @@ struct DDSFile mSurfaces.clear(); } + bool isCompressedFormat(GFXFormat fmt); + static DDSFile *createDDSFileFromGBitmap( const GBitmap *gbmp ); //Create a single cubemap texture from 6 GBitmap static DDSFile *createDDSCubemapFileFromGBitmaps(GBitmap **gbmps); diff --git a/Engine/source/gfx/bitmap/gBitmap.cpp b/Engine/source/gfx/bitmap/gBitmap.cpp index 4c85537ad..4340b38e2 100644 --- a/Engine/source/gfx/bitmap/gBitmap.cpp +++ b/Engine/source/gfx/bitmap/gBitmap.cpp @@ -49,9 +49,11 @@ GBitmap::GBitmap() mHeight(0), mBytesPerPixel(0), mNumMipLevels(0), - mHasTransparency(false) + mHasTransparency(false), + mNumFaces(1) { std::fill_n(mMipLevelOffsets, c_maxMipLevels, 0xffffffff); + std::fill_n(mFaceOffsets, 6, 0xffffffff); } GBitmap::GBitmap(const GBitmap& rCopy) @@ -67,33 +69,42 @@ GBitmap::GBitmap(const GBitmap& rCopy) mBytesPerPixel = rCopy.mBytesPerPixel; mNumMipLevels = rCopy.mNumMipLevels; dMemcpy(mMipLevelOffsets, rCopy.mMipLevelOffsets, sizeof(mMipLevelOffsets)); + dMemcpy(mFaceOffsets, rCopy.mFaceOffsets, sizeof(mFaceOffsets)); mHasTransparency = rCopy.mHasTransparency; + mNumFaces = rCopy.mNumFaces; } GBitmap::GBitmap(const U32 in_width, const U32 in_height, const bool in_extrudeMipLevels, - const GFXFormat in_format) + const GFXFormat in_format, + const U32 in_numFaces) : mBits(NULL), - mByteSize(0) + mByteSize(0), + mNumFaces(in_numFaces) { for (U32 i = 0; i < c_maxMipLevels; i++) mMipLevelOffsets[i] = 0xffffffff; - allocateBitmap(in_width, in_height, in_extrudeMipLevels, in_format); + for(U32 i = 0; i < 6; i++) + mFaceOffsets[i] = 0xffffffff; + + allocateBitmap(in_width, in_height, in_extrudeMipLevels, in_format, in_numFaces); mHasTransparency = false; } GBitmap::GBitmap(const U32 in_width, const U32 in_height, - const U8* data ) + const U8* data, + const U32 in_numFaces) : mBits(NULL), - mByteSize(0) + mByteSize(0), + mNumFaces(in_numFaces) { - allocateBitmap(in_width, in_height, false, GFXFormatR8G8B8A8); + allocateBitmap(in_width, in_height, false, GFXFormatR8G8B8A8, in_numFaces); mHasTransparency = false; @@ -126,6 +137,65 @@ GBitmap::~GBitmap() //-------------------------------------------------------------------------- +U32 GBitmap::getFormatBytesPerPixel(GFXFormat fmt) +{ + switch (fmt) + { + // 8-bit formats + case GFXFormatA8: + case GFXFormatL8: + case GFXFormatA4L4: + return 1; + + // 16-bit formats + case GFXFormatR5G6B5: + case GFXFormatR5G5B5A1: + case GFXFormatR5G5B5X1: + case GFXFormatA8L8: + case GFXFormatL16: + case GFXFormatR16F: + case GFXFormatD16: + return 2; + + // 24-bit formats + case GFXFormatR8G8B8: + case GFXFormatR8G8B8_SRGB: + return 3; + + // 32-bit formats + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8X8: + case GFXFormatB8G8R8A8: + case GFXFormatR8G8B8A8_SRGB: + case GFXFormatR32F: + case GFXFormatR10G10B10A2: + case 
GFXFormatR11G11B10: + case GFXFormatD24X8: + case GFXFormatD24S8: + case GFXFormatD24FS8: + case GFXFormatR16G16: + case GFXFormatR16G16F: + case GFXFormatR8G8B8A8_LINEAR_FORCE: + return 4; + + // 64-bit formats + case GFXFormatR16G16B16A16: + case GFXFormatR16G16B16A16F: + case GFXFormatD32FS8X24: + return 8; + + // 128-bit formats + case GFXFormatR32G32B32A32F: + return 16; + + default: + AssertWarn(false, "getFormatBytesPerPixel() - Unknown or compressed format"); + return 4; + } +} + +//-------------------------------------------------------------------------- + void GBitmap::sRegisterFormat( const GBitmap::Registration ® ) { U32 insert = sRegistrations.size(); @@ -268,7 +338,7 @@ void GBitmap::copyRect(const GBitmap *src, const RectI &srcRect, const Point2I & } //-------------------------------------------------------------------------- -void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool in_extrudeMipLevels, const GFXFormat in_format ) +void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool in_extrudeMipLevels, const GFXFormat in_format, const U32 in_numFaces) { //-------------------------------------- Some debug checks... U32 svByteSize = mByteSize; @@ -284,37 +354,14 @@ void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool mInternalFormat = in_format; mWidth = in_width; mHeight = in_height; + mNumFaces = in_numFaces; - mBytesPerPixel = 1; - switch (mInternalFormat) - { - case GFXFormatA8: - case GFXFormatL8: mBytesPerPixel = 1; - break; - case GFXFormatR8G8B8: mBytesPerPixel = 3; - break; - case GFXFormatR8G8B8A8_LINEAR_FORCE: - case GFXFormatR8G8B8X8: - case GFXFormatR8G8B8A8: mBytesPerPixel = 4; - break; - case GFXFormatL16: - case GFXFormatR5G6B5: - case GFXFormatR5G5B5A1: mBytesPerPixel = 2; - break; - case GFXFormatR16G16B16A16F: - case GFXFormatR16G16B16A16: mBytesPerPixel = 8; - break; - default: - AssertFatal(false, "GBitmap::GBitmap: misunderstood format specifier"); - break; - } + mBytesPerPixel = getFormatBytesPerPixel(mInternalFormat); // Set up the mip levels, if necessary... mNumMipLevels = 1; - U32 allocPixels = in_width * in_height * mBytesPerPixel; mMipLevelOffsets[0] = 0; - if (in_extrudeMipLevels == true) { U32 currWidth = in_width; @@ -330,7 +377,6 @@ void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool if (currHeight == 0) currHeight = 1; mNumMipLevels++; - allocPixels += currWidth * currHeight * mBytesPerPixel; } U32 expectedMips = mFloor(mLog2(mMax(in_width, in_height))) + 1; @@ -338,8 +384,17 @@ void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool } AssertFatal(mNumMipLevels <= c_maxMipLevels, "GBitmap::allocateBitmap: too many miplevels"); + U32 faceStride = 0; + for (U32 mip = 0; mip < mNumMipLevels; mip++) + faceStride += getWidth(mip) * getHeight(mip) * mBytesPerPixel; + + for (U32 face = 0; face < mNumFaces; face++) + mFaceOffsets[face] = face * faceStride; + + U32 allocBytes = faceStride * mNumFaces; + // Set up the memory... 
- mByteSize = allocPixels; + mByteSize = allocBytes; mBits = new U8[mByteSize]; dMemset(mBits, 0xFF, mByteSize); @@ -352,7 +407,7 @@ void GBitmap::allocateBitmap(const U32 in_width, const U32 in_height, const bool } //-------------------------------------------------------------------------- -void GBitmap::allocateBitmapWithMips(const U32 in_width, const U32 in_height, const U32 in_numMips, const GFXFormat in_format) +void GBitmap::allocateBitmapWithMips(const U32 in_width, const U32 in_height, const U32 in_numMips, const GFXFormat in_format, const U32 in_numFaces) { //-------------------------------------- Some debug checks... U32 svByteSize = mByteSize; @@ -363,36 +418,14 @@ void GBitmap::allocateBitmapWithMips(const U32 in_width, const U32 in_height, co mInternalFormat = in_format; mWidth = in_width; mHeight = in_height; + mNumFaces = in_numFaces; - mBytesPerPixel = 1; - switch (mInternalFormat) - { - case GFXFormatA8: - case GFXFormatL8: mBytesPerPixel = 1; - break; - case GFXFormatR8G8B8: mBytesPerPixel = 3; - break; - case GFXFormatR8G8B8X8: - case GFXFormatR8G8B8A8: mBytesPerPixel = 4; - break; - case GFXFormatL16: - case GFXFormatR5G6B5: - case GFXFormatR5G5B5A1: mBytesPerPixel = 2; - break; - case GFXFormatR16G16B16A16F: - case GFXFormatR16G16B16A16: mBytesPerPixel = 8; - break; - default: - AssertFatal(false, "GBitmap::GBitmap: misunderstood format specifier"); - break; - } + mBytesPerPixel = getFormatBytesPerPixel(mInternalFormat); // Set up the mip levels, if necessary... mNumMipLevels = 1; - U32 allocPixels = in_width * in_height * mBytesPerPixel; mMipLevelOffsets[0] = 0; - if (in_numMips != 0) { U32 currWidth = in_width; @@ -408,13 +441,21 @@ void GBitmap::allocateBitmapWithMips(const U32 in_width, const U32 in_height, co if (currHeight == 0) currHeight = 1; mNumMipLevels++; - allocPixels += currWidth * currHeight * mBytesPerPixel; } while ((currWidth != 1 || currHeight != 1) && (mNumMipLevels != in_numMips)); } AssertFatal(mNumMipLevels <= c_maxMipLevels, "GBitmap::allocateBitmap: too many miplevels"); + U32 faceStride = 0; + for (U32 mip = 0; mip < mNumMipLevels; mip++) + faceStride += getWidth(mip) * getHeight(mip) * mBytesPerPixel; + + for (U32 face = 0; face < mNumFaces; face++) + mFaceOffsets[face] = face * faceStride; + + U32 allocBytes = faceStride * mNumFaces; + // Set up the memory... 
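To make the face-major layout concrete, a worked example (illustrative numbers, not from the patch): a 64x64 R8G8B8A8 bitmap with a full mip chain has a per-face stride of 4 * (64*64 + 32*32 + 16*16 + 8*8 + 4*4 + 2*2 + 1*1) = 21844 bytes, so a 6-face cubemap allocates 131064 bytes and face f, mip m starts at mFaceOffsets[f] + mMipLevelOffsets[m]. For example:

   // face 2, mip 1 of that 64x64 RGBA8 cubemap
   const U8* bits = bmp->getBits(1, 2);   // = mBits + 2 * 21844 + 64 * 64 * 4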
- mByteSize = allocPixels; + mByteSize = allocBytes; mBits = new U8[mByteSize]; dMemset(mBits, 0xFF, mByteSize); @@ -432,40 +473,29 @@ void GBitmap::extrudeMipLevels(bool clearBorders) if(mNumMipLevels == 1) allocateBitmap(getWidth(), getHeight(), true, getFormat()); - switch (getFormat()) + + if (getFormat() == GFXFormatR5G5B5A1) { - case GFXFormatR5G5B5A1: - { - for(U32 i = 1; i < mNumMipLevels; i++) - bitmapExtrude5551(getBits(i - 1), getWritableBits(i), getHeight(i), getWidth(i)); - break; - } - - case GFXFormatR8G8B8: - { - for(U32 i = 1; i < mNumMipLevels; i++) - bitmapExtrudeRGB(getBits(i - 1), getWritableBits(i), getHeight(i-1), getWidth(i-1)); - break; - } - - case GFXFormatR8G8B8A8: - case GFXFormatR8G8B8X8: - { - for(U32 i = 1; i < mNumMipLevels; i++) - bitmapExtrudeRGBA(getBits(i - 1), getWritableBits(i), getHeight(i-1), getWidth(i-1)); - break; - } - - case GFXFormatR16G16B16A16F: - { - for (U32 i = 1; i < mNumMipLevels; i++) - bitmapExtrudeFPRGBA(getBits(i - 1), getWritableBits(i), getHeight(i - 1), getWidth(i - 1)); - break; - } - - default: - break; + for (U32 i = 1; i < mNumMipLevels; i++) + bitmapExtrude5551(getBits(i - 1), getWritableBits(i), getHeight(i), getWidth(i)); } + else + { + for (U32 i = 1; i < mNumMipLevels; i++) + { + bitmapResizeToOutput( + getBits(i - 1), + getHeight(i - 1), + getWidth(i - 1), + getWritableBits(i), + getHeight(i), + getWidth(i), + mBytesPerPixel, + getFormat() + ); + } + } + if (clearBorders) { for (U32 i = 1; i= mWidth || y >= mHeight) return false; - const U8* pLoc = getAddress(x, y); + const U8* p = getAddress(x, y); - switch (mInternalFormat) { - case GFXFormatA8: - case GFXFormatL8: - rColor.set( *pLoc, *pLoc, *pLoc, *pLoc ); - break; - case GFXFormatL16: - rColor.set(U8(U16((pLoc[0] << 8) + pLoc[1])), 0, 0, 0); - break; - case GFXFormatR8G8B8: - case GFXFormatR8G8B8X8: - rColor.set( pLoc[0], pLoc[1], pLoc[2], 255 ); + switch (mInternalFormat) + { + // --- 8-bit --- + case GFXFormatA8: + rColor.set(255, 255, 255, p[0]); break; - case GFXFormatR8G8B8A8: - rColor.set( pLoc[0], pLoc[1], pLoc[2], pLoc[3] ); + case GFXFormatL8: + rColor.set(p[0], p[0], p[0], 255); break; - case GFXFormatR5G5B5A1: -#if defined(TORQUE_OS_MAC) - rColor.set( (*((U16*)pLoc) >> 0) & 0x1F, - (*((U16*)pLoc) >> 5) & 0x1F, - (*((U16*)pLoc) >> 10) & 0x1F, - ((*((U16*)pLoc) >> 15) & 0x01) ? 255 : 0 ); + case GFXFormatA4L4: + { + U8 v = p[0]; + U8 lum = (v & 0x0F) * 17; + U8 alp = ((v >> 4) & 0x0F) * 17; + rColor.set(lum, lum, lum, alp); + break; + } + + // --- 16-bit --- + case GFXFormatR5G6B5: + { + U16 c = ((U16*)p)[0]; +#ifdef TORQUE_BIG_ENDIAN + c = convertLEndianToHost(c); +#endif + U8 r = (c >> 11) & 0x1F; + U8 g = (c >> 5) & 0x3F; + U8 b = c & 0x1F; + rColor.set((r << 3) | (r >> 2), + (g << 2) | (g >> 4), + (b << 3) | (b >> 2), + 255); + break; + } + + case GFXFormatR5G5B5A1: + { + U16 c = ((U16*)p)[0]; +#ifdef TORQUE_BIG_ENDIAN + c = convertLEndianToHost(c); +#endif + U8 r = (c >> 11) & 0x1F; + U8 g = (c >> 6) & 0x1F; + U8 b = (c >> 1) & 0x1F; + U8 a = (c & 0x01) ? 
255 : 0; + rColor.set((r << 3) | (r >> 2), + (g << 3) | (g >> 2), + (b << 3) | (b >> 2), + a); + break; + } + + case GFXFormatA8L8: + { + U16 c = ((U16*)p)[0]; +#ifdef TORQUE_BIG_ENDIAN + c = convertLEndianToHost(c); +#endif + U8 l = c & 0xFF; + U8 a = (c >> 8) & 0xFF; + rColor.set(l, l, l, a); + break; + } + + case GFXFormatL16: + { + U16 l = ((U16*)p)[0]; +#ifdef TORQUE_BIG_ENDIAN + l = convertLEndianToHost(l); +#endif + rColor.set(convert16To8(l), convert16To8(l), convert16To8(l), 255); + break; + } + case GFXFormatR16F: + { + const U16* v = (U16*)p; + rColor.set( + floatTo8(convertHalfToFloat(v[0])), + 0, + 0, + 255 + ); + break; + } + + // --- 24-bit --- + case GFXFormatR8G8B8: + case GFXFormatR8G8B8_SRGB: + rColor.set(p[0], p[1], p[2], 255); + break; + + // --- 32-bit --- + case GFXFormatR32F: + { + const F32* v = (F32*)p; + rColor.set( + floatTo8(v[0]), // red + 0, // green + 0, // blue + 255 // alpha + ); + break; + } + case GFXFormatR16G16: + { + const U16* v = (U16*)p; +#ifdef TORQUE_BIG_ENDIAN + U16 r = convertLEndianToHost(v[0]); + U16 g = convertLEndianToHost(v[1]); #else - rColor.set( *((U16*)pLoc) >> 11, - (*((U16*)pLoc) >> 6) & 0x1f, - (*((U16*)pLoc) >> 1) & 0x1f, - (*((U16*)pLoc) & 1) ? 255 : 0 ); + U16 r = v[0]; + U16 g = v[1]; +#endif + rColor.set( + convert16To8(r), // red + convert16To8(g), // green + 0, // blue + 255 // alpha + ); + break; + } + case GFXFormatR16G16F: + { + const U16* v = (U16*)p; + rColor.set( + floatTo8(convertHalfToFloat(v[0])), + floatTo8(convertHalfToFloat(v[1])), + 0, + 255 + ); + break; + } + + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8A8_SRGB: + case GFXFormatR8G8B8X8: + rColor.set(p[0], p[1], p[2], (mInternalFormat == GFXFormatR8G8B8X8) ? 255 : p[3]); + break; + + case GFXFormatB8G8R8A8: + rColor.set(p[2], p[1], p[0], p[3]); + break; + + // --- 64-bit --- + case GFXFormatR16G16B16A16: + { + const U16* v = (U16*)p; +#ifdef TORQUE_BIG_ENDIAN + rColor.set( + convert16To8(v[2]), + convert16To8(v[1]), + convert16To8(v[0]), + convert16To8(v[3])); +#else + rColor.set( + convert16To8(v[0]), + convert16To8(v[1]), + convert16To8(v[2]), + convert16To8(v[3])); #endif break; + } - default: + case GFXFormatR16G16B16A16F: + { + const U16* v = (const U16*)p; + rColor.set(floatTo8( + convertHalfToFloat(v[0])), + floatTo8(convertHalfToFloat(v[1])), + floatTo8(convertHalfToFloat(v[2])), + floatTo8(convertHalfToFloat(v[3]))); + break; + } + + // --- 128-bit --- + case GFXFormatR32G32B32A32F: + { + const F32* v = (const F32*)p; + rColor.set( + floatTo8(v[0]), + floatTo8(v[1]), + floatTo8(v[2]), + floatTo8(v[3])); + break; + } + + default: AssertFatal(false, "Bad internal format"); return false; } @@ -820,45 +933,158 @@ bool GBitmap::setColor(const U32 x, const U32 y, const ColorI& rColor) if (x >= mWidth || y >= mHeight) return false; - U8* pLoc = getAddress(x, y); + U8* p = getAddress(x, y); - switch (mInternalFormat) { - case GFXFormatA8: - case GFXFormatL8: - *pLoc = rColor.alpha; + switch (mInternalFormat) + { + + // --- 8-bit --- + case GFXFormatA8: + *p = rColor.alpha; break; - case GFXFormatL16: - dMemcpy(pLoc, &rColor, 2 * sizeof(U8)); - break; - - case GFXFormatR8G8B8: - dMemcpy( pLoc, &rColor, 3 * sizeof( U8 ) ); + case GFXFormatL8: + *p = rColor.red; // L = R channel break; - case GFXFormatR8G8B8A8: - case GFXFormatR8G8B8X8: - dMemcpy( pLoc, &rColor, 4 * sizeof( U8 ) ); + case GFXFormatA4L4: + { + U8 lum = rColor.red / 17; + U8 alp = rColor.alpha / 17; + *p = (alp << 4) | (lum & 0x0F); break; - - case GFXFormatR5G6B5: - #ifdef TORQUE_OS_MAC - 
*((U16*)pLoc) = (rColor.red << 11) | (rColor.green << 5) | (rColor.blue << 0) ; - #else - *((U16*)pLoc) = (rColor.blue << 0) | (rColor.green << 5) | (rColor.red << 11); - #endif + } + + // --- 16-bit --- + case GFXFormatR5G6B5: + { + U16 r = rColor.red * 31 / 255; + U16 g = rColor.green * 63 / 255; + U16 b = rColor.blue * 31 / 255; +#ifdef TORQUE_BIG_ENDIAN + * (U16*)p = (r << 11) | (g << 5) | b; +#else + * (U16*)p = (b) | (g << 5) | (r << 11); +#endif + break; + } + + case GFXFormatR5G5B5A1: + { + U16 r = rColor.red * 31 / 255; + U16 g = rColor.green * 31 / 255; + U16 b = rColor.blue * 31 / 255; + U16 a = (rColor.alpha > 0) ? 1 : 0; +#ifdef TORQUE_BIG_ENDIAN + * (U16*)p = (a << 15) | (b << 10) | (g << 5) | r; +#else + * (U16*)p = (r << 11) | (g << 6) | (b << 1) | a; +#endif + break; + } + + case GFXFormatA8L8: + { + U16 l = rColor.red; + U16 a = rColor.alpha; +#ifdef TORQUE_BIG_ENDIAN + * (U16*)p = (a << 8) | l; +#else + * (U16*)p = (l) | (a << 8); +#endif + break; + } + + case GFXFormatL16: + *(U16*)p = convert8To16(rColor.red); break; - case GFXFormatR5G5B5A1: - #ifdef TORQUE_OS_MAC - *((U16*)pLoc) = (((rColor.alpha>0) ? 1 : 0)<<15) | (rColor.blue << 10) | (rColor.green << 5) | (rColor.red << 0); - #else - *((U16*)pLoc) = (rColor.blue << 1) | (rColor.green << 6) | (rColor.red << 11) | ((rColor.alpha>0) ? 1 : 0); - #endif + case GFXFormatR16F: + { + U16* v = (U16*)p; + v[0] = convertFloatToHalf(rColor.red / 255.f); + break; + } + + // --- 24-bit --- + case GFXFormatR8G8B8: + case GFXFormatR8G8B8_SRGB: + p[0] = rColor.red; + p[1] = rColor.green; + p[2] = rColor.blue; break; - default: - AssertFatal(false, "Bad internal format"); + // --- 32-bit --- + case GFXFormatR32F: + { + F32* v = (F32*)p; + v[0] = rColor.red / 255.f; + break; + } + case GFXFormatR16G16: + { + U16* v = (U16*)p; + v[0] = convert8To16(rColor.red); + v[1] = convert8To16(rColor.green); + break; + } + case GFXFormatR16G16F: + { + U16* v = (U16*)p; + v[0] = convertFloatToHalf(rColor.red / 255.f); + v[1] = convertFloatToHalf(rColor.green / 255.f); + break; + } + case GFXFormatR8G8B8A8: + case GFXFormatR8G8B8A8_SRGB: + case GFXFormatR8G8B8X8: + p[0] = rColor.red; + p[1] = rColor.green; + p[2] = rColor.blue; + p[3] = (mInternalFormat == GFXFormatR8G8B8X8) ? 
255 : rColor.alpha; + break; + + case GFXFormatB8G8R8A8: + p[0] = rColor.blue; + p[1] = rColor.green; + p[2] = rColor.red; + p[3] = rColor.alpha; + break; + + // --- 64-bit --- + case GFXFormatR16G16B16A16: + { + U16* v = (U16*)p; + v[0] = convert8To16(rColor.red); + v[1] = convert8To16(rColor.green); + v[2] = convert8To16(rColor.blue); + v[3] = convert8To16(rColor.alpha); + break; + } + + case GFXFormatR16G16B16A16F: + { + U16* v = (U16*)p; + v[0] = convertFloatToHalf(rColor.red / 255.f); + v[1] = convertFloatToHalf(rColor.green / 255.f); + v[2] = convertFloatToHalf(rColor.blue / 255.f); + v[3] = convertFloatToHalf(rColor.alpha / 255.f); + break; + } + + // --- 128-bit --- + case GFXFormatR32G32B32A32F: + { + F32* v = (F32*)p; + v[0] = rColor.red / 255.f; + v[1] = rColor.green / 255.f; + v[2] = rColor.blue / 255.f; + v[3] = rColor.alpha / 255.f; + break; + } + + default: + AssertFatal(false, "Bad internal format in setColor"); return false; } @@ -870,7 +1096,7 @@ U8 GBitmap::getChanelValueAt(U32 x, U32 y, U32 chan) { ColorI pixelColor = ColorI(255,255,255,255); getColor(x, y, pixelColor); - if (mInternalFormat == GFXFormatL16) + if (mInternalFormat == GFXFormatL16 || mInternalFormat == GFXFormatL8) { chan = 0; } @@ -1124,6 +1350,7 @@ void GBitmap::copyChannel( U32 index, GBitmap *outBitmap ) const bool GBitmap::read(Stream& io_rStream) { + PROFILE_SCOPE(GBitmap_Read); // Handle versioning U32 version; io_rStream.read(&version); @@ -1133,23 +1360,7 @@ bool GBitmap::read(Stream& io_rStream) U32 fmt; io_rStream.read(&fmt); mInternalFormat = GFXFormat(fmt); - mBytesPerPixel = 1; - switch (mInternalFormat) { - case GFXFormatA8: - case GFXFormatL8: mBytesPerPixel = 1; - break; - case GFXFormatR8G8B8: mBytesPerPixel = 3; - break; - case GFXFormatR8G8B8A8: mBytesPerPixel = 4; - break; - case GFXFormatL16: - case GFXFormatR5G6B5: - case GFXFormatR5G5B5A1: mBytesPerPixel = 2; - break; - default: - AssertFatal(false, "GBitmap::read: misunderstood format specifier"); - break; - } + mBytesPerPixel = getFormatBytesPerPixel(mInternalFormat); io_rStream.read(&mByteSize); @@ -1170,6 +1381,7 @@ bool GBitmap::read(Stream& io_rStream) bool GBitmap::write(Stream& io_rStream) const { + PROFILE_SCOPE(GBitmap_Write); // Handle versioning io_rStream.write(csFileVersion); @@ -1266,8 +1478,31 @@ template<> void *Resource::create(const Torque::Path &path) Con::printf( "Resource::create - [%s]", path.getFullPath().c_str() ); #endif + GBitmap* bmp = new GBitmap; FileStream stream; + Torque::Path dbm = path; + dbm.setExtension("dbm"); + if (Torque::FS::IsFile(dbm)) + { + + Torque::FS::FileNodeRef assetFile = Torque::FS::GetFileNode(path); + Torque::FS::FileNodeRef compiledFile = Torque::FS::GetFileNode(dbm); + + if (assetFile != NULL && compiledFile != NULL) + { + if (compiledFile->getModifiedTime() >= assetFile->getModifiedTime()) + { +#ifdef TORQUE_DEBUG_RES_MANAGER + Con::printf("Resource::create - Loading cached image file: %s", dbm.getFullPath().c_str()); +#endif + stream.open(dbm.getFullPath(), Torque::FS::File::Read); + bmp->read(stream); + return bmp; + } + } + } + stream.open( path.getFullPath(), Torque::FS::File::Read ); if ( stream.getStatus() != Stream::Ok ) @@ -1276,7 +1511,6 @@ template<> void *Resource::create(const Torque::Path &path) return NULL; } - GBitmap *bmp = new GBitmap; const String extension = path.getExtension(); if( !bmp->readBitmap( extension, path ) ) { @@ -1411,6 +1645,7 @@ DefineEngineFunction(saveScaledImage, bool, (const char* bitmapSource, const cha "Saving it out to the destination 
path.\n") { bool isDDS = false; + bool isHDR = false; if (bitmapSource == 0 || bitmapSource[0] == '\0' || bitmapDest == 0 || bitmapDest[0] == '\0') { @@ -1429,6 +1664,9 @@ DefineEngineFunction(saveScaledImage, bool, (const char* bitmapSource, const cha { if (String::ToLower(ret) == String(".dds")) isDDS = true; + + if (String::ToLower(ret) == String(".hdr")) + isHDR = true; } else { @@ -1462,6 +1700,8 @@ DefineEngineFunction(saveScaledImage, bool, (const char* bitmapSource, const cha if (isPow2(image->getWidth()) && isPow2(image->getHeight())) image->extrudeMipLevels(); + image->setFormat(GFXFormatR8G8B8A8); + U32 mipCount = image->getNumMipLevels(); U32 targetMips = mFloor(mLog2((F32)(resolutionSize ? resolutionSize : 256))) + 1; diff --git a/Engine/source/gfx/bitmap/gBitmap.h b/Engine/source/gfx/bitmap/gBitmap.h index 558c5ec10..4a32531f9 100644 --- a/Engine/source/gfx/bitmap/gBitmap.h +++ b/Engine/source/gfx/bitmap/gBitmap.h @@ -135,13 +135,15 @@ public: GBitmap(const U32 in_width, const U32 in_height, const bool in_extrudeMipLevels = false, - const GFXFormat in_format = GFXFormatR8G8B8 ); + const GFXFormat in_format = GFXFormatR8G8B8, + const U32 in_numFaces = 1); // This builds a GBitmap with the R8G8B8A8 format using the passed in // data (assumes that there is width * height * 4 U8's in data) GBitmap(const U32 in_width, const U32 in_height, - const U8* data ); + const U8* data, + const U32 in_numFaces = 1); virtual ~GBitmap(); @@ -163,12 +165,14 @@ public: void allocateBitmap(const U32 in_width, const U32 in_height, const bool in_extrudeMipLevels = false, - const GFXFormat in_format = GFXFormatR8G8B8 ); + const GFXFormat in_format = GFXFormatR8G8B8, + const U32 in_numFaces = 1); void allocateBitmapWithMips(const U32 in_width, - const U32 in_height, - const U32 in_numMips, - const GFXFormat in_format = GFXFormatR8G8B8); + const U32 in_height, + const U32 in_numMips, + const GFXFormat in_format = GFXFormatR8G8B8, + const U32 in_numFaces = 1); void extrudeMipLevels(bool clearBorders = false); void chopTopMips(U32 mipsToChop); @@ -191,16 +195,18 @@ public: U32 getWidth(const U32 in_mipLevel = 0) const; U32 getHeight(const U32 in_mipLevel = 0) const; + U32 _getFaceOffset(const U32 face = 0) const; U32 getDepth(const U32 in_mipLevel = 0) const; - U8* getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel = 0); - const U8* getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel = 0) const; + U8* getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel = 0, const U32 face = 0); + const U8* getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel = 0, const U32 face = 0) const; - const U8* getBits(const U32 in_mipLevel = 0) const; - U8* getWritableBits(const U32 in_mipLevel = 0); + const U8* getBits(const U32 in_mipLevel = 0, const U32 face = 0) const; + U8* getWritableBits(const U32 in_mipLevel = 0, const U32 face = 0); U32 getByteSize() const { return mByteSize; } U32 getBytesPerPixel() const { return mBytesPerPixel; } + U32 getFormatBytesPerPixel(GFXFormat fmt); U32 getSurfaceSize(const U32 mipLevel) const; @@ -220,6 +226,7 @@ public: bool getColor(const U32 x, const U32 y, ColorI& rColor) const; bool setColor(const U32 x, const U32 y, const ColorI& rColor); U8 getChanelValueAt(U32 x, U32 y, U32 chan); + U32 getNumFaces() const { return mNumFaces; } /// This method will combine bitmapA and bitmapB using the operation specified /// by combineOp. 
The result will be stored in the bitmap that this method is @@ -275,7 +282,7 @@ public: private: GFXFormat mInternalFormat; - + U32 mNumFaces; // default 1, set to 6 for cubemap U8* mBits; // Master bytes U32 mByteSize; U32 mWidth; @@ -284,6 +291,7 @@ private: U32 mNumMipLevels; U32 mMipLevelOffsets[c_maxMipLevels]; + U32 mFaceOffsets[6]; // Maximum 6 for cubemaps; could also dynamically allocate if needed bool mHasTransparency; @@ -316,32 +324,39 @@ inline U32 GBitmap::getHeight(const U32 in_mipLevel) const return (retVal != 0) ? retVal : 1; } -inline const U8* GBitmap::getBits(const U32 in_mipLevel) const +inline U32 GBitmap::_getFaceOffset(const U32 face) const +{ + AssertFatal(face < mNumFaces, "GBitmap::_getFaceOffset: invalid face index"); + + return mFaceOffsets[face]; +} + +inline const U8* GBitmap::getBits(const U32 in_mipLevel, const U32 face) const { AssertFatal(in_mipLevel < mNumMipLevels, avar("GBitmap::getBits: mip level out of range: (%d, %d)", in_mipLevel, mNumMipLevels)); - return &mBits[mMipLevelOffsets[in_mipLevel]]; + return &mBits[_getFaceOffset(face) + mMipLevelOffsets[in_mipLevel]]; } -inline U8* GBitmap::getWritableBits(const U32 in_mipLevel) +inline U8* GBitmap::getWritableBits(const U32 in_mipLevel, const U32 face) { AssertFatal(in_mipLevel < mNumMipLevels, avar("GBitmap::getWritableBits: mip level out of range: (%d, %d)", in_mipLevel, mNumMipLevels)); - return &mBits[mMipLevelOffsets[in_mipLevel]]; + return &mBits[_getFaceOffset(face) + mMipLevelOffsets[in_mipLevel]]; } -inline U8* GBitmap::getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel) +inline U8* GBitmap::getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel, const U32 face) { - return (getWritableBits(mipLevel) + (U64)(((in_y * getWidth(mipLevel)) + in_x) * mBytesPerPixel)); + return (getWritableBits(mipLevel, face) + (U64)(((in_y * getWidth(mipLevel)) + in_x) * mBytesPerPixel)); } -inline const U8* GBitmap::getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel) const +inline const U8* GBitmap::getAddress(const S32 in_x, const S32 in_y, const U32 mipLevel, const U32 face) const { - return (getBits(mipLevel) + ((in_y * getWidth(mipLevel)) + in_x) * mBytesPerPixel); + return (getBits(mipLevel, face) + ((in_y * getWidth(mipLevel)) + in_x) * mBytesPerPixel); } template diff --git a/Engine/source/gfx/bitmap/loaders/bitmapSTB.cpp b/Engine/source/gfx/bitmap/loaders/bitmapSTB.cpp index a0e8cf9a8..b9006b7e0 100644 --- a/Engine/source/gfx/bitmap/loaders/bitmapSTB.cpp +++ b/Engine/source/gfx/bitmap/loaders/bitmapSTB.cpp @@ -27,6 +27,7 @@ #include "core/stream/memStream.h" #include "core/strings/stringFunctions.h" #include "gfx/bitmap/gBitmap.h" +#include "gfx/bitmap/bitmapUtils.h" #include "gfx/bitmap/imageUtils.h" #include "gfx/bitmap/loaders/ies/ies_loader.h" #include "platform/profiler.h" @@ -41,12 +42,12 @@ #ifndef STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_STATIC -#include "stb_image.h" +#include "gfx/bitmap/loaders/stb/stb_image.h" #endif #define STB_IMAGE_WRITE_IMPLEMENTATION #define STB_IMAGE_WRITE_STATIC -#include "stb_image_write.h" +#include "gfx/bitmap/loaders/stb/stb_image_write.h" #pragma warning(pop) @@ -56,6 +57,38 @@ static bool sReadStreamSTB(Stream& stream, GBitmap* bitmap, U32 len); static bool sWriteSTB(const Torque::Path& path, GBitmap* bitmap, U32 compressionLevel); static bool sWriteStreamSTB(const String& bmType, Stream& stream, GBitmap* bitmap, U32 compressionLevel); +static GFXFormat determineFormat(bool isHDR, bool is16Bit, int 
numChannels) +{ + if (isHDR) + { + // we force hdr to 4 channels. + return GFXFormatR32G32B32A32F; + } + else if (is16Bit) + { + switch (numChannels) + { + case 1: return GFXFormatL16; + case 2: return GFXFormatA8L8; // No native L16A16, but could add one later + case 3: return GFXFormatR16G16B16A16; + case 4: return GFXFormatR16G16B16A16; + } + } + else // 8-bit + { + switch (numChannels) + { + case 1: return GFXFormatA8; + case 2: return GFXFormatA8L8; + case 3: return GFXFormatR8G8B8; + case 4: return GFXFormatR8G8B8A8; + } + } + + // fallback + return GFXFormatR8G8B8A8; +} + // stbi_write callback / rextimmy. static void stbiWriteFunc(void* context, void* data, int size) { @@ -210,119 +243,55 @@ bool sReadSTB(const Torque::Path& path, GBitmap* bitmap) } - if (!stbi_info(path.getFullPath().c_str(), &x, &y, &channels)) - { - const char* stbErr = stbi_failure_reason(); + const char* filePath = path.getFullPath().c_str(); - if (!stbErr) - stbErr = "Unknown Error!"; + // Detect format + bool isHDR = stbi_is_hdr(filePath); + bool is16Bit = stbi_is_16_bit(filePath); - Con::errorf("STB get file info: %s", stbErr); + void* data = nullptr; + + if (isHDR) { + data = stbi_loadf(filePath, &x, &y, &n, 4); } + else if (is16Bit) + data = stbi_load_16(filePath, &x, &y, &n, 0); + else + data = stbi_load(filePath, &x, &y, &n, 0); - // do this to map 2 channels to 4, 2 channel not supported by gbitmap yet.. - if (channels == 2) - channels = 4; - if (!ext.equal("png")) + if (!data) { - if (stbi_is_16_bit(path.getFullPath().c_str())) - { - U16* data = stbi_load_16(path.getFullPath().c_str(), &x, &y, &n, channels); - - // if succesful deal make the bitmap, else try other loaders. - if (data) - { - GFXFormat format; - if (n == 1) - format = GFXFormatL16; - else - format = GFXFormatR16G16B16A16; // not sure if this is correct. - - bitmap->deleteImage(); - - // actually allocate the bitmap space... - bitmap->allocateBitmap(x, y, - false, // don't extrude miplevels... - format); // use determined format... - - U16* pBase = (U16*)bitmap->getBits(); - - U32 rowBytes = bitmap->getByteSize(); - - dMemcpy(pBase, data, rowBytes); - - stbi_image_free(data); - - PROFILE_END(); - return true; - } - } - } - - if (ext.equal("hdr")) - { - // force load to 4 channel. - float* data = stbi_loadf(path.getFullPath().c_str(), &x, &y, &n, 0); - - unsigned char* dataChar = stbi__hdr_to_ldr(data, x, y, n); - bitmap->deleteImage(); - // actually allocate the bitmap space... - bitmap->allocateBitmap(x, y, - false, - GFXFormatR8G8B8); - - U8* pBase = (U8*)bitmap->getBits(); - - U32 rowBytes = x * y * n; - - dMemcpy(pBase, dataChar, rowBytes); - - //stbi_image_free(data); - stbi_image_free(dataChar); - - PROFILE_END(); - return true; - } - - unsigned char* data = stbi_load(path.getFullPath().c_str(), &x, &y, &n, channels); - - bitmap->deleteImage(); - - GFXFormat format; - - switch (channels) { - case 1: - format = GFXFormatA8; - break; - case 2: - format = GFXFormatA8L8; - break; - case 3: - format = GFXFormatR8G8B8; - break; - case 4: - format = GFXFormatR8G8B8A8; - break; - default: PROFILE_END(); + Con::errorf("sReadSTB() - Failed to load %s: %s", filePath, stbi_failure_reason()); return false; } - // actually allocate the bitmap space... - bitmap->allocateBitmap(x, y, - false, // don't extrude miplevels... - format); // use determined format... 
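+   // The pixel data returned by stbi is copied verbatim into the GBitmap
+   // below; the GFXFormat chosen by determineFormat() is assumed to match the
+   // channel count and bit depth stbi actually returned for this image.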
+ // Determine internal GFX format + GFXFormat format = determineFormat(isHDR, is16Bit, n); - U8* pBase = (U8*)bitmap->getBits(); + // Allocate bitmap + bitmap->deleteImage(); + bitmap->allocateBitmap(x, y, false, format); - U32 rowBytes = bitmap->getByteSize(); - - dMemcpy(pBase, data, rowBytes); + //if (isHDR) + //{ + // U16* pBase = (U16*)bitmap->getBits(); + // const size_t totalPixels = (size_t)x * (size_t)y; + // for (size_t i = 0; i < totalPixels * 4; ++i) + // { + // pBase[i] = convertFloatToHalf(reinterpret_cast(data)[i]); // convert F32 -> U16 + // } + //} + //else + //{ + U8* dst = (U8*)bitmap->getBits(); + U32 byteSize = bitmap->getByteSize(); + dMemcpy(dst, data, byteSize); + //} stbi_image_free(data); - // Check this bitmap for transparency - if (channels == 4) - bitmap->checkForTransparency(); + + bitmap->checkForTransparency(); PROFILE_END(); return true; @@ -331,45 +300,36 @@ bool sReadSTB(const Torque::Path& path, GBitmap* bitmap) bool sReadStreamSTB(Stream& stream, GBitmap* bitmap, U32 len) { PROFILE_SCOPE(sReadStreamSTB); - // only used for font at the moment. - U8* data = new U8[len]; - stream.read(len, data); + Vector data(len); + stream.read(len, data.address()); - S32 width, height, comp = 0; + int x, y, n; + bool isHDR = stbi_is_hdr_from_memory(data.address(), len); + bool is16Bit = stbi_is_16_bit_from_memory(data.address(), len); - unsigned char* pixelData = stbi_load_from_memory((const U8*)data, (int)len, &width, &height, &comp, 0); + void* pixels = nullptr; + if (isHDR) + pixels = stbi_loadf_from_memory(data.address(), len, &x, &y, &n, 0); + else if (is16Bit) + pixels = stbi_load_16_from_memory(data.address(), len, &x, &y, &n, 0); + else + pixels = stbi_load_from_memory(data.address(), len, &x, &y, &n, 0); - if (!pixelData) + if (!pixels) { - const char* stbErr = stbi_failure_reason(); - - if (!stbErr) - stbErr = "Unknown Error!"; - - Con::errorf("sReadStreamSTB Error: %s", stbErr); + Con::errorf("sReadStreamSTB() - STB load failed: %s", stbi_failure_reason()); return false; } + + GFXFormat format = determineFormat(isHDR, is16Bit, n); bitmap->deleteImage(); + bitmap->allocateBitmap(x, y, false, format); + dMemcpy(bitmap->getWritableBits(0), pixels, bitmap->getByteSize()); - //work out what format we need to use - todo floating point? 
- GFXFormat fmt = GFXFormat_FIRST; - switch (comp) - { - case 1: fmt = GFXFormatA8; break; - case 2: fmt = GFXFormatA8L8; break; //todo check this - case 3: fmt = GFXFormatR8G8B8; break; - case 4: fmt = GFXFormatR8G8B8A8; break; - } + stbi_image_free(pixels); - bitmap->allocateBitmap(width, height, false, fmt); - - U8* pBase = bitmap->getWritableBits(0); - U32 rowBytes = bitmap->getByteSize(); - dMemcpy(pBase, pixelData, rowBytes); - - dFree(data); - dFree(pixelData); + bitmap->checkForTransparency(); return true; } @@ -401,47 +361,34 @@ bool sWriteSTB(const Torque::Path& path, GBitmap* bitmap, U32 compressionLevel) GFXFormat format = bitmap->getFormat(); String ext = path.getExtension(); - - // we always have at least 1 - U32 comp = 1; - - if (format == GFXFormatR8G8B8) - { - comp = 3; - } - else if (format == GFXFormatR8G8B8A8 || format == GFXFormatR8G8B8X8 || format == GFXFormatR8G8B8A8_LINEAR_FORCE) - { - comp = 4; - } - if (ext.equal("png")) { stbi_write_png_compression_level = compressionLevel; - if (stbi_write_png(path.getFullPath().c_str(), width, height, comp, bitmap->getWritableBits(), 0)) + if (stbi_write_png(path.getFullPath().c_str(), width, height, bytes, bitmap->getWritableBits(), 0)) return true; } if (ext.equal("tga")) { - if (stbi_write_tga(path.getFullPath().c_str(), width, height, comp, bitmap->getWritableBits())) + if (stbi_write_tga(path.getFullPath().c_str(), width, height, bytes, bitmap->getWritableBits())) return true; } if (ext.equal("bmp")) { - if (stbi_write_bmp(path.getFullPath().c_str(), width, height, comp, bitmap->getWritableBits())) + if (stbi_write_bmp(path.getFullPath().c_str(), width, height, bytes, bitmap->getWritableBits())) return true; } if (ext.equal("jpg") || ext.equal("jpeg")) { - if (stbi_write_jpg(path.getFullPath().c_str(), width, height, comp, bitmap->getWritableBits(), compressionLevel)) + if (stbi_write_jpg(path.getFullPath().c_str(), width, height, bytes, bitmap->getWritableBits(), compressionLevel)) return true; } if (ext.equal("hdr")) { - if (stbi_write_hdr(path.getFullPath().c_str(), width, height, comp, (const F32*)bitmap->getWritableBits())) + if (stbi_write_hdr(path.getFullPath().c_str(), width, height, bytes, (const F32*)bitmap->getWritableBits())) return true; } diff --git a/Engine/source/gfx/bitmap/loaders/stb/stb_image.h b/Engine/source/gfx/bitmap/loaders/stb/stb_image.h index 1f44c87b4..9eedabedc 100644 --- a/Engine/source/gfx/bitmap/loaders/stb/stb_image.h +++ b/Engine/source/gfx/bitmap/loaders/stb/stb_image.h @@ -1,4 +1,4 @@ -/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb +/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb no warranty implied; use at your own risk Do this: @@ -48,6 +48,8 @@ LICENSE RECENT REVISION HISTORY: + 2.30 (2024-05-31) avoid erroneous gcc warning + 2.29 (2023-05-xx) optimizations 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes 2.26 (2020-07-13) many minor fixes @@ -1072,8 +1074,8 @@ static int stbi__addints_valid(int a, int b) return a <= INT_MAX - b; } -// returns 1 if the product of two signed shorts is valid, 0 on overflow. -static int stbi__mul2shorts_valid(short a, short b) +// returns 1 if the product of two ints fits in a signed short, 0 on overflow. 
+static int stbi__mul2shorts_valid(int a, int b) { if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid @@ -3384,13 +3386,13 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) return 1; } -static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) { // some JPEGs have junk at end, skip over it but if we find what looks // like a valid marker, resume there while (!stbi__at_eof(j->s)) { - int x = stbi__get8(j->s); - while (x == 255) { // might be a marker + stbi_uc x = stbi__get8(j->s); + while (x == 0xff) { // might be a marker if (stbi__at_eof(j->s)) return STBI__MARKER_none; x = stbi__get8(j->s); if (x != 0x00 && x != 0xff) { @@ -4176,6 +4178,7 @@ typedef struct { stbi_uc *zbuffer, *zbuffer_end; int num_bits; + int hit_zeof_once; stbi__uint32 code_buffer; char *zout; @@ -4242,9 +4245,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) int b,s; if (a->num_bits < 16) { if (stbi__zeof(a)) { - return -1; /* report error for unexpected end of data. */ + if (!a->hit_zeof_once) { + // This is the first time we hit eof, insert 16 extra padding btis + // to allow us to keep going; if we actually consume any of them + // though, that is invalid data. This is caught later. + a->hit_zeof_once = 1; + a->num_bits += 16; // add 16 implicit zero bits + } else { + // We already inserted our extra 16 padding bits and are again + // out, this stream is actually prematurely terminated. + return -1; + } + } else { + stbi__fill_bits(a); } - stbi__fill_bits(a); } b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; if (b) { @@ -4309,6 +4323,13 @@ static int stbi__parse_huffman_block(stbi__zbuf *a) int len,dist; if (z == 256) { a->zout = zout; + if (a->hit_zeof_once && a->num_bits < 16) { + // The first time we hit zeof, we inserted 16 extra zero bits into our bit + // buffer so the decoder can just do its speculative decoding. But if we + // actually consumed any of those bits (which is the case when num_bits < 16), + // the stream actually read past the end so it is malformed. 
+ return stbi__err("unexpected end","Corrupt PNG"); + } return 1; } if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data @@ -4320,7 +4341,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a) dist = stbi__zdist_base[z]; if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); - if (zout + len > a->zout_end) { + if (len > a->zout_end - zout) { if (!stbi__zexpand(a, zout, len)) return 0; zout = a->zout; } @@ -4464,6 +4485,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) if (!stbi__parse_zlib_header(a)) return 0; a->num_bits = 0; a->code_buffer = 0; + a->hit_zeof_once = 0; do { final = stbi__zreceive(a,1); type = stbi__zreceive(a,2); @@ -4619,9 +4641,8 @@ enum { STBI__F_up=2, STBI__F_avg=3, STBI__F_paeth=4, - // synthetic filters used for first scanline to avoid needing a dummy row of 0s - STBI__F_avg_first, - STBI__F_paeth_first + // synthetic filter used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first }; static stbi_uc first_row_filter[5] = @@ -4630,29 +4651,56 @@ static stbi_uc first_row_filter[5] = STBI__F_sub, STBI__F_none, STBI__F_avg_first, - STBI__F_paeth_first + STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub }; static int stbi__paeth(int a, int b, int c) { - int p = a + b - c; - int pa = abs(p-a); - int pb = abs(p-b); - int pc = abs(p-c); - if (pa <= pb && pa <= pc) return a; - if (pb <= pc) return b; - return c; + // This formulation looks very different from the reference in the PNG spec, but is + // actually equivalent and has favorable data dependencies and admits straightforward + // generation of branch-free code, which helps performance significantly. + int thresh = c*3 - (a + b); + int lo = a < b ? a : b; + int hi = a < b ? b : a; + int t0 = (hi <= thresh) ? lo : c; + int t1 = (thresh <= lo) ? hi : t0; + return t1; } static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; +// adds an extra all-255 alpha channel +// dest == src is legal +// img_n must be 1 or 3 +static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n) +{ + int i; + // must process data backwards since we allow dest==src + if (img_n == 1) { + for (i=x-1; i >= 0; --i) { + dest[i*2+1] = 255; + dest[i*2+0] = src[i]; + } + } else { + STBI_ASSERT(img_n == 3); + for (i=x-1; i >= 0; --i) { + dest[i*4+3] = 255; + dest[i*4+2] = src[i*3+2]; + dest[i*4+1] = src[i*3+1]; + dest[i*4+0] = src[i*3+0]; + } + } +} + // create the png data from post-deflated data static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) { - int bytes = (depth == 16? 2 : 1); + int bytes = (depth == 16 ? 2 : 1); stbi__context *s = a->s; stbi__uint32 i,j,stride = x*out_n*bytes; stbi__uint32 img_len, img_width_bytes; + stbi_uc *filter_buf; + int all_ok = 1; int k; int img_n = s->img_n; // copy it into a local for later @@ -4664,8 +4712,11 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into if (!a->out) return stbi__err("outofmem", "Out of memory"); + // note: error exits here don't need to clean up a->out individually, + // stbi__do_png always does on error. 
if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); img_width_bytes = (((img_n * x * depth) + 7) >> 3); + if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG"); img_len = (img_width_bytes + 1) * y; // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, @@ -4673,189 +4724,137 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r // so just check for raw_len < img_len always. if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + // Allocate two scan lines worth of filter workspace buffer. + filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0); + if (!filter_buf) return stbi__err("outofmem", "Out of memory"); + + // Filtering for low-bit-depth images + if (depth < 8) { + filter_bytes = 1; + width = img_width_bytes; + } + for (j=0; j < y; ++j) { - stbi_uc *cur = a->out + stride*j; - stbi_uc *prior; + // cur/prior filter buffers alternate + stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes; + stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes; + stbi_uc *dest = a->out + stride*j; + int nk = width * filter_bytes; int filter = *raw++; - if (filter > 4) - return stbi__err("invalid filter","Corrupt PNG"); - - if (depth < 8) { - if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); - cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place - filter_bytes = 1; - width = img_width_bytes; + // check filter type + if (filter > 4) { + all_ok = stbi__err("invalid filter","Corrupt PNG"); + break; } - prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above // if first row, use special filter that doesn't sample previous row if (j == 0) filter = first_row_filter[filter]; - // handle first byte explicitly - for (k=0; k < filter_bytes; ++k) { - switch (filter) { - case STBI__F_none : cur[k] = raw[k]; break; - case STBI__F_sub : cur[k] = raw[k]; break; - case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; - case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; - case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; - case STBI__F_avg_first : cur[k] = raw[k]; break; - case STBI__F_paeth_first: cur[k] = raw[k]; break; - } + // perform actual filtering + switch (filter) { + case STBI__F_none: + memcpy(cur, raw, nk); + break; + case STBI__F_sub: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); + break; + case STBI__F_up: + for (k = 0; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); + break; + case STBI__F_paeth: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0) + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes])); + break; + case STBI__F_avg_first: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); + break; } - if (depth == 8) { - if (img_n != out_n) - cur[img_n] = 255; // first pixel - 
raw += img_n; - cur += out_n; - prior += out_n; - } else if (depth == 16) { - if (img_n != out_n) { - cur[filter_bytes] = 255; // first pixel top byte - cur[filter_bytes+1] = 255; // first pixel bottom byte - } - raw += filter_bytes; - cur += output_bytes; - prior += output_bytes; - } else { - raw += 1; - cur += 1; - prior += 1; - } + raw += nk; - // this is a little gross, so that we don't switch per-pixel or per-component - if (depth < 8 || img_n == out_n) { - int nk = (width - 1)*filter_bytes; - #define STBI__CASE(f) \ - case f: \ - for (k=0; k < nk; ++k) - switch (filter) { - // "none" filter turns into a memcpy here; make that explicit. - case STBI__F_none: memcpy(cur, raw, nk); break; - STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; - STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; - STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; - STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; - STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break; - } - #undef STBI__CASE - raw += nk; - } else { - STBI_ASSERT(img_n+1 == out_n); - #define STBI__CASE(f) \ - case f: \ - for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ - for (k=0; k < filter_bytes; ++k) - switch (filter) { - STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; - STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; - STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; - STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; - STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; - STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break; - } - #undef STBI__CASE - - // the loop above sets the high byte of the pixels' alpha, but for - // 16 bit png files we also need the low byte set. we'll do that here. - if (depth == 16) { - cur = a->out + stride*j; // start at the beginning of the row again - for (i=0; i < x; ++i,cur+=output_bytes) { - cur[filter_bytes+1] = 255; - } - } - } - } - - // we make a separate pass to expand bits to pixels; for performance, - // this could run two scanlines behind the above code, so it won't - // intefere with filtering but will still be in the cache. - if (depth < 8) { - for (j=0; j < y; ++j) { - stbi_uc *cur = a->out + stride*j; - stbi_uc *in = a->out + stride*j + x*out_n - img_width_bytes; - // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit - // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + // expand decoded bits in cur to dest, also adding an extra alpha channel if desired + if (depth < 8) { stbi_uc scale = (color == 0) ? 
stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + stbi_uc *in = cur; + stbi_uc *out = dest; + stbi_uc inb = 0; + stbi__uint32 nsmp = x*img_n; - // note that the final byte might overshoot and write more data than desired. - // we can allocate enough data that this never writes out of memory, but it - // could also overwrite the next scanline. can it overwrite non-empty data - // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. - // so we need to explicitly clamp the final ones - + // expand bits to bytes first if (depth == 4) { - for (k=x*img_n; k >= 2; k-=2, ++in) { - *cur++ = scale * ((*in >> 4) ); - *cur++ = scale * ((*in ) & 0x0f); + for (i=0; i < nsmp; ++i) { + if ((i & 1) == 0) inb = *in++; + *out++ = scale * (inb >> 4); + inb <<= 4; } - if (k > 0) *cur++ = scale * ((*in >> 4) ); } else if (depth == 2) { - for (k=x*img_n; k >= 4; k-=4, ++in) { - *cur++ = scale * ((*in >> 6) ); - *cur++ = scale * ((*in >> 4) & 0x03); - *cur++ = scale * ((*in >> 2) & 0x03); - *cur++ = scale * ((*in ) & 0x03); + for (i=0; i < nsmp; ++i) { + if ((i & 3) == 0) inb = *in++; + *out++ = scale * (inb >> 6); + inb <<= 2; } - if (k > 0) *cur++ = scale * ((*in >> 6) ); - if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); - if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); - } else if (depth == 1) { - for (k=x*img_n; k >= 8; k-=8, ++in) { - *cur++ = scale * ((*in >> 7) ); - *cur++ = scale * ((*in >> 6) & 0x01); - *cur++ = scale * ((*in >> 5) & 0x01); - *cur++ = scale * ((*in >> 4) & 0x01); - *cur++ = scale * ((*in >> 3) & 0x01); - *cur++ = scale * ((*in >> 2) & 0x01); - *cur++ = scale * ((*in >> 1) & 0x01); - *cur++ = scale * ((*in ) & 0x01); + } else { + STBI_ASSERT(depth == 1); + for (i=0; i < nsmp; ++i) { + if ((i & 7) == 0) inb = *in++; + *out++ = scale * (inb >> 7); + inb <<= 1; } - if (k > 0) *cur++ = scale * ((*in >> 7) ); - if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); - if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); - if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); - if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); - if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); - if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); } - if (img_n != out_n) { - int q; - // insert alpha = 255 - cur = a->out + stride*j; + + // insert alpha=255 values if desired + if (img_n != out_n) + stbi__create_png_alpha_expand8(dest, dest, x, img_n); + } else if (depth == 8) { + if (img_n == out_n) + memcpy(dest, cur, x*img_n); + else + stbi__create_png_alpha_expand8(dest, cur, x, img_n); + } else if (depth == 16) { + // convert the image data from big-endian to platform-native + stbi__uint16 *dest16 = (stbi__uint16*)dest; + stbi__uint32 nsmp = x*img_n; + + if (img_n == out_n) { + for (i = 0; i < nsmp; ++i, ++dest16, cur += 2) + *dest16 = (cur[0] << 8) | cur[1]; + } else { + STBI_ASSERT(img_n+1 == out_n); if (img_n == 1) { - for (q=x-1; q >= 0; --q) { - cur[q*2+1] = 255; - cur[q*2+0] = cur[q]; + for (i = 0; i < x; ++i, dest16 += 2, cur += 2) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = 0xffff; } } else { STBI_ASSERT(img_n == 3); - for (q=x-1; q >= 0; --q) { - cur[q*4+3] = 255; - cur[q*4+2] = cur[q*3+2]; - cur[q*4+1] = cur[q*3+1]; - cur[q*4+0] = cur[q*3+0]; + for (i = 0; i < x; ++i, dest16 += 4, cur += 6) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = (cur[2] << 8) | cur[3]; + dest16[2] = (cur[4] << 8) | cur[5]; + dest16[3] = 0xffff; } } } } - } else if (depth == 16) { - // force the image data from big-endian to platform-native. 
- // this is done in a separate pass due to the decoding relying - // on the data being untouched, but could probably be done - // per-line during decode if care is taken. - stbi_uc *cur = a->out; - stbi__uint16 *cur16 = (stbi__uint16*)cur; - - for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { - *cur16 = (cur[0] << 8) | cur[1]; - } } + STBI_FREE(filter_buf); + if (!all_ok) return 0; + return 1; } @@ -5161,9 +5160,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } if (z->depth == 16) { - for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is } else { - for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + for (k = 0; k < s->img_n && k < 3; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger } } break; @@ -7984,4 +7985,4 @@ AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------ -*/ \ No newline at end of file +*/ diff --git a/Engine/source/gfx/bitmap/loaders/stb/stb_image_resize2.h b/Engine/source/gfx/bitmap/loaders/stb/stb_image_resize2.h index e0c428246..6146ab7ee 100644 --- a/Engine/source/gfx/bitmap/loaders/stb/stb_image_resize2.h +++ b/Engine/source/gfx/bitmap/loaders/stb/stb_image_resize2.h @@ -1,9 +1,9 @@ -/* stb_image_resize2 - v2.01 - public domain image resizing - - by Jeff Roberts (v2) and Jorge L Rodriguez +/* stb_image_resize2 - v2.17 - public domain image resizing + + by Jeff Roberts (v2) and Jorge L Rodriguez http://github.com/nothings/stb - Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only + Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only scaling and translation is supported, no rotations or shears. COMPILING & LINKING @@ -11,35 +11,6 @@ #define STB_IMAGE_RESIZE_IMPLEMENTATION before the #include. That will create the implementation in that file. - PORTING FROM VERSION 1 - - The API has changed. You can continue to use the old version of stb_image_resize.h, - which is available in the "deprecated/" directory. - - If you're using the old simple-to-use API, porting is straightforward. - (For more advanced APIs, read the documentation.) 
- - stbir_resize_uint8(): - - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout` - - stbir_resize_float(): - - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout` - - stbir_resize_uint8_srgb(): - - function name is unchanged - - cast channel count to `stbir_pixel_layout` - - above is sufficient unless your image has alpha and it's not RGBA/BGRA - - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode - - stbir_resize_uint8_srgb_edgemode() - - switch to the "medium complexity" API - - stbir_resize(), very similar API but a few more parameters: - - pixel_layout: cast channel count to `stbir_pixel_layout` - - data_type: STBIR_TYPE_UINT8_SRGB - - edge: unchanged (STBIR_EDGE_WRAP, etc.) - - filter: STBIR_FILTER_DEFAULT - - which channel is alpha is specified in stbir_pixel_layout, see enum for details - EASY API CALLS: Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge. @@ -67,60 +38,60 @@ ADDITIONAL DOCUMENTATION MEMORY ALLOCATION - By default, we use malloc and free for memory allocation. To override the + By default, we use malloc and free for memory allocation. To override the memory allocation, before the implementation #include, add a: #define STBIR_MALLOC(size,user_data) ... #define STBIR_FREE(ptr,user_data) ... - Each resize makes exactly one call to malloc/free (unless you use the + Each resize makes exactly one call to malloc/free (unless you use the extended API where you can do one allocation for many resizes). Under address sanitizer, we do separate allocations to find overread/writes. PERFORMANCE This library was written with an emphasis on performance. When testing - stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with + stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize - libs do by default). Also, make sure SIMD is turned on of course (default + libs do by default). Also, make sure SIMD is turned on of course (default for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed. This library also comes with profiling built-in. If you define STBIR_PROFILE, - you can use the advanced API and get low-level profiling information by + you can use the advanced API and get low-level profiling information by calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info() after a resize. SIMD - Most of the routines have optimized SSE2, AVX, NEON and WASM versions. + Most of the routines have optimized SSE2, AVX, NEON and WASM versions. - On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and - ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or + On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and + ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX - or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 + or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2. On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit, we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled on the command line. 
For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both - clang and GCC, but GCC also requires an additional -mfp16-format=ieee to + clang and GCC, but GCC also requires an additional -mfp16-format=ieee to automatically enable NEON. On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions for converting back and forth to half-floats. This is autoselected when we - are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses - the built-in half float hardware NEON instructions. + are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses + the built-in half float hardware NEON instructions. - You can also tell us to use multiply-add instructions with STBIR_USE_FMA. + You can also tell us to use multiply-add instructions with STBIR_USE_FMA. Because x86 doesn't always have fma, we turn it off by default to maintain determinism across all platforms. If you don't care about non-FMA determinism - and are willing to restrict yourself to more recent x86 CPUs (around the AVX + and are willing to restrict yourself to more recent x86 CPUs (around the AVX timeframe), then fma will give you around a 15% speedup. You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10% to 40% faster, and AVX2 is generally another 12%. - + ALPHA CHANNEL - Most of the resizing functions provide the ability to control how the alpha + Most of the resizing functions provide the ability to control how the alpha channel of an image is processed. When alpha represents transparency, it is important that when combining @@ -167,33 +138,33 @@ stb_image_resize expects case #1 by default, applying alpha weighting to images, expecting the input images to be unpremultiplied. This is what the - COLOR+ALPHA buffer types tell the resizer to do. + COLOR+ALPHA buffer types tell the resizer to do. - When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, - STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are - non-premultiplied. In these cases, the resizer will alpha weight the colors - (effectively creating the premultiplied image), do the filtering, and then + When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, + STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are + non-premultiplied. In these cases, the resizer will alpha weight the colors + (effectively creating the premultiplied image), do the filtering, and then convert back to non-premult on exit. When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM, - STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels - ARE premultiplied. In this case, the resizer doesn't have to do the - premultipling - it can filter directly on the input. This about twice as - fast as the non-premultiplied case, so it's the right option if your data is + STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels + ARE premultiplied. In this case, the resizer doesn't have to do the + premultipling - it can filter directly on the input. This about twice as + fast as the non-premultiplied case, so it's the right option if your data is already setup correctly. - When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are - telling us that there is no channel that represents transparency; it may be - RGB and some unrelated fourth channel that has been stored in the alpha - channel, but it is actually not alpha. 
No special processing will be - performed. + When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are + telling us that there is no channel that represents transparency; it may be + RGB and some unrelated fourth channel that has been stored in the alpha + channel, but it is actually not alpha. No special processing will be + performed. - The difference between the generic 4 or 2 channel layouts, and the + The difference between the generic 4 or 2 channel layouts, and the specialized _PM versions is with the _PM versions you are telling us that the data *is* alpha, just don't premultiply it. That's important when using SRGB pixel formats, we need to know where the alpha is, because it is converted linearly (rather than with the SRGB converters). - + Because alpha weighting produces the same effect as premultiplying, you even have the option with non-premultiplied inputs to let the resizer produce a premultiplied output. Because the intially computed alpha-weighted @@ -201,10 +172,10 @@ than the normal path which un-premultiplies the output image as a final step. Finally, when converting both in and out of non-premulitplied space (for - example, when using STBIR_RGBA), we go to somewhat heroic measures to - ensure that areas with zero alpha value pixels get something reasonable - in the RGB values. If you don't care about the RGB values of zero alpha - pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() + example, when using STBIR_RGBA), we go to somewhat heroic measures to + ensure that areas with zero alpha value pixels get something reasonable + in the RGB values. If you don't care about the RGB values of zero alpha + pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() function - this runs a premultiplied resize about 25% faster. That said, when you really care about speed, using premultiplied pixels for both in and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied @@ -218,38 +189,38 @@ layouts with the same number of channels. DETERMINISM - We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). - This requires compiling with fast-math off (using at least /fp:precise). + We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). + This requires compiling with fast-math off (using at least /fp:precise). Also, you must turn off fp-contracting (which turns mult+adds into fmas)! - We attempt to do this with pragmas, but with Clang, you usually want to add + We attempt to do this with pragmas, but with Clang, you usually want to add -ffp-contract=off to the command line as well. - For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, - if the scalar x87 unit gets used at all, we immediately lose determinism. + For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, + if the scalar x87 unit gets used at all, we immediately lose determinism. On Microsoft Visual Studio 2008 and earlier, from what we can tell there is - no way to be deterministic in 32-bit x86 (some x87 always leaks in, even - with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and + no way to be deterministic in 32-bit x86 (some x87 always leaks in, even + with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and -fpmath=sse. Note that we will not be deterministic with float data containing NaNs - - the NaNs will propagate differently on different SIMD and platforms. + the NaNs will propagate differently on different SIMD and platforms. 
- If you turn on STBIR_USE_FMA, then we will be deterministic with other - fma targets, but we will differ from non-fma targets (this is unavoidable, - because a fma isn't simply an add with a mult - it also introduces a - rounding difference compared to non-fma instruction sequences. + If you turn on STBIR_USE_FMA, then we will be deterministic with other + fma targets, but we will differ from non-fma targets (this is unavoidable, + because a fma isn't simply an add with a mult - it also introduces a + rounding difference compared to non-fma instruction sequences. FLOAT PIXEL FORMAT RANGE - Any range of values can be used for the non-alpha float data that you pass - in (0 to 1, -1 to 1, whatever). However, if you are inputting float values - but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we - scale back properly. The alpha channel must also be 0 to 1 for any format - that does premultiplication prior to resizing. + Any range of values can be used for the non-alpha float data that you pass + in (0 to 1, -1 to 1, whatever). However, if you are inputting float values + but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we + scale back properly. The alpha channel must also be 0 to 1 for any format + that does premultiplication prior to resizing. - Note also that with float output, using filters with negative lobes, the - output filtered values might go slightly out of range. You can define - STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range - to clamp to on output, if that's important. + Note also that with float output, using filters with negative lobes, the + output filtered values might go slightly out of range. You can define + STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range + to clamp to on output, if that's important. MAX/MIN SCALE FACTORS The input pixel resolutions are in integers, and we do the internal pointer @@ -263,13 +234,13 @@ buffers). FLIPPED IMAGES - Stride is just the delta from one scanline to the next. This means you can - use a negative stride to handle inverted images (point to the final + Stride is just the delta from one scanline to the next. This means you can + use a negative stride to handle inverted images (point to the final scanline and use a negative stride). You can invert the input or output, using negative strides. DEFAULT FILTERS - For functions which don't provide explicit control over what filters to + For functions which don't provide explicit control over what filters to use, you can change the compile-time defaults with: #define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_something @@ -278,24 +249,52 @@ See stbir_filter in the header-file section for the list of filters. NEW FILTERS - A number of 1D filter kernels are supplied. For a list of supported - filters, see the stbir_filter enum. You can install your own filters by + A number of 1D filter kernels are supplied. For a list of supported + filters, see the stbir_filter enum. You can install your own filters by using the stbir_set_filter_callbacks function. PROGRESS - For interactive use with slow resize operations, you can use the the - scanline callbacks in the extended API. It would have to be a *very* large + For interactive use with slow resize operations, you can use the + scanline callbacks in the extended API. It would have to be a *very* large image resample to need progress though - we're very fast. 
CEIL and FLOOR - In scalar mode, the only functions we use from math.h are ceilf and floorf, - but if you have your own versions, you can define the STBIR_CEILF(v) and + In scalar mode, the only functions we use from math.h are ceilf and floorf, + but if you have your own versions, you can define the STBIR_CEILF(v) and STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use our own versions. ASSERT Define STBIR_ASSERT(boolval) to override assert() and not use assert.h + PORTING FROM VERSION 1 + The API has changed. You can continue to use the old version of stb_image_resize.h, + which is available in the "deprecated/" directory. + + If you're using the old simple-to-use API, porting is straightforward. + (For more advanced APIs, read the documentation.) + + stbir_resize_uint8(): + - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout` + + stbir_resize_float(): + - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout` + + stbir_resize_uint8_srgb(): + - function name is unchanged + - cast channel count to `stbir_pixel_layout` + - above is sufficient unless your image has alpha and it's not RGBA/BGRA + - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode + + stbir_resize_uint8_srgb_edgemode() + - switch to the "medium complexity" API + - stbir_resize(), very similar API but a few more parameters: + - pixel_layout: cast channel count to `stbir_pixel_layout` + - data_type: STBIR_TYPE_UINT8_SRGB + - edge: unchanged (STBIR_EDGE_WRAP, etc.) + - filter: STBIR_FILTER_DEFAULT + - which channel is alpha is specified in stbir_pixel_layout, see enum for details + FUTURE TODOS * For polyphase integral filters, we just memcpy the coeffs to dupe them, but we should indirect and use the same coeff memory. @@ -304,10 +303,12 @@ * For SIMD encode and decode scanline routines, do any pre-aligning for bad input/output buffer alignments and pitch? * For very wide scanlines, we should we do vertical strips to stay within - L2 cache. Maybe do chunks of 1K pixels at a time. There would be + L2 cache. Maybe do chunks of 1K pixels at a time. There would be some pixel reconversion, but probably dwarfed by things falling out of cache. Probably also something possible with alternating between scattering and gathering at high resize scales? + * Should we have a multiple MIPs at the same time function (could keep + more memory in cache during multiple resizes)? * Rewrite the coefficient generator to do many at once. * AVX-512 vertical kernels - worried about downclocking here. * Convert the reincludes to macros when we know they aren't changing. @@ -316,21 +317,56 @@ the pivot cost and the extra memory touches). Need to buffer the whole image so have to balance memory use. * Most of our code is internally function pointers, should we compile - all the SIMD stuff always and dynamically dispatch? + all the SIMD stuff always and dynamically dispatch? CONTRIBUTORS Jeff Roberts: 2.0 implementation, optimizations, SIMD - Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer. 
+ Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer Fabian Giesen: half float and srgb converters Sean Barrett: API design, optimizations Jorge L Rodriguez: Original 1.0 implementation - Aras Pranckevicius: bugfixes for 1.0 + Aras Pranckevicius: bugfixes Nathan Reed: warning fixes for 1.0 REVISIONS - 2.00 (2022-02-20) mostly new source: new api, optimizations, simd, vertical-first, etc - (2x-5x faster without simd, 4x-12x faster with simd) - (in some cases, 20x to 40x faster - resizing to very small for example) + 2.17 (2025-10-25) silly format bug in easy-to-use APIs. + 2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative + strides), fix vertical filter kernel callback, fix threaded + gather buffer priming (and assert). + (thanks adipose, TainZerL, and Harrison Green) + 2.15 (2025-07-17) fixed an assert in debug mode when using floats with input + callbacks, work around GCC warning when adding to null ptr + (thanks Johannes Spohr and Pyry Kovanen). + 2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and + scatter with vertical first. + 2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for + tiny-c, fixed some variables that should have been static, + fixes a bug when calculating temp memory with resizes that + exceed 2GB of temp memory (very large resizes). + 2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE + 2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode + with AVX-2, fix some weird scaling edge conditions with + point sample mode. + 2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control, + fix MSVC 32-bit arm half float routines. + 2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting + hardware half floats). + 2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks + to Ryan Salsbury), fix for sub-rect resizes, use the + pragmas to control unrolling when they are available. + 2.07 (2024-05-24) fix for slow final split during threaded conversions of very + wide scanlines when downsampling (caused by extra input + converting), fix for wide scanline resamples with many + splits (int overflow), fix GCC warning. + 2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling + undersampling a single row on rare resize ratios (about 1%). + 2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras), + fix for output callback (thanks Julien Koenen). + 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic). + 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks. + 2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc + 2x-5x faster without simd, 4x-12x faster with simd, + in some cases, 20x to 40x faster esp resizing large to very small. 
0.96 (2019-03-04) fixed warnings 0.95 (2017-07-23) fixed warnings 0.94 (2017-03-18) fixed warnings @@ -362,62 +398,6 @@ typedef uint32_t stbir_uint32; typedef uint64_t stbir_uint64; #endif -#ifdef _M_IX86_FP -#if ( _M_IX86_FP >= 1 ) -#ifndef STBIR_SSE -#define STBIR_SSE -#endif -#endif -#endif - -#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2) - #ifndef STBIR_SSE2 - #define STBIR_SSE2 - #endif - #if defined(__AVX__) || defined(STBIR_AVX2) - #ifndef STBIR_AVX - #ifndef STBIR_NO_AVX - #define STBIR_AVX - #endif - #endif - #endif - #if defined(__AVX2__) || defined(STBIR_AVX2) - #ifndef STBIR_NO_AVX2 - #ifndef STBIR_AVX2 - #define STBIR_AVX2 - #endif - #if defined( _MSC_VER ) && !defined(__clang__) - #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c - #define STBIR_FP16C - #endif - #endif - #endif - #endif - #ifdef __F16C__ - #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc) - #define STBIR_FP16C - #endif - #endif -#endif - -#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 && __ARM_FP16_FORMAT_IEEE != 0 -#ifndef STBIR_NEON -#define STBIR_NEON -#endif -#endif - -#if defined(_M_ARM) -#ifdef STBIR_USE_FMA -#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC -#endif -#endif - -#if defined(__wasm__) && defined(__wasm_simd128__) -#ifndef STBIR_WASM -#define STBIR_WASM -#endif -#endif - #ifndef STBIRDEF #ifdef STB_IMAGE_RESIZE_STATIC #define STBIRDEF static @@ -435,7 +415,7 @@ typedef uint64_t stbir_uint64; // // Easy-to-use API: // -// * stride is the offset between successive rows of image data +// * stride is the offset between successive rows of image data // in memory, in bytes. specify 0 for packed continuously in memory // * colorspace is linear or sRGB as specified by function name // * Uses the default filters @@ -448,27 +428,35 @@ typedef uint64_t stbir_uint64; // order of channels // whether color is premultiplied by alpha // for back compatibility, you can cast the old channel count to an stbir_pixel_layout -typedef enum +typedef enum { - STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping) - STBIR_1CHANNEL = 1, + STBIR_1CHANNEL = 1, STBIR_2CHANNEL = 2, - STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping) - STBIR_RGBA = 4, // alpha formats, alpha is NOT premultiplied into color channels - + STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping) + STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping) STBIR_4CHANNEL = 5, + + STBIR_RGBA = 4, // alpha formats, where alpha is NOT premultiplied into color channels STBIR_BGRA = 6, STBIR_ARGB = 7, STBIR_ABGR = 8, STBIR_RA = 9, STBIR_AR = 10, - STBIR_RGBA_PM = 11, // alpha formats, alpha is premultiplied into color channels + STBIR_RGBA_PM = 11, // alpha formats, where alpha is premultiplied into color channels STBIR_BGRA_PM = 12, STBIR_ARGB_PM = 13, STBIR_ABGR_PM = 14, STBIR_RA_PM = 15, STBIR_AR_PM = 16, + + STBIR_RGBA_NO_AW = 11, // alpha formats, where NO alpha weighting is applied at all! + STBIR_BGRA_NO_AW = 12, // these are just synonyms for the _PM flags (which also do + STBIR_ARGB_NO_AW = 13, // no alpha weighting). These names just make it more clear + STBIR_ABGR_NO_AW = 14, // for some folks). 
+ STBIR_RA_NO_AW = 15, + STBIR_AR_NO_AW = 16, + } stbir_pixel_layout; //=============================================================== @@ -549,8 +537,8 @@ STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int inpu // * Separate input and output data types // * Can specify regions with subpixel correctness // * Can specify alpha flags -// * Can specify a memory callback -// * Can specify a callback data type for pixel input and output +// * Can specify a memory callback +// * Can specify a callback data type for pixel input and output // * Can be threaded for a single resize // * Can be used to resize many frames without recalculating the sampler info // @@ -577,7 +565,7 @@ typedef float stbir__kernel_callback( float x, float scale, void * user_data ); typedef float stbir__support_callback( float scale, void * user_data ); // internal structure with precomputed scaling -typedef struct stbir__info stbir__info; +typedef struct stbir__info stbir__info; typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_override functions to set these values for future compatibility { @@ -604,7 +592,7 @@ typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_override fun stbir_edge horizontal_edge, vertical_edge; stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support; stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support; - stbir__info * samplers; + stbir__info * samplers; } STBIR_RESIZE; // extended complexity api @@ -620,7 +608,7 @@ STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, // You can update these parameters any time after resize_init and there is no cost //-------------------------------- -STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type ); +STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type ); STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ); // no callbacks by default STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ); // pass back STBIR_RESIZE* by default STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes ); @@ -636,7 +624,7 @@ STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ); // CLAMP by default STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default -STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); +STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets both sub-regions (full regions by default) STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double 
t0, double s1, double t1 ); // sets input sub-region (full region by default) @@ -658,7 +646,7 @@ STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, i //-------------------------------- // This builds the samplers and does one allocation -STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); +STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); // You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize ); @@ -681,7 +669,7 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ); // It returns the number of splits (threads) that you can call it with. /// It might be less if the image resize can't be split up that many ways. -STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits ); +STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits ); // This function does a split of the resizing (you call this fuction for each // split, on multiple threads). A split is a piece of the output resize pixel space. @@ -691,10 +679,10 @@ STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_sp // Usually, you will always call stbir_resize_split with split_start as the thread_index // and "1" for the split_count. // But, if you have a weird situation where you MIGHT want 8 threads, but sometimes -// only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the +// only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the // split_count each time to turn in into a 4 thread resize. (This is unusual). -STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count ); +STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count ); //=============================================================== @@ -705,10 +693,10 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start // The input callback is super flexible - it calls you with the input address // (based on the stride and base pointer), it gives you an optional_output // pointer that you can fill, or you can just return your own pointer into -// your own data. +// your own data. // -// You can also do conversion from non-supported data types if necessary - in -// this case, you ignore the input_ptr and just use the x and y parameters to +// You can also do conversion from non-supported data types if necessary - in +// this case, you ignore the input_ptr and just use the x and y parameters to // calculate your own input_ptr based on the size of each non-supported pixel. // (Something like the third example below.) 
// @@ -722,14 +710,14 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start // return input_ptr; // use buffer from call // } // -// Next example, copying: (copy from some other buffer or stream): +// Next example, copying: (copy from some other buffer or stream): // void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ) // { // CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes ); // return optional_output; // return the optional buffer that we filled // } // -// Third example, input another buffer without copying: (zero-copy from other buffer): +// Third example, input another buffer without copying: (zero-copy from other buffer): // void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ) // { // void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes ); @@ -758,7 +746,7 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start #ifdef STBIR_PROFILE -typedef struct STBIR_PROFILE_INFO +typedef struct STBIR_PROFILE_INFO { stbir_uint64 total_clocks; @@ -766,7 +754,7 @@ typedef struct STBIR_PROFILE_INFO // there are "resize_count" number of zones stbir_uint64 clocks[ 8 ]; char const ** descriptions; - + // count of clocks and descriptions stbir_uint32 count; } STBIR_PROFILE_INFO; @@ -865,15 +853,15 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, ST #endif // the internal pixel layout enums are in a different order, so we can easily do range comparisons of types -// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible -typedef enum +// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible +typedef enum { STBIRI_1CHANNEL = 0, STBIRI_2CHANNEL = 1, STBIRI_RGB = 2, STBIRI_BGR = 3, STBIRI_4CHANNEL = 4, - + STBIRI_RGBA = 5, STBIRI_BGRA = 6, STBIRI_ARGB = 7, @@ -979,7 +967,7 @@ typedef struct stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP } stbir__extents; -typedef struct +typedef struct { #ifdef STBIR_PROFILE union @@ -1008,9 +996,9 @@ typedef struct char no_cache_straddle[64]; } stbir__per_split_info; -typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); +typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels ); -typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, +typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ); typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels ); typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode ); @@ -1053,10 +1041,10 @@ struct stbir__info stbir__horizontal_gather_channels_func * horizontal_gather_channels; stbir__alpha_unweight_func * alpha_unweight; stbir__encode_pixels_func * encode_pixels; - - int alloced_total; + + int 
alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated int splits; // count of splits - + stbir_internal_pixel_layout input_pixel_layout_internal; stbir_internal_pixel_layout output_pixel_layout_internal; @@ -1065,21 +1053,22 @@ struct stbir__info int vertical_first; int channels; int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3) - int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated + size_t alloced_total; }; #define stbir__max_uint8_as_float 255.0f #define stbir__max_uint16_as_float 65535.0f -#define stbir__max_uint8_as_float_inverted (1.0f/255.0f) -#define stbir__max_uint16_as_float_inverted (1.0f/65535.0f) +#define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f) +#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f) #define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) // min/max friendly -#define STBIR_CLAMP(x, xmin, xmax) do { \ +#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \ if ( (x) < (xmin) ) (x) = (xmin); \ if ( (x) > (xmax) ) (x) = (xmax); \ -} while (0) + break; \ +} static stbir__inline int stbir__min(int a, int b) { @@ -1141,7 +1130,7 @@ static const stbir_uint32 fp32_to_srgb8_tab4[104] = { 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723, }; - + static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) { static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps @@ -1172,19 +1161,107 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win. #endif -// restrict pointers for the output pointers +#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS +#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split? 
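// ---------------------------------------------------------------------------
// Editor's note - illustrative sketch only, not upstream stb code and not part
// of this patch's functional changes. The split/threading workflow documented
// in the extended API section above is normally driven as below; the buffers,
// thread count and run_on_threads() helper are hypothetical placeholders. Note
// that with STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS at its default of 4, the
// returned split count can be lower than the requested try_splits for short
// output images.
//
//    STBIR_RESIZE resize;
//    stbir_resize_init( &resize, src_pixels, src_w, src_h, 0,   // stride 0 = tightly packed
//                                dst_pixels, dst_w, dst_h, 0,
//                                STBIR_RGBA, STBIR_TYPE_UINT8 );
//
//    int splits = stbir_build_samplers_with_splits( &resize, num_worker_threads );
//    if ( splits )
//    {
//       // each worker i in [0,splits) resizes one band of the output:
//       //    stbir_resize_extended_split( &resize, i, 1 );
//       run_on_threads( splits, worker );
//       stbir_free_samplers( &resize );   // required after stbir_build_samplers_with_splits
//    }
// ---------------------------------------------------------------------------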
+#endif + +#define STBIR_INPUT_CALLBACK_PADDING 3 + +#ifdef _M_IX86_FP +#if ( _M_IX86_FP >= 1 ) +#ifndef STBIR_SSE +#define STBIR_SSE +#endif +#endif +#endif + +#ifdef __TINYC__ + // tiny c has no intrinsics yet - this can become a version check if they add them + #define STBIR_NO_SIMD +#endif + +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2) + #ifndef STBIR_SSE2 + #define STBIR_SSE2 + #endif + #if defined(__AVX__) || defined(STBIR_AVX2) + #ifndef STBIR_AVX + #ifndef STBIR_NO_AVX + #define STBIR_AVX + #endif + #endif + #endif + #if defined(__AVX2__) || defined(STBIR_AVX2) + #ifndef STBIR_NO_AVX2 + #ifndef STBIR_AVX2 + #define STBIR_AVX2 + #endif + #if defined( _MSC_VER ) && !defined(__clang__) + #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c + #define STBIR_FP16C + #endif + #endif + #endif + #endif + #ifdef __F16C__ + #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc) + #define STBIR_FP16C + #endif + #endif +#endif + +#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__) +#ifndef STBIR_NEON +#define STBIR_NEON +#endif +#endif + +#if defined(_M_ARM) || defined(__arm__) +#ifdef STBIR_USE_FMA +#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC +#endif +#endif + +#if defined(__wasm__) && defined(__wasm_simd128__) +#ifndef STBIR_WASM +#define STBIR_WASM +#endif +#endif + +// restrict pointers for the output pointers, other loop and unroll control #if defined( _MSC_VER ) && !defined(__clang__) #define STBIR_STREAMOUT_PTR( star ) star __restrict #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop -#elif defined( __clang__ ) - #define STBIR_STREAMOUT_PTR( star ) star __restrict__ - #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) -#elif defined( __GNUC__ ) + #if _MSC_VER >= 1900 + #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __clang__ ) + #define STBIR_STREAMOUT_PTR( star ) star __restrict__ + #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) ) + #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __GNUC__ ) #define STBIR_STREAMOUT_PTR( star ) star __restrict__ #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if __GNUC__ >= 14 + #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif + #define STBIR_NO_UNROLL_LOOP_START_INF_FOR #else #define STBIR_STREAMOUT_PTR( star ) star #define STBIR_NO_UNROLL( ptr ) + #define STBIR_NO_UNROLL_LOOP_START +#endif + +#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR +#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START #endif #ifdef STBIR_NO_SIMD // force simd off for whatever reason @@ -1223,7 +1300,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #ifdef STBIR_SSE2 #include - + #define stbir__simdf __m128 #define stbir__simdi __m128i @@ -1254,7 +1331,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdi_store2( ptr, reg ) 
_mm_storel_epi64( (__m128i*)(ptr), (reg) ) #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 ) - + #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \ { \ stbir__simdi zero = _mm_setzero_si128(); \ @@ -1285,7 +1362,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps())))) #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))) - #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) + #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg ) #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 ) #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 ) @@ -1397,8 +1474,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #include #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))) #else - STBIR__SIMDI_CONST(stbir__s32_32768, 32768); - STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); + static STBIR__SIMDI_CONST(stbir__s32_32768, 32768); + static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); #define stbir__simdf_pack_to_8words(out,reg0,reg1) \ { \ @@ -1440,10 +1517,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg ) #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f) - + #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) ) #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) ) - + #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1) #ifdef STBIR_AVX2 @@ -1471,8 +1548,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \ } - #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); - + #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); + #define stbir__simdf8_pack_to_16words(out,aa,bb) \ { \ stbir__simdf8 af,bf; \ @@ -1496,7 +1573,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) a = _mm_unpackhi_epi8( ireg, zero ); \ out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \ } - + #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \ { \ stbir__simdi t; \ @@ -1514,7 +1591,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) t = _mm_packus_epi16( t, t ); \ out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \ } - + #define 
stbir__simdi8_expand_u16_to_u32(out,ireg) \ { \ stbir__simdi a,b,zero = _mm_setzero_si128(); \ @@ -1549,7 +1626,6 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 ) - #define stbir__simdf8_load2( out, ptr ) (out) = _mm256_castsi256_ps(_mm256_castsi128_si256( _mm_loadl_epi64( (__m128i*)(ptr)) )) // top values can be random (not denormal or nan for perf) #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) ) static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) }; @@ -1582,11 +1658,11 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add ) #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add ) - #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( _mm256_castps128_ps256( mul ), _mm256_castps128_ps256( _mm_loadu_ps( (float const*)(ptr) ) ), add ) + #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add ) #else #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) ) #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) ) - #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_castps128_ps256( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) ) ) + #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) ) #endif #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val ) @@ -1627,7 +1703,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) } #elif defined(STBIR_NEON) - + #include #define stbir__simdf float32x4_t @@ -1686,7 +1762,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) ) #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0) - #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) + #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0)) #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0)) #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) ) @@ -1737,12 +1813,20 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \ vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \ ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | 
((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) ) + + static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb) + { + uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) }; + return r; + } #else #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3} + #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}} #endif #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) ) - + #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) ) + #define stbir__simdi_16madd( out, reg0, reg1 ) \ { \ int16x8_t r0 = vreinterpretq_s16_u32(reg0); \ @@ -1942,7 +2026,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdf_convert_float_to_i32( i, f ) (i) = wasm_i32x4_trunc_sat_f32x4(f) #define stbir__simdf_convert_float_to_int( f ) wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0) - #define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0) + #define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0) #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0)) #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0)) #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg) @@ -2125,7 +2209,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #endif -#if defined(STBIR_NEON) && !defined(_M_ARM) +#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__) #if defined( _MSC_VER ) && !defined(__clang__) typedef __int16 stbir__FP16; @@ -2142,7 +2226,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #endif -#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM) +#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__)) // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668 @@ -2168,7 +2252,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) unsigned int sign_mask = 0x80000000u; stbir__FP16 o = { 0 }; stbir__FP32 f; - unsigned int sign; + unsigned int sign; f.f = val; sign = f.u & sign_mask; @@ -2369,24 +2453,6 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) stbir__simdi_store( output,final ); } -#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang - - static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) - { - for (int i=0; i<8; i++) - { - output[i] = stbir__half_to_float(input[i]); - } - } - - static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) - { - for (int i=0; i<8; i++) - { - output[i] = stbir__float_to_half(input[i]); - } - } - #elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang) 
static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) @@ -2415,7 +2481,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0]; } -#elif defined(STBIR_NEON) // 64-bit ARM +#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) { @@ -2441,6 +2507,23 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0); } +#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang + + static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) + { + for (int i=0; i<8; i++) + { + output[i] = stbir__half_to_float(input[i]); + } + } + static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) + { + for (int i=0; i<8; i++) + { + output[i] = stbir__float_to_half(input[i]); + } + } + #endif @@ -2462,10 +2545,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) #define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 ) #define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 ) #define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 ) -#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) -#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) -#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) -#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) +#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) +#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) +#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) +#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) typedef union stbir__simdi_u32 { @@ -2493,14 +2576,16 @@ static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000); // Adding this switch saves about 5K on clang which is Captain Unroll the 3rd. 
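// ---------------------------------------------------------------------------
// Editor's note - illustrative sketch only, not upstream stb code. Whichever
// of the half-float paths above gets selected (FP16C, NEON, or the scalar
// fallback), the same names are available: stbir__float_to_half /
// stbir__half_to_float convert one value, and the _SIMD variants convert
// eight values per call, e.g.:
//
//    float       in[8] = { 0.0f, 0.25f, 1.0f, -2.0f, 65504.0f, 1e-4f, 3.5f, -0.5f };
//    stbir__FP16 h[8];
//    float       out[8];
//    stbir__float_to_half_SIMD( h, in );    // eight float -> half conversions
//    stbir__half_to_float_SIMD( out, h );   // eight half -> float conversions
// ---------------------------------------------------------------------------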
#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) #define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START +#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR #ifdef STBIR_MEMCPY #undef STBIR_MEMCPY -#define STBIR_MEMCPY stbir_simd_memcpy #endif +#define STBIR_MEMCPY stbir_simd_memcpy // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies) -static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) +static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) { char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest; char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes; @@ -2513,8 +2598,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) { if ( bytes < 16 ) { - if ( bytes ) + if ( bytes ) { + STBIR_SIMD_NO_UNROLL_LOOP_START do { STBIR_SIMD_NO_UNROLL(d); @@ -2529,8 +2615,9 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) // do one unaligned to get us aligned for the stream out below stbir__simdf_load( x, ( d + ofs_to_src ) ); stbir__simdf_store( d, x ); - d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 ); + d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 ); + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { STBIR_SIMD_NO_UNROLL(d); @@ -2561,12 +2648,13 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 ); stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 ); stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 ); - d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) ); + d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) ); + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { STBIR_SIMD_NO_UNROLL(d); - + if ( d > ( d_end - (16*stbir__simdfX_float_count) ) ) { if ( d == d_end ) @@ -2590,7 +2678,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) // memcpy that is specically intentionally overlapping (src is smaller then dest, so can be // a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to // the diff between dest and src) -static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) +static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) { char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src; char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes; @@ -2599,6 +2687,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away? 
{ char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15); + STBIR_SIMD_NO_UNROLL_LOOP_START do { stbir__simdf x; @@ -2615,7 +2704,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte do { STBIR_SIMD_NO_UNROLL(sd); - *(int*)( sd + ofs_to_dest ) = *(int*) sd; + *(int*)( sd + ofs_to_dest ) = *(int*) sd; sd += 4; } while ( sd < s_end ); } @@ -2624,13 +2713,17 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte // when in scalar mode, we let unrolling happen, so this macro just does the __restrict #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) -#define STBIR_SIMD_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START +#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR #endif // SSE2 #ifdef STBIR_PROFILE +#ifndef STBIR_PROFILE_FUNC + #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ ) #ifdef _MSC_VER @@ -2640,7 +2733,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #else // non msvc - static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() + static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() { stbir_uint32 lo, hi; asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) ); @@ -2649,7 +2742,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #endif // msvc -#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) +#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) #if defined( _MSC_VER ) && !defined(__clang__) @@ -2670,8 +2763,9 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #error Unknown platform for profiling. 
-#endif //x64 and +#endif // x64, arm +#endif // STBIR_PROFILE_FUNC #define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info #define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info @@ -2680,7 +2774,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info // super light-weight micro profiler -#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; +#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; #define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; } #define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;iprofile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh ); #define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;iprofile.array);i++) (info)[extra].profile.array[i]=0; } } @@ -2710,8 +2804,8 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #define STBIR_PROFILE_FIRST_START( wh ) #define STBIR_PROFILE_CLEAR_EXTRAS( ) -#define STBIR_PROFILE_BUILD_START( wh ) -#define STBIR_PROFILE_BUILD_END( wh ) +#define STBIR_PROFILE_BUILD_START( wh ) +#define STBIR_PROFILE_BUILD_END( wh ) #define STBIR_PROFILE_BUILD_FIRST_START( wh ) #define STBIR_PROFILE_BUILD_CLEAR( info ) @@ -2736,10 +2830,10 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte #ifndef STBIR_SIMD -// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be +// memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be // a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to // the diff between dest and src) -static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) +static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) { char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src; char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes; @@ -2748,10 +2842,11 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away? 
{ char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7); + STBIR_NO_UNROLL_LOOP_START do { STBIR_NO_UNROLL(sd); - *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; + *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; sd += 8; } while ( sd < s_end8 ); @@ -2759,10 +2854,11 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte return; } + STBIR_NO_UNROLL_LOOP_START do { STBIR_NO_UNROLL(sd); - *(int*)( sd + ofs_to_dest ) = *(int*) sd; + *(int*)( sd + ofs_to_dest ) = *(int*) sd; sd += 4; } while ( sd < s_end ); } @@ -2863,13 +2959,6 @@ static float stbir__filter_mitchell(float x, float s, void * user_data) return (0.0f); } -static float stbir__support_zero(float s, void * user_data) -{ - STBIR__UNUSED(s); - STBIR__UNUSED(user_data); - return 0; -} - static float stbir__support_zeropoint5(float s, void * user_data) { STBIR__UNUSED(s); @@ -2884,7 +2973,7 @@ static float stbir__support_one(float s, void * user_data) return 1; } -static float stbir__support_two(float s, void * user_data) +static float stbir__support_two(float s, void * user_data) { STBIR__UNUSED(s); STBIR__UNUSED(user_data); @@ -2903,7 +2992,7 @@ static int stbir__get_filter_pixel_width(stbir__support_callback * support, floa return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale); } -// this is how many coefficents per run of the filter (which is different +// this is how many coefficents per run of the filter (which is different // from the filter_pixel_width depending on if we are scattering or gathering) static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data) { @@ -2924,7 +3013,7 @@ static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, vo } } -static int stbir__get_contributors(stbir__sampler * samp, int is_gather) +static int stbir__get_contributors(stbir__sampler * samp, int is_gather) { if (is_gather) return samp->scale_info.output_sub_size; @@ -2954,7 +3043,7 @@ static int stbir__edge_reflect_full( int n, int max ) { if (n < 0) { - if (n > -max) + if (n > -max) return -n; else return max - 1; @@ -3056,7 +3145,7 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline left_margin = -min_n; min_n = 0; } - + right_margin = 0; if ( max_n >= input_full_size ) { @@ -3081,7 +3170,7 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline // don't have to do edge calc for zero clamp if ( edge == STBIR_EDGE_ZERO ) return; - + // convert margin pixels to the pixels within the input (min and max) for( j = -left_margin ; j < 0 ; j++ ) { @@ -3151,10 +3240,9 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline newspan->n0 = -left_margin; newspan->n1 = ( max_left - min_left ) - left_margin; scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin - return; } - // if we can't merge the min_left range, add it as a second range + else if ( ( right_margin ) && ( min_right != 0x7fffffff ) ) { stbir__span * newspan = scanline_extents->spans + 1; @@ -3169,7 +3257,14 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline newspan->n0 = scanline_extents->spans[1].n1 + 1; newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right ); scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin - return; + } + + // sort the spans into write output order + if ( ( 
scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) ) + { + stbir__span tspan = scanline_extents->spans[0]; + scanline_extents->spans[0] = scanline_extents->spans[1]; + scanline_extents->spans[1] = tspan; } } @@ -3179,20 +3274,21 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius; float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius; - float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; - float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; + float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; + float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f)); last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f)); + if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross if ( edge == STBIR_EDGE_WRAP ) { - if ( first <= -input_size ) - first = -(input_size-1); + if ( first < -input_size ) + first = -input_size; if ( last >= (input_size*2)) last = (input_size*2) - 1; } - + *first_pixel = first; *last_pixel = last; } @@ -3213,12 +3309,17 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_ int i; int last_non_zero; float out_pixel_center = (float)n + 0.5f; - float in_center_of_out = (out_pixel_center + out_shift) * inv_scale; + float in_center_of_out = (out_pixel_center + out_shift) * inv_scale; int in_first_pixel, in_last_pixel; - + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge ); + // make sure we never generate a range larger than our precalculated coeff width + // this only happens in point sample mode, but it's a good safe thing to do anyway + if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width ) + in_last_pixel = in_first_pixel + coefficient_width - 1; + last_non_zero = -1; for (i = 0; i <= in_last_pixel - in_first_pixel; i++) { @@ -3229,7 +3330,7 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_ if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) ) { if ( i == 0 ) // if we're at the front, just eat zero contributors - { + { STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib ++in_first_pixel; i--; @@ -3239,10 +3340,10 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_ } else last_non_zero = i; - + coefficient_group[i] = coeff; } - + in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros contributors->n0 = in_first_pixel; contributors->n1 = in_last_pixel; @@ -3254,19 +3355,22 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_ } } -static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff ) +static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width ) { if ( new_pixel <= contribs->n1 ) // before the end { if ( new_pixel < contribs->n0 ) // before the front? 
{ - int j, o = contribs->n0 - new_pixel; - for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- ) - coeffs[ j + o ] = coeffs[ j ]; - for ( j = 1 ; j < o ; j-- ) - coeffs[ j ] = coeffs[ 0 ]; - coeffs[ 0 ] = new_coeff; - contribs->n0 = new_pixel; + if ( ( contribs->n1 - new_pixel + 1 ) <= max_width ) + { + int j, o = contribs->n0 - new_pixel; + for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- ) + coeffs[ j + o ] = coeffs[ j ]; + for ( j = 1 ; j < o ; j-- ) + coeffs[ j ] = coeffs[ 0 ]; + coeffs[ 0 ] = new_coeff; + contribs->n0 = new_pixel; + } } else { @@ -3275,12 +3379,15 @@ static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, } else { - int j, e = new_pixel - contribs->n0; - for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any - coeffs[j] = 0; + if ( ( new_pixel - contribs->n0 + 1 ) <= max_width ) + { + int j, e = new_pixel - contribs->n0; + for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any + coeffs[j] = 0; - coeffs[ e ] = new_coeff; - contribs->n1 = new_pixel; + coeffs[ e ] = new_coeff; + contribs->n1 = new_pixel; + } } } @@ -3354,7 +3461,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int stbir__contributors * contribs = contributors + out; // is this the first time this output pixel has been seen? Init it. - if ( out > first_out_inited ) + if ( out > first_out_inited ) { STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time first_out_inited = out; @@ -3362,7 +3469,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int contribs->n1 = in_pixel; coeffs[0] = coeff; } - else + else { // insert on end (always in order) if ( coeffs[0] == 0.0f ) // if the first coefficent is zero, then zap it for this coeffs @@ -3379,10 +3486,16 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int } } +#ifdef STBIR_RENORMALIZE_IN_FLOAT +#define STBIR_RENORM_TYPE float +#else +#define STBIR_RENORM_TYPE double +#endif + static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width ) { int input_size = scale_info->input_full_size; - int input_last_n1 = input_size - 1; + int input_last_n1 = input_size - 1; int n, end; int lowest = 0x7fffffff; int highest = -0x7fffffff; @@ -3400,14 +3513,14 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter for (n = 0; n < end; n++) { int i; - float filter_scale, total_filter = 0; + STBIR_RENORM_TYPE filter_scale, total_filter = 0; int e; // add all contribs e = contribs->n1 - contribs->n0; for( i = 0 ; i <= e ; i++ ) { - total_filter += coeffs[i]; + total_filter += (STBIR_RENORM_TYPE) coeffs[i]; STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights } @@ -3423,10 +3536,11 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter // if the total isn't 1.0, rescale everything if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) ) { - filter_scale = 1.0f / total_filter; + filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter; + // scale them all for (i = 0; i <= e; i++) - coeffs[i] *= filter_scale; + coeffs[i] = (float) ( coeffs[i] * filter_scale ); } } ++contribs; @@ -3452,6 +3566,7 @@ static void 
stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter coeffs = coefficient_group; contribs = contributors; + for (n = 0; n < num_contributors; n++) { int i; @@ -3483,15 +3598,15 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) ) { // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight - + // right hand side first if ( contribs->n1 > input_last_n1 ) { int start = contribs->n0; int endi = contribs->n1; - contribs->n1 = input_last_n1; + contribs->n1 = input_last_n1; for( i = input_size; i <= endi; i++ ) - stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] ); + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width ); } // now check left hand edge @@ -3500,20 +3615,20 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter int save_n0; float save_n0_coeff; float * c = coeffs - ( contribs->n0 + 1 ); - + // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist) - for( i = -1 ; i > contribs->n0 ; i-- ) - stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- ); + for( i = -1 ; i > contribs->n0 ; i-- ) + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width ); save_n0 = contribs->n0; save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)! // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib - contribs->n0 = 0; + contribs->n0 = 0; for(i = 0 ; i <= contribs->n1 ; i++ ) coeffs[i] = coeffs[i-save_n0]; - + // now that we have shrunk down the contribs, we insert the first one safely - stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff ); + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width ); } } @@ -3522,6 +3637,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter int diff = contribs->n1 - contribs->n0 + 1; while ( diff && ( coeffs[ diff-1 ] == 0.0f ) ) --diff; + contribs->n1 = contribs->n0 + diff - 1; if ( contribs->n0 <= contribs->n1 ) @@ -3547,7 +3663,9 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter filter_info->widest = widest; } -static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width ) +#undef STBIR_RENORM_TYPE + +static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) { #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; } #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; } @@ -3556,6 +3674,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* #else #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; } 
#endif + + int row_end = row1 + 1; + STBIR__UNUSED( row0 ); // only used in an assert + if ( coefficient_width != widest ) { float * pc = coefficents; @@ -3564,6 +3686,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* switch( widest ) { case 1: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_1( pc, coeffs ); ++pc; @@ -3571,6 +3694,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 2: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_2( pc, coeffs ); pc += 2; @@ -3578,6 +3702,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 3: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_2( pc, coeffs ); STBIR_MOVE_1( pc+2, coeffs+2 ); @@ -3586,6 +3711,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 4: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); pc += 4; @@ -3593,6 +3719,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 5: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_1( pc+4, coeffs+4 ); @@ -3601,6 +3728,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 6: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_2( pc+4, coeffs+4 ); @@ -3609,6 +3737,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 7: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_2( pc+4, coeffs+4 ); @@ -3618,6 +3747,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 8: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3626,6 +3756,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 9: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3635,6 +3766,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 10: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3644,6 +3776,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 11: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3654,6 +3787,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; case 12: + STBIR_NO_UNROLL_LOOP_START do { STBIR_MOVE_4( pc, coeffs ); STBIR_MOVE_4( pc+4, coeffs+4 ); @@ -3663,6 +3797,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* } while ( pc < pc_end ); break; default: + STBIR_NO_UNROLL_LOOP_START do { float * copy_end = pc + widest - 4; float * c = coeffs; @@ -3673,6 +3808,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* c += 4; } while ( pc <= copy_end ); copy_end += 4; + STBIR_NO_UNROLL_LOOP_START while ( pc < copy_end ) { STBIR_MOVE_1( pc, c ); @@ -3688,7 +3824,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* coefficents[ widest * num_contributors ] = 8888.0f; // the minimum we might read for 
unrolled filters widths is 12. So, we need to - // make sure we never read outside the decode buffer, by possibly moving + // make sure we never read outside the decode buffer, by possibly moving // the sample area back into the scanline, and putting zeros weights first. // we start on the right edge and check until we're well past the possible // clip area (2*widest). @@ -3697,13 +3833,13 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* float * coeffs = coefficents + widest * ( num_contributors - 1 ); // go until no chance of clipping (this is usually less than 8 lops) - while ( ( ( contribs->n0 + widest*2 ) >= row_width ) && ( contribs >= contributors ) ) + while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) ) { // might we clip?? - if ( ( contribs->n0 + widest ) > row_width ) + if ( ( contribs->n0 + widest ) > row_end ) { int stop_range = widest; - + // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length // of this contrib n1, instead of a fixed widest amount - so calculate this if ( widest > 12 ) @@ -3712,22 +3848,22 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* // how far will be read in the n_coeff loop (which depends on the widest count mod4); mod = widest & 3; - stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; + stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; // the n_coeff loops do a minimum amount of coeffs, so factor that in! if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod; } // now see if we still clip with the refined range - if ( ( contribs->n0 + stop_range ) > row_width ) + if ( ( contribs->n0 + stop_range ) > row_end ) { - int new_n0 = row_width - stop_range; + int new_n0 = row_end - stop_range; int num = contribs->n1 - contribs->n0 + 1; int backup = contribs->n0 - new_n0; float * from_co = coeffs + num - 1; float * to_co = from_co + backup; - STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) ); + STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) ); // move the coeffs over while( num ) @@ -3746,7 +3882,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors* // how far will be read in the n_coeff loop (which depends on the widest count mod4); mod = widest & 3; - stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; + stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; // the n_coeff loops do a minimum amount of coeffs, so factor that in! 
if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod; @@ -3774,7 +3910,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot int input_full_size = samp->scale_info.input_full_size; int gather_num_contributors = samp->num_contributors; stbir__contributors* gather_contributors = samp->contributors; - float * gather_coeffs = samp->coefficients; + float * gather_coeffs = samp->coefficients; int gather_coefficient_width = samp->coefficient_width; switch ( samp->is_gather ) @@ -3792,16 +3928,16 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot break; case 0: // scatter downsample (only on vertical) - case 2: // gather downsample + case 2: // gather downsample { float in_pixels_radius = support(scale,user_data) * inv_scale; int filter_pixel_margin = samp->filter_pixel_margin; int input_end = input_full_size + filter_pixel_margin; - + // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after if ( !samp->is_gather ) { - // check if we are using the same gather downsample on the horizontal as this vertical, + // check if we are using the same gather downsample on the horizontal as this vertical, // if so, then we don't have to generate them, we can just pivot from the horizontal. if ( other_axis_for_pivot ) { @@ -3846,30 +3982,37 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width; float * g_coeffs = gather_coeffs; scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin ); - + for (k = gn0 ; k <= gn1 ; k++ ) { float gc = *g_coeffs++; - if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) ) + + // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width + // (which happens when pivoting from horizontal, which might have dummy zeros) + if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) ) { + if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) ) { - // if we are skipping over several contributors, we need to clear the skipped ones - stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); - while ( clear_contributors < scatter_contributors ) { - clear_contributors->n0 = 0; - clear_contributors->n1 = -1; - ++clear_contributors; + // if we are skipping over several contributors, we need to clear the skipped ones + stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); + while ( clear_contributors < scatter_contributors ) + { + clear_contributors->n0 = 0; + clear_contributors->n1 = -1; + ++clear_contributors; + } } + scatter_contributors->n0 = n; + scatter_contributors->n1 = n; + scatter_coeffs[0] = gc; + highest_set = k; } - scatter_contributors->n0 = n; - scatter_contributors->n1 = n; - scatter_coeffs[0] = gc; - highest_set = k; - } - else - { - stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc ); + else + { + stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width ); + } + STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width ); } ++scatter_contributors; scatter_coeffs += scatter_coefficient_width; @@ -3908,11 +4051,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot #define stbir__decode_suffix BGRA 
#define stbir__decode_swizzle -#define stbir__decode_order0 2 +#define stbir__decode_order0 2 #define stbir__decode_order1 1 #define stbir__decode_order2 0 #define stbir__decode_order3 3 -#define stbir__encode_order0 2 +#define stbir__encode_order0 2 #define stbir__encode_order1 1 #define stbir__encode_order2 0 #define stbir__encode_order3 3 @@ -3922,11 +4065,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot #define stbir__decode_suffix ARGB #define stbir__decode_swizzle -#define stbir__decode_order0 1 +#define stbir__decode_order0 1 #define stbir__decode_order1 2 #define stbir__decode_order2 3 #define stbir__decode_order3 0 -#define stbir__encode_order0 3 +#define stbir__encode_order0 3 #define stbir__encode_order1 0 #define stbir__encode_order2 1 #define stbir__encode_order3 2 @@ -3936,11 +4079,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot #define stbir__decode_suffix ABGR #define stbir__decode_swizzle -#define stbir__decode_order0 3 +#define stbir__decode_order0 3 #define stbir__decode_order1 2 #define stbir__decode_order2 1 #define stbir__decode_order3 0 -#define stbir__encode_order0 3 +#define stbir__encode_order0 3 #define stbir__encode_order1 2 #define stbir__encode_order2 1 #define stbir__encode_order3 0 @@ -3950,12 +4093,12 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot #define stbir__decode_suffix AR #define stbir__decode_swizzle -#define stbir__decode_order0 1 -#define stbir__decode_order1 0 +#define stbir__decode_order0 1 +#define stbir__decode_order1 0 #define stbir__decode_order2 3 #define stbir__decode_order3 2 -#define stbir__encode_order0 1 -#define stbir__encode_order1 0 +#define stbir__encode_order0 1 +#define stbir__encode_order1 0 #define stbir__encode_order2 3 #define stbir__encode_order3 2 #define stbir__coder_min_num 2 @@ -3973,9 +4116,10 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c // fancy alpha is stored internally as R G B A Rpm Gpm Bpm #ifdef STBIR_SIMD - + #ifdef STBIR_SIMD8 decode += 16; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdf8 d0,d1,a0,a1,p0,p1; @@ -3998,8 +4142,9 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c out += 28; } decode -= 16; - #else + #else decode += 8; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdf d0,a0,d1,a1,p0,p1; @@ -4022,12 +4167,14 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c // might be one last odd pixel #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START while ( decode < end_decode ) #else if ( decode < end_decode ) #endif { stbir__simdf d,a,p; + STBIR_NO_UNROLL(decode); stbir__simdf_load( d, decode ); stbir__simdf_0123to3333( a, d ); stbir__simdf_mult( p, a, d ); @@ -4069,6 +4216,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c decode += 8; if ( decode <= end_decode ) { + STBIR_NO_UNROLL_LOOP_START do { #ifdef STBIR_SIMD8 stbir__simdf8 d0,a0,p0; @@ -4077,11 +4225,11 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c stbir__simdf8_0123to11331133( p0, d0 ); stbir__simdf8_0123to00220022( a0, d0 ); stbir__simdf8_mult( p0, p0, a0 ); - + stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) ); stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) ); stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) ); - + stbir__simdf_store2( out+6, 
stbir__simdf8_gettop4( d0 ) ); stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) ); stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) ); @@ -4112,6 +4260,7 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c decode -= 8; #endif + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode < end_decode ) { float x = decode[0], y = decode[1]; @@ -4132,6 +4281,7 @@ static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_ti // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm + STBIR_SIMD_NO_UNROLL_LOOP_START do { float alpha = input[3]; #ifdef STBIR_SIMD @@ -4199,6 +4349,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim #ifdef STBIR_SIMD { decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdfX d0,a0,d1,a1; @@ -4217,6 +4368,7 @@ static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_tim // few last pixels remnants #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START while ( decode < end_decode ) #else if ( decode < end_decode ) @@ -4252,6 +4404,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim #ifdef STBIR_SIMD decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START while ( decode <= end_decode ) { stbir__simdfX d0,a0,d1,a1; @@ -4269,6 +4422,7 @@ static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_tim decode -= 2 * stbir__simdfX_float_count; #endif + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode < end_decode ) { float alpha = decode[1]; @@ -4283,6 +4437,7 @@ static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_t float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; float const * end_output = encode_buffer + width_times_channels; + STBIR_SIMD_NO_UNROLL_LOOP_START do { float alpha = encode[3]; @@ -4330,9 +4485,77 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann float STBIR_STREAMOUT_PTR(*) decode = decode_buffer; float const * end_decode = decode_buffer + width_times_channels; - decode += 12; +#ifdef STBIR_SIMD + #ifdef stbir__simdf_swiz2 // do we have two argument swizzles? 
+ end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // on arm64 8 instructions, no overlapping stores + stbir__simdf a,b,c,na,nb; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+4 ); + stbir__simdf_load( c, decode+8 ); + + na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 ); + b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 ); + nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 ); + c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 ); + + stbir__simdf_store( decode, na ); + stbir__simdf_store( decode+4, nb ); + stbir__simdf_store( decode+8, c ); + decode += 12; + } + end_decode += 12; + #else + end_decode -= 24; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // 26 instructions on x64 + stbir__simdf a,b,c,d,e,f,g; + float i21, i23; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+3 ); + stbir__simdf_load( c, decode+6 ); + stbir__simdf_load( d, decode+9 ); + stbir__simdf_load( e, decode+12 ); + stbir__simdf_load( f, decode+15 ); + stbir__simdf_load( g, decode+18 ); + + a = stbir__simdf_swiz( a, 2, 1, 0, 3 ); + b = stbir__simdf_swiz( b, 2, 1, 0, 3 ); + c = stbir__simdf_swiz( c, 2, 1, 0, 3 ); + d = stbir__simdf_swiz( d, 2, 1, 0, 3 ); + e = stbir__simdf_swiz( e, 2, 1, 0, 3 ); + f = stbir__simdf_swiz( f, 2, 1, 0, 3 ); + g = stbir__simdf_swiz( g, 2, 1, 0, 3 ); + + // stores overlap, need to be in order, + stbir__simdf_store( decode, a ); + i21 = decode[21]; + stbir__simdf_store( decode+3, b ); + i23 = decode[23]; + stbir__simdf_store( decode+6, c ); + stbir__simdf_store( decode+9, d ); + stbir__simdf_store( decode+12, e ); + stbir__simdf_store( decode+15, f ); + stbir__simdf_store( decode+18, g ); + decode[21] = i23; + decode[23] = i21; + decode += 24; + } + end_decode += 24; + #endif +#else + end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START while( decode <= end_decode ) { + // 16 instructions float t0,t1,t2,t3; STBIR_NO_UNROLL(decode); t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9]; @@ -4340,8 +4563,10 @@ static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_chann decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3; decode += 12; } - decode -= 12; + end_decode += 12; +#endif + STBIR_NO_UNROLL_LOOP_START while( decode < end_decode ) { float t = decode[0]; @@ -4362,14 +4587,15 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir_edge edge_horizontal = stbir_info->horizontal.edge; stbir_edge edge_vertical = stbir_info->vertical.edge; int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size); - const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes; + const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes; stbir__span const * spans = stbir_info->scanline_extents.spans; - float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; + float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; + float * last_decoded = 0; // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) ); - do + do { float * decode_buffer; void const * input_data; @@ -4377,7 +4603,7 @@ 
static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float int width_times_channels; int width; - if ( spans->n1 < spans->n0 ) + if ( spans->n1 < spans->n0 ) break; width = spans->n1 + 1 - spans->n0; @@ -4392,12 +4618,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float if ( stbir_info->in_pixels_cb ) { // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself - input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data ); + input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? ( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data ); } - + STBIR_PROFILE_START( decode ); // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer ) - stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data ); + last_decoded = stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data ); STBIR_PROFILE_END( decode ); if (stbir_info->alpha_weight) @@ -4418,7 +4644,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float // this code only runs if we're in edge_wrap, and we're doing the entire scanline int e, start_x[2]; int input_full_size = stbir_info->horizontal.scale_info.input_full_size; - + start_x[0] = -stbir_info->scanline_extents.edge_sizes[0]; // left edge start x start_x[1] = input_full_size; // right edge @@ -4432,9 +4658,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float float * marg = full_decode_buffer + x * effective_channels; float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels; STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) ); + if ( e == 1 ) last_decoded = marg + margin * effective_channels; } } } + + // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in + // (we can't pre-zero it, because the input callback can use that area as padding) + last_decoded[0] = 0.0f; + + // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width + // when this happens, we do read that pixel from the input, so it too could be Nan, so just zero an extra one.
+ // this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING) + last_decoded[1] = 0.0f; } @@ -4447,7 +4683,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf tot,c; \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1( c, hc ); \ - stbir__simdf_mult1_mem( tot, c, decode ); + stbir__simdf_mult1_mem( tot, c, decode ); #define stbir__2_coeff_only() \ stbir__simdf tot,c,d; \ @@ -4456,7 +4692,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_load2( d, decode ); \ stbir__simdf_mult( tot, c, d ); \ stbir__simdf_0123to1230( c, tot ); \ - stbir__simdf_add1( tot, tot, c ); + stbir__simdf_add1( tot, tot, c ); #define stbir__3_coeff_only() \ stbir__simdf tot,c,t; \ @@ -4466,7 +4702,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to1230( c, tot ); \ stbir__simdf_0123to2301( t, tot ); \ stbir__simdf_add1( tot, tot, c ); \ - stbir__simdf_add1( tot, tot, t ); + stbir__simdf_add1( tot, tot, t ); #define stbir__store_output_tiny() \ stbir__simdf_store1( output, tot ); \ @@ -4483,7 +4719,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load( c, hc + (ofs) ); \ - stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); + stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); #define stbir__1_coeff_remnant( ofs ) \ { stbir__simdf d; \ @@ -4495,7 +4731,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float { stbir__simdf d; \ stbir__simdf_load2z( c, hc+(ofs) ); \ stbir__simdf_load2( d, decode+(ofs) ); \ - stbir__simdf_madd( tot, tot, d, c ); } + stbir__simdf_madd( tot, tot, d, c ); } #define stbir__3_coeff_setup() \ stbir__simdf mask; \ @@ -4520,18 +4756,18 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float #define stbir__1_coeff_only() \ float tot; \ - tot = decode[0]*hc[0]; + tot = decode[0]*hc[0]; #define stbir__2_coeff_only() \ float tot; \ tot = decode[0] * hc[0]; \ - tot += decode[1] * hc[1]; + tot += decode[1] * hc[1]; #define stbir__3_coeff_only() \ float tot; \ tot = decode[0] * hc[0]; \ tot += decode[1] * hc[1]; \ - tot += decode[2] * hc[2]; + tot += decode[2] * hc[2]; #define stbir__store_output_tiny() \ output[0] = tot; \ @@ -4544,16 +4780,16 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float tot0 = decode[0] * hc[0]; \ tot1 = decode[1] * hc[1]; \ tot2 = decode[2] * hc[2]; \ - tot3 = decode[3] * hc[3]; + tot3 = decode[3] * hc[3]; #define stbir__4_coeff_continue_from_4( ofs ) \ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \ tot2 += decode[2+(ofs)] * hc[2+(ofs)]; \ - tot3 += decode[3+(ofs)] * hc[3+(ofs)]; + tot3 += decode[3+(ofs)] * hc[3+(ofs)]; #define stbir__1_coeff_remnant( ofs ) \ - tot0 += decode[0+(ofs)] * hc[0+(ofs)]; + tot0 += decode[0+(ofs)] * hc[0+(ofs)]; #define stbir__2_coeff_remnant( ofs ) \ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ @@ -4562,7 +4798,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float #define stbir__3_coeff_remnant( ofs ) \ tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \ - tot2 += decode[2+(ofs)] * hc[2+(ofs)]; + tot2 += decode[2+(ofs)] * hc[2+(ofs)]; #define stbir__store_output() \ output[0] = (tot0+tot2)+(tot1+tot3); \ @@ -4570,7 +4806,7 @@ static void 
stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 1; -#endif +#endif #define STBIR__horizontal_channels 1 #define STB_IMAGE_RESIZE_DO_HORIZONTALS @@ -4588,14 +4824,14 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_load1z( c, hc ); \ stbir__simdf_0123to0011( c, c ); \ stbir__simdf_load2( d, decode ); \ - stbir__simdf_mult( tot, d, c ); + stbir__simdf_mult( tot, d, c ); #define stbir__2_coeff_only() \ stbir__simdf tot,c; \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load2( c, hc ); \ stbir__simdf_0123to0011( c, c ); \ - stbir__simdf_mult_mem( tot, c, decode ); + stbir__simdf_mult_mem( tot, c, decode ); #define stbir__3_coeff_only() \ stbir__simdf tot,c,cs,d; \ @@ -4605,7 +4841,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_mult_mem( tot, c, decode ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_load2z( d, decode+4 ); \ - stbir__simdf_madd( tot, tot, d, c ); + stbir__simdf_madd( tot, tot, d, c ); #define stbir__store_output_tiny() \ stbir__simdf_0123to2301( c, tot ); \ @@ -4628,15 +4864,16 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load4b( cs, hc + (ofs) ); \ stbir__simdf8_0123to00112233( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); #define stbir__1_coeff_remnant( ofs ) \ - { stbir__simdf t; \ + { stbir__simdf t,d; \ stbir__simdf_load1z( t, hc + (ofs) ); \ + stbir__simdf_load2( d, decode + (ofs) * 2 ); \ stbir__simdf_0123to0011( t, t ); \ - stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \ + stbir__simdf_mult( t, t, d ); \ stbir__simdf8_add4( tot0, tot0, t ); } - + #define stbir__2_coeff_remnant( ofs ) \ { stbir__simdf t; \ stbir__simdf_load2( t, hc + (ofs) ); \ @@ -4649,13 +4886,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_load4b( cs, hc + (ofs) ); \ stbir__simdf8_0123to00112233( c, cs ); \ stbir__simdf8_load6z( d, decode+(ofs)*2 ); \ - stbir__simdf8_madd( tot0, tot0, c, d ); } + stbir__simdf8_madd( tot0, tot0, c, d ); } #define stbir__store_output() \ - { stbir__simdf t,c; \ + { stbir__simdf t,d; \ stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \ - stbir__simdf_0123to2301( c, t ); \ - stbir__simdf_add( t, t, c ); \ + stbir__simdf_0123to2301( d, t ); \ + stbir__simdf_add( t, t, d ); \ stbir__simdf_store2( output, t ); \ horizontal_coefficients += coefficient_width; \ ++horizontal_contributors; \ @@ -4670,7 +4907,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to0011( c, cs ); \ stbir__simdf_mult_mem( tot0, c, decode ); \ stbir__simdf_0123to2233( c, cs ); \ - stbir__simdf_mult_mem( tot1, c, decode+4 ); + stbir__simdf_mult_mem( tot1, c, decode+4 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -4678,7 +4915,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to0011( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \ stbir__simdf_0123to2233( c, cs ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 ); + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 ); #define stbir__1_coeff_remnant( ofs ) \ { stbir__simdf d; \ @@ -4690,7 +4927,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float 
#define stbir__2_coeff_remnant( ofs ) \ stbir__simdf_load2( cs, hc + (ofs) ); \ stbir__simdf_0123to0011( c, cs ); \ - stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); #define stbir__3_coeff_remnant( ofs ) \ { stbir__simdf d; \ @@ -4699,7 +4936,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 ); \ - stbir__simdf_madd( tot1, tot1, d, c ); } + stbir__simdf_madd( tot1, tot1, d, c ); } #define stbir__store_output() \ stbir__simdf_add( tot0, tot0, tot1 ); \ @@ -4718,7 +4955,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float float tota,totb,c; \ c = hc[0]; \ tota = decode[0]*c; \ - totb = decode[1]*c; + totb = decode[1]*c; #define stbir__2_coeff_only() \ float tota,totb,c; \ @@ -4727,7 +4964,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb = decode[1]*c; \ c = hc[1]; \ tota += decode[2]*c; \ - totb += decode[3]*c; + totb += decode[3]*c; // this weird order of add matches the simd #define stbir__3_coeff_only() \ @@ -4740,7 +4977,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb += decode[5]*c; \ c = hc[1]; \ tota += decode[2]*c; \ - totb += decode[3]*c; + totb += decode[3]*c; #define stbir__store_output_tiny() \ output[0] = tota; \ @@ -4762,7 +4999,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb2 = decode[5]*c; \ c = hc[3]; \ tota3 = decode[6]*c; \ - totb3 = decode[7]*c; + totb3 = decode[7]*c; #define stbir__4_coeff_continue_from_4( ofs ) \ c = hc[0+(ofs)]; \ @@ -4776,12 +5013,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb2 += decode[5+(ofs)*2]*c; \ c = hc[3+(ofs)]; \ tota3 += decode[6+(ofs)*2]*c; \ - totb3 += decode[7+(ofs)*2]*c; + totb3 += decode[7+(ofs)*2]*c; #define stbir__1_coeff_remnant( ofs ) \ c = hc[0+(ofs)]; \ tota0 += decode[0+(ofs)*2] * c; \ - totb0 += decode[1+(ofs)*2] * c; + totb0 += decode[1+(ofs)*2] * c; #define stbir__2_coeff_remnant( ofs ) \ c = hc[0+(ofs)]; \ @@ -4789,7 +5026,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb0 += decode[1+(ofs)*2] * c; \ c = hc[1+(ofs)]; \ tota1 += decode[2+(ofs)*2] * c; \ - totb1 += decode[3+(ofs)*2] * c; + totb1 += decode[3+(ofs)*2] * c; #define stbir__3_coeff_remnant( ofs ) \ c = hc[0+(ofs)]; \ @@ -4800,7 +5037,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float totb1 += decode[3+(ofs)*2] * c; \ c = hc[2+(ofs)]; \ tota2 += decode[4+(ofs)*2] * c; \ - totb2 += decode[5+(ofs)*2] * c; + totb2 += decode[5+(ofs)*2] * c; #define stbir__store_output() \ output[0] = (tota0+tota2)+(tota1+tota3); \ @@ -4809,7 +5046,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 2; -#endif +#endif #define STBIR__horizontal_channels 2 #define STB_IMAGE_RESIZE_DO_HORIZONTALS @@ -4827,7 +5064,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_load1z( c, hc ); \ stbir__simdf_0123to0001( c, c ); \ stbir__simdf_load( d, decode ); \ - stbir__simdf_mult( tot, d, c ); + stbir__simdf_mult( tot, d, c ); #define stbir__2_coeff_only() \ stbir__simdf tot,c,cs,d; \ @@ -4838,7 +5075,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, 
float stbir__simdf_mult( tot, d, c ); \ stbir__simdf_0123to1111( c, cs ); \ stbir__simdf_load( d, decode+3 ); \ - stbir__simdf_madd( tot, tot, d, c ); + stbir__simdf_madd( tot, tot, d, c ); #define stbir__3_coeff_only() \ stbir__simdf tot,c,d,cs; \ @@ -4852,7 +5089,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd( tot, tot, d, c ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_load( d, decode+6 ); \ - stbir__simdf_madd( tot, tot, d, c ); + stbir__simdf_madd( tot, tot, d, c ); #define stbir__store_output_tiny() \ stbir__simdf_store2( output, tot ); \ @@ -4872,7 +5109,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 ); + stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -4880,26 +5117,26 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 ); + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 ); #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1rep4( t, hc + (ofs) ); \ - stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); #define stbir__2_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); - + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load4b( cs, hc + (ofs) ); \ stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \ stbir__simdf8_0123to2222( t, cs ); \ - stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); + stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); #define stbir__store_output() \ stbir__simdf8_add( tot0, tot0, tot1 ); \ @@ -4930,7 +5167,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to1122( c, cs ); \ stbir__simdf_mult_mem( tot1, c, decode+4 ); \ stbir__simdf_0123to2333( c, cs ); \ - stbir__simdf_mult_mem( tot2, c, decode+8 ); + stbir__simdf_mult_mem( tot2, c, decode+8 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -4940,13 +5177,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to1122( c, cs ); \ stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \ stbir__simdf_0123to2333( c, cs ); \ - stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 ); + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 ); #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1z( c, hc + (ofs) ); \ stbir__simdf_0123to0001( c, c ); \ - stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); #define stbir__2_coeff_remnant( ofs ) \ { stbir__simdf d; \ @@ -4956,7 +5193,7 @@ static void stbir__decode_scanline(stbir__info const 
* stbir_info, int n, float stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \ stbir__simdf_0123to1122( c, cs ); \ stbir__simdf_load2z( d, decode+(ofs)*3+4 ); \ - stbir__simdf_madd( tot1, tot1, c, d ); } + stbir__simdf_madd( tot1, tot1, c, d ); } #define stbir__3_coeff_remnant( ofs ) \ { stbir__simdf d; \ @@ -4968,7 +5205,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_load1z( d, decode+(ofs)*3+8 ); \ - stbir__simdf_madd( tot2, tot2, c, d ); } + stbir__simdf_madd( tot2, tot2, c, d ); } #define stbir__store_output() \ stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 ); \ @@ -4999,7 +5236,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[0]; \ tot0 = decode[0]*c; \ tot1 = decode[1]*c; \ - tot2 = decode[2]*c; + tot2 = decode[2]*c; #define stbir__2_coeff_only() \ float tot0, tot1, tot2, c; \ @@ -5010,7 +5247,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[1]; \ tot0 += decode[3]*c; \ tot1 += decode[4]*c; \ - tot2 += decode[5]*c; + tot2 += decode[5]*c; #define stbir__3_coeff_only() \ float tot0, tot1, tot2, c; \ @@ -5025,7 +5262,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[2]; \ tot0 += decode[6]*c; \ tot1 += decode[7]*c; \ - tot2 += decode[8]*c; + tot2 += decode[8]*c; #define stbir__store_output_tiny() \ output[0] = tot0; \ @@ -5052,7 +5289,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[3]; \ totd0 = decode[9]*c; \ totd1 = decode[10]*c; \ - totd2 = decode[11]*c; + totd2 = decode[11]*c; #define stbir__4_coeff_continue_from_4( ofs ) \ c = hc[0+(ofs)]; \ @@ -5070,7 +5307,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[3+(ofs)]; \ totd0 += decode[9+(ofs)*3]*c; \ totd1 += decode[10+(ofs)*3]*c; \ - totd2 += decode[11+(ofs)*3]*c; + totd2 += decode[11+(ofs)*3]*c; #define stbir__1_coeff_remnant( ofs ) \ c = hc[0+(ofs)]; \ @@ -5100,7 +5337,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float c = hc[2+(ofs)]; \ totc0 += decode[6+(ofs)*3]*c; \ totc1 += decode[7+(ofs)*3]*c; \ - totc2 += decode[8+(ofs)*3]*c; + totc2 += decode[8+(ofs)*3]*c; #define stbir__store_output() \ output[0] = (tota0+totc0)+(totb0+totd0); \ @@ -5110,7 +5347,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 3; -#endif +#endif #define STBIR__horizontal_channels 3 #define STB_IMAGE_RESIZE_DO_HORIZONTALS @@ -5126,7 +5363,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1( c, hc ); \ stbir__simdf_0123to0000( c, c ); \ - stbir__simdf_mult_mem( tot, c, decode ); + stbir__simdf_mult_mem( tot, c, decode ); #define stbir__2_coeff_only() \ stbir__simdf tot,c,cs; \ @@ -5135,7 +5372,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to0000( c, cs ); \ stbir__simdf_mult_mem( tot, c, decode ); \ stbir__simdf_0123to1111( c, cs ); \ - stbir__simdf_madd_mem( tot, tot, c, decode+4 ); + stbir__simdf_madd_mem( tot, tot, c, decode+4 ); #define stbir__3_coeff_only() \ stbir__simdf tot,c,cs; \ @@ -5146,7 +5383,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to1111( c, cs ); \ stbir__simdf_madd_mem( tot, 
tot, c, decode+4 ); \ stbir__simdf_0123to2222( c, cs ); \ - stbir__simdf_madd_mem( tot, tot, c, decode+8 ); + stbir__simdf_madd_mem( tot, tot, c, decode+8 ); #define stbir__store_output_tiny() \ stbir__simdf_store( output, tot ); \ @@ -5163,7 +5400,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_mult_mem( tot0, c, decode ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 ); + stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5171,26 +5408,26 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1rep4( t, hc + (ofs) ); \ - stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); #define stbir__2_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \ stbir__simdf8_0123to22223333( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); - + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load4b( cs, hc + (ofs) ); \ stbir__simdf8_0123to00001111( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ stbir__simdf8_0123to2222( t, cs ); \ - stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); #define stbir__store_output() \ stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \ @@ -5199,7 +5436,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 4; -#else +#else #define stbir__4_coeff_start() \ stbir__simdf tot0,tot1,c,cs; \ @@ -5212,7 +5449,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \ stbir__simdf_0123to3333( c, cs ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); + stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5224,13 +5461,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); \ stbir__simdf_0123to3333( c, cs ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load1( c, hc + (ofs) ); \ stbir__simdf_0123to0000( c, c ); \ - stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); #define stbir__2_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5238,8 +5475,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_0123to0000( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ 
stbir__simdf_0123to1111( c, cs ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); - + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load( cs, hc + (ofs) ); \ @@ -5365,7 +5602,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float x0 += decode[0+(ofs)*4] * c; \ x1 += decode[1+(ofs)*4] * c; \ x2 += decode[2+(ofs)*4] * c; \ - x3 += decode[3+(ofs)*4] * c; + x3 += decode[3+(ofs)*4] * c; #define stbir__2_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5378,8 +5615,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float y0 += decode[4+(ofs)*4] * c; \ y1 += decode[5+(ofs)*4] * c; \ y2 += decode[6+(ofs)*4] * c; \ - y3 += decode[7+(ofs)*4] * c; - + y3 += decode[7+(ofs)*4] * c; + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ c = hc[0+(ofs)]; \ @@ -5396,7 +5633,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float x0 += decode[8+(ofs)*4] * c; \ x1 += decode[9+(ofs)*4] * c; \ x2 += decode[10+(ofs)*4] * c; \ - x3 += decode[11+(ofs)*4] * c; + x3 += decode[11+(ofs)*4] * c; #define stbir__store_output() \ output[0] = x0 + y0; \ @@ -5407,7 +5644,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 4; -#endif +#endif #define STBIR__horizontal_channels 4 #define STB_IMAGE_RESIZE_DO_HORIZONTALS @@ -5426,7 +5663,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_load1( c, hc ); \ stbir__simdf_0123to0000( c, c ); \ stbir__simdf_mult_mem( tot0, c, decode ); \ - stbir__simdf_mult_mem( tot1, c, decode+3 ); + stbir__simdf_mult_mem( tot1, c, decode+3 ); #define stbir__2_coeff_only() \ stbir__simdf tot0,tot1,c,cs; \ @@ -5437,7 +5674,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_mult_mem( tot1, c, decode+3 ); \ stbir__simdf_0123to1111( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \ - stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); + stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); #define stbir__3_coeff_only() \ stbir__simdf tot0,tot1,c,cs; \ @@ -5451,7 +5688,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot1, tot1, c, decode+10 ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); + stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); #define stbir__store_output_tiny() \ stbir__simdf_store( output+3, tot1 ); \ @@ -5473,7 +5710,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to22222222( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 ); \ stbir__simdf8_0123to33333333( c, cs ); \ - stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 ); + stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5485,19 +5722,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to22222222( c, cs ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \ stbir__simdf8_0123to33333333( c, cs ); \ - stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); #define stbir__1_coeff_remnant( ofs ) \ 
STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load1b( c, hc + (ofs) ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); #define stbir__2_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf8_load1b( c, hc + (ofs) ); \ stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ stbir__simdf8_load1b( c, hc + (ofs)+1 ); \ - stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5507,7 +5744,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf8_0123to11111111( c, cs ); \ stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \ stbir__simdf8_0123to22222222( c, cs ); \ - stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); #define stbir__store_output() \ stbir__simdf8_add( tot0, tot0, tot1 ); \ @@ -5540,7 +5777,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); \ stbir__simdf_0123to3333( c, cs ); \ stbir__simdf_madd_mem( tot2, tot2, c, decode+21 ); \ - stbir__simdf_madd_mem( tot3, tot3, c, decode+24 ); + stbir__simdf_madd_mem( tot3, tot3, c, decode+24 ); #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5556,7 +5793,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); \ stbir__simdf_0123to3333( c, cs ); \ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 ); \ - stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 ); + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 ); #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5573,8 +5810,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \ stbir__simdf_0123to1111( c, cs ); \ stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \ - stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); - + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ stbir__simdf_load( cs, hc + (ofs) ); \ @@ -5586,7 +5823,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \ stbir__simdf_0123to2222( c, cs ); \ stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \ - stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); #define stbir__store_output() \ stbir__simdf_add( tot0, tot0, tot2 ); \ @@ -5610,7 +5847,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float tot3 = decode[3]*c; \ tot4 = decode[4]*c; \ tot5 = decode[5]*c; \ - tot6 = decode[6]*c; + tot6 = decode[6]*c; #define stbir__2_coeff_only() \ float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \ @@ -5704,7 +5941,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float y3 += decode[24] * c; \ y4 += decode[25] * c; \ y5 += decode[26] * c; \ - y6 += decode[27] * c; + y6 += decode[27] * c; #define stbir__4_coeff_continue_from_4( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5739,7 +5976,7 @@ static void stbir__decode_scanline(stbir__info const * 
stbir_info, int n, float y3 += decode[24+(ofs)*7] * c; \ y4 += decode[25+(ofs)*7] * c; \ y5 += decode[26+(ofs)*7] * c; \ - y6 += decode[27+(ofs)*7] * c; + y6 += decode[27+(ofs)*7] * c; #define stbir__1_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ @@ -5770,7 +6007,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float y4 += decode[11+(ofs)*7] * c; \ y5 += decode[12+(ofs)*7] * c; \ y6 += decode[13+(ofs)*7] * c; \ - + #define stbir__3_coeff_remnant( ofs ) \ STBIR_SIMD_NO_UNROLL(decode); \ c = hc[0+(ofs)]; \ @@ -5810,7 +6047,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float ++horizontal_contributors; \ output += 7; -#endif +#endif #define STBIR__horizontal_channels 7 #define STB_IMAGE_RESIZE_DO_HORIZONTALS @@ -5937,7 +6174,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback) if ( stbir_info->out_pixels_cb ) output_buffer = encode_buffer; - + STBIR_PROFILE_START( encode ); // convert into the output buffer stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer ); @@ -5945,7 +6182,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output // if we have an output callback, call it to send the data if ( stbir_info->out_pixels_cb ) - stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data ); + stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data ); } @@ -6012,10 +6249,12 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi if ( vertical_first ) { // Now resample the gathered vertical data in the horizontal axis into the encode buffer + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } - stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes), + stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes), encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } @@ -6030,7 +6269,7 @@ static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info cons // update new end scanline split_info->ring_buffer_last_scanline = n; - // get ring buffer + // get ring buffer ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries; ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index); @@ -6056,7 +6295,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__ // initialize the ring buffer for gathering split_info->ring_buffer_begin_index = 0; - split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest; + split_info->ring_buffer_first_scanline = vertical_contributors->n0; split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty" for (y = start_output_y; y < end_output_y; y++) @@ -6080,12 +6319,12 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__ split_info->ring_buffer_first_scanline++; 
split_info->ring_buffer_begin_index++; } - + if ( stbir_info->vertical_first ) { float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline ); // Decode the nth scanline from the source image into the decode buffer. - stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); } else { @@ -6108,10 +6347,10 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_ { // evict a scanline out into the output buffer float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index ); - + // dump the scanline out - stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); - + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + // mark it as empty ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; @@ -6129,10 +6368,10 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st // Now resample it into the buffer. stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); - + // dump the scanline out - stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); - + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + // mark it as empty ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; @@ -6172,7 +6411,7 @@ static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stb STBIR_PROFILE_END( vertical ); } -typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); +typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count ) { @@ -6183,6 +6422,8 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ void * scanline_scatter_buffer; void * scanline_scatter_buffer_end; int on_first_input_y, last_input_y; + int width = (stbir_info->vertical_first) ? 
( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size; + int width_times_channels = stbir_info->effective_channels * width; STBIR_ASSERT( !stbir_info->vertical.is_gather ); @@ -6193,7 +6434,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ end_input_y = split_info[split_count-1].end_input_y; // adjust for starting offset start_input_y - y = start_input_y + stbir_info->vertical.filter_pixel_margin; + y = start_input_y + stbir_info->vertical.filter_pixel_margin; vertical_contributors += y ; vertical_coefficients += stbir_info->vertical.coefficient_width * y; @@ -6217,7 +6458,12 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ // mark all the buffers as empty to start for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ ) - stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter + { + float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y ); + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; + decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter + } // do the loop in input space on_first_input_y = 1; last_input_y = start_input_y; @@ -6240,7 +6486,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ split_info->start_input_y = y; on_first_input_y = 0; - // clip the region + // clip the region if ( out_first_scanline < start_output_y ) { vc += start_output_y - out_first_scanline; @@ -6253,11 +6499,11 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ // if very first scanline, init the index if (split_info->ring_buffer_begin_index < 0) split_info->ring_buffer_begin_index = out_first_scanline - start_output_y; - + STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline ); // Decode the nth scanline from the source image into the decode buffer. - stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out if ( !stbir_info->vertical_first ) @@ -6269,7 +6515,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_ if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) && ( out_last_scanline > split_info->ring_buffer_last_scanline ) ) handle_scanline_for_scatter( stbir_info, split_info ); - + // Now the horizontal buffer is ready to write to all ring buffer rows, so do it. 
stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end ); @@ -6305,7 +6551,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir if (scale_info->scale >= ( 1.0f - stbir__small_float ) ) { if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) ) - filter = STBIR_FILTER_POINT_SAMPLE; + filter = STBIR_FILTER_POINT_SAMPLE; else filter = STBIR_DEFAULT_FILTER_UPSAMPLE; } @@ -6313,7 +6559,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir samp->filter_enum = filter; STBIR_ASSERT(samp->filter_enum != 0); - STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); + STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); samp->filter_kernel = stbir__builtin_kernels[ filter ]; samp->filter_support = stbir__builtin_supports[ filter ]; @@ -6339,17 +6585,33 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir // pre calculate stuff based on the above samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data); + // filter_pixel_width is the conservative size in pixels of input that affect an output pixel. + // In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the + // filter will extend before or after the scanline beyond just one extra entire copy of the + // scanline (we would hit the edge twice). We don't let you do that, so we clamp the total + // width to 3x the total of input pixel (once for the scanline, once for the left side + // overhang, and once for the right side). We only do this for edge mode, since the other + // modes can just re-edge clamp back in again. if ( edge == STBIR_EDGE_WRAP ) - if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) ) // this can only happen when shrinking to a single pixel - samp->filter_pixel_width = scale_info->input_full_size * 2; + if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) ) + samp->filter_pixel_width = scale_info->input_full_size * 3; // This is how much to expand buffers to account for filters seeking outside // the image boundaries. samp->filter_pixel_margin = samp->filter_pixel_width / 2; + + // filter_pixel_margin is the amount that this filter can overhang on just one side of either + // end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's + // worth of pixels, we clamp this one side of overhang to the input scanline size. Again, + // this clamping only happens in rare cases with the default filters (2 pix to 1 pix). 
+ if ( edge == STBIR_EDGE_WRAP ) + if ( samp->filter_pixel_margin > scale_info->input_full_size ) + samp->filter_pixel_margin = scale_info->input_full_size; samp->num_contributors = stbir__get_contributors(samp, samp->is_gather); + samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors); - samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding + samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding samp->gather_prescatter_contributors = 0; samp->gather_prescatter_coefficients = 0; @@ -6397,8 +6659,8 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr range->n0 = in_first_pixel; stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge ); range->n1 = in_last_pixel; - - // now go through the margin to the start of area to find bottom + + // now go through the margin to the start of area to find bottom n = range->n0 + 1; input_end = -filter_pixel_margin; while( n >= input_end ) @@ -6413,7 +6675,7 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr --n; } - // now go through the end of the area through the margin to find top + // now go through the end of the area through the margin to find top n = range->n1 - 1; input_end = n + 1 + filter_pixel_margin; while( n <= input_end ) @@ -6454,7 +6716,7 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr } } -static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height ) +static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs ) { int i, cur; int left = output_height; @@ -6462,10 +6724,59 @@ static void stbir__get_split_info( stbir__per_split_info* split_info, int splits cur = 0; for( i = 0 ; i < splits ; i++ ) { - int each; + int each; + split_info[i].start_output_y = cur; each = left / ( splits - i ); split_info[i].end_output_y = cur + each; + + // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have + // a "special" set of coefficients. Basically, with exactly the right filter at exactly the right + // resize at exactly the right phase, some of the coefficents can be zero. When they are zero, we + // don't process them at all. But this leads to a tricky thing with the thread splits, where we + // might have a set of two coeffs like this for example: (4,4) and (3,6). The 4,4 means there was + // just one single coeff because things worked out perfectly (normally, they all have 4 coeffs + // like the range 3,6. The problem is that if we start right on the (4,4) on a brand new thread, + // then when we get to (3,6), we don't have the "3" sample in memory (because we didn't load + // it on the initial (4,4) range because it didn't have a 3 (we only add new samples that are + // larger than our existing samples - it's just how the eviction works). 
So, our solution here + // is pretty simple, if we start right on a range that has samples that start earlier, then we + // simply bump up our previous thread split range to include it, and then start this threads + // range with the smaller sample. It just moves one scanline from one thread split to another, + // so that we end with the unusual one, instead of start with it. To do this, we check 2-4 + // sample at each thread split start and then occassionally move them. + + if ( ( is_gather ) && ( i ) ) + { + stbir__contributors * small_contribs; + int j, smallest, stop, start_n0; + stbir__contributors * split_contribs = contribs + cur; + + // scan for a max of 3x the filter width or until the next thread split + stop = vertical_pixel_margin * 3; + if ( each < stop ) + stop = each; + + // loops a few times before early out + smallest = 0; + small_contribs = split_contribs; + start_n0 = small_contribs->n0; + for( j = 1 ; j <= stop ; j++ ) + { + ++split_contribs; + if ( split_contribs->n0 > start_n0 ) + break; + if ( split_contribs->n0 < small_contribs->n0 ) + { + small_contribs = split_contribs; + smallest = j; + } + } + + split_info[i-1].end_output_y += smallest; + split_info[i].start_output_y += smallest; + } + cur += each; left -= each; @@ -6478,7 +6789,7 @@ static void stbir__get_split_info( stbir__per_split_info* split_info, int splits static void stbir__free_internal_mem( stbir__info *info ) { #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } } - + if ( info ) { #ifndef STBIR__SEPARATE_ALLOCATIONS @@ -6496,16 +6807,16 @@ static void stbir__free_internal_mem( stbir__info *info ) for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ ) { #ifdef STBIR_SIMD8 - if ( info->effective_channels == 3 ) + if ( info->effective_channels == 3 ) --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer - #endif + #endif STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] ); } #ifdef STBIR_SIMD8 - if ( info->effective_channels == 3 ) + if ( info->effective_channels == 3 ) --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer - #endif + #endif STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer ); STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers ); STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer ); @@ -6519,10 +6830,10 @@ static void stbir__free_internal_mem( stbir__info *info ) STBIR__FREE_AND_CLEAR( info->horizontal.coefficients ); STBIR__FREE_AND_CLEAR( info->horizontal.contributors ); STBIR__FREE_AND_CLEAR( info->alloced_mem ); - STBIR__FREE_AND_CLEAR( info ); + STBIR_FREE( info, info->user_data ); #endif } - + #undef STBIR__FREE_AND_CLEAR } @@ -6534,20 +6845,20 @@ static int stbir__get_max_split( int splits, int height ) for( i = 0 ; i < splits ; i++ ) { int each = height / ( splits - i ); - if ( each > max ) + if ( each > max ) max = each; height -= each; } return max; } -static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = -{ +static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = +{ 0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs }; -static 
stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = -{ +static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = +{ 0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs }; @@ -6622,28 +6933,28 @@ static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0}; #endif // Figure out whether to scale along the horizontal or vertical first. -// This only *super* important when you are scaling by a massively -// different amount in the vertical vs the horizontal (for example, if -// you are scaling by 2x in the width, and 0.5x in the height, then you -// want to do the vertical scale first, because it's around 3x faster +// This only *super* important when you are scaling by a massively +// different amount in the vertical vs the horizontal (for example, if +// you are scaling by 2x in the width, and 0.5x in the height, then you +// want to do the vertical scale first, because it's around 3x faster // in that order. // -// In more normal circumstances, this makes a 20-40% differences, so +// In more normal circumstances, this makes a 20-40% differences, so // it's good to get right, but not critical. The normal way that you -// decide which direction goes first is just figuring out which -// direction does more multiplies. But with modern CPUs with their +// decide which direction goes first is just figuring out which +// direction does more multiplies. But with modern CPUs with their // fancy caches and SIMD and high IPC abilities, so there's just a lot -// more that goes into it. +// more that goes into it. // -// My handwavy sort of solution is to have an app that does a whole +// My handwavy sort of solution is to have an app that does a whole // bunch of timing for both vertical and horizontal first modes, // and then another app that can read lots of these timing files // and try to search for the best weights to use. Dotimings.c // is the app that does a bunch of timings, and vf_train.c is the -// app that solves for the best weights (and shows how well it +// app that solves for the best weights (and shows how well it // does currently). -static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info ) +static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info ) { double v_cost, h_cost; float * weights; @@ -6655,15 +6966,15 @@ static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLA v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7; else if ( vertical_scale <= 1.0f ) v_classification = ( is_gather ) ? 
1 : 0; - else if ( vertical_scale <= 2.0f) + else if ( vertical_scale <= 2.0f) v_classification = 2; - else if ( vertical_scale <= 3.0f) + else if ( vertical_scale <= 3.0f) v_classification = 3; - else if ( vertical_scale <= 4.0f) + else if ( vertical_scale <= 4.0f) v_classification = 5; - else + else v_classification = 6; - + // use the right weights weights = weights_table[ v_classification ]; @@ -6684,10 +6995,10 @@ static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLA info->is_gather = is_gather; } - // and this allows us to override everything for testing (see dotiming.c) - if ( ( info ) && ( info->control_v_first ) ) + // and this allows us to override everything for testing (see dotiming.c) + if ( ( info ) && ( info->control_v_first ) ) vertical_first = ( info->control_v_first == 2 ) ? 1 : 0; - + return vertical_first; } @@ -6699,9 +7010,9 @@ static unsigned char stbir__pixel_channels[] = { }; // the internal pixel layout enums are in a different order, so we can easily do range comparisons of types -// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible +// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = { - STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, + STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR, STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM, }; @@ -6712,17 +7023,18 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample stbir__info * info = 0; void * alloced = 0; - int alloced_total = 0; + size_t alloced_total = 0; int vertical_first; - int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries; + size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size; + int alloc_ring_buffer_num_entries; int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy - int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); - stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ]; + int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); + stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ]; stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ]; - int channels = stbir__pixel_channels[ input_pixel_layout ]; + int channels = stbir__pixel_channels[ input_pixel_layout ]; int effective_channels = channels; - + // first figure out what type of alpha weighting to use (if any) if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling { @@ -6759,14 +7071,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, 
horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER ); // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect) - decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding - + // we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without + // the conversion routines overwriting the callback input data. + decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger + #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8) if ( effective_channels == 3 ) decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations) -#endif +#endif - ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding + ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding // if we do vertical first, the ring buffer holds a whole decoded line if ( vertical_first ) @@ -6781,13 +7095,13 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) ) alloc_ring_buffer_num_entries = conservative_split_output_size; - ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes; + ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes; // The vertical buffer is used differently, depending on whether we are scattering // the vertical scanlines, or gathering them. // If scattering, it's used at the temp buffer to accumulate each output. // If gathering, it's just the output buffer. 
- vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding + vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init for(;;) @@ -6800,12 +7114,12 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample #ifdef STBIR__SEPARATE_ALLOCATIONS #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; } #else - #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size); + #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size)); #endif - STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info ); + STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info ); - STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info ); + STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info ); if ( info ) { @@ -6820,39 +7134,39 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample info->channels = channels; info->effective_channels = effective_channels; - + info->offset_x = new_x; info->offset_y = new_y; - info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries; - info->ring_buffer_num_entries = 0; - info->ring_buffer_length_bytes = ring_buffer_length_bytes; + info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries; + info->ring_buffer_num_entries = 0; + info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes; info->splits = splits; info->vertical_first = vertical_first; - info->input_pixel_layout_internal = input_pixel_layout; + info->input_pixel_layout_internal = input_pixel_layout; info->output_pixel_layout_internal = output_pixel_layout; // setup alpha weight functions info->alpha_weight = 0; info->alpha_unweight = 0; - + // handle alpha weighting functions and overrides if ( alpha_weighting_type == 2 ) { // high quality alpha multiplying on the way in, dividing on the way out - info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ]; } else if ( alpha_weighting_type == 4 ) { // fast alpha multiplying on the way in, dividing on the way out - info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ]; } else if ( alpha_weighting_type == 1 ) { // fast alpha on the way in, leave in premultiplied form on way out - info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; } else if ( alpha_weighting_type == 3 ) { @@ -6871,7 +7185,7 @@ static stbir__info * 
stbir__alloc_internal_mem_and_build_samplers( stbir__sample info->alpha_weight = stbir__simple_flip_3ch; } - } + } // get all the per-split buffers for( i = 0 ; i < splits ; i++ ) @@ -6883,7 +7197,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample #ifdef STBIR_SIMD8 if ( ( info ) && ( effective_channels == 3 ) ) ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer - #endif + #endif STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* ); { @@ -6894,7 +7208,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample #ifdef STBIR_SIMD8 if ( ( info ) && ( effective_channels == 3 ) ) ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer - #endif + #endif } } #else @@ -6906,26 +7220,31 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample // alloc memory for to-be-pivoted coeffs (if necessary) if ( vertical->is_gather == 0 ) { - int both; - int temp_mem_amt; + size_t both; + size_t temp_mem_amt; // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after, // that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that // is too small, we just allocate extra memory to use as this temp. - both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size; + both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size; #ifdef STBIR__SEPARATE_ALLOCATIONS temp_mem_amt = decode_buffer_size; + + #ifdef STBIR_SIMD8 + if ( effective_channels == 3 ) + --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer + #endif #else - temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits; + temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits; #endif if ( temp_mem_amt >= both ) { - if ( info ) - { - vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; - vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); + if ( info ) + { + vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; + vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); } } else @@ -6948,7 +7267,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample if ( diff_shift < 0.0f ) diff_shift = -diff_shift; if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) ) { - if ( horizontal->is_gather == vertical->is_gather ) + if ( horizontal->is_gather == vertical->is_gather ) { copy_horizontal = 1; goto no_vert_alloc; @@ -6975,16 +7294,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample // but if the number of coeffs <= 12, use another set of special cases. 
<=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size if ( horizontal->extent_info.widest <= 12 ) info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ]; - + info->scanline_extents.conservative.n0 = conservative->n0; info->scanline_extents.conservative.n1 = conservative->n1; - + // get exact extents stbir__get_extents( horizontal, &info->scanline_extents ); // pack the horizontal coeffs - horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 ); - + horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 ); + STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) ); STBIR_PROFILE_BUILD_END( horizontal ); @@ -7004,7 +7323,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample } // setup the vertical split ranges - stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size ); + stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors ); // now we know precisely how many entries we need info->ring_buffer_num_entries = info->vertical.extent_info.widest; @@ -7013,49 +7332,14 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) ) info->ring_buffer_num_entries = conservative_split_output_size; STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries ); - - // a few of the horizontal gather functions read one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in - for( i = 0 ; i < splits ; i++ ) - { - int width, ofs; - - // find the right most span - if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 ) - width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0; - else - width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0; - - // this calc finds the exact end of the decoded scanline for all filter modes. - // usually this is just the width * effective channels. But we have to account - // for the area to the left of the scanline for wrap filtering and alignment, this - // is stored as a negative value in info->scanline_extents.conservative.n0. Next, - // we need to skip the exact size of the right hand size filter area (again for - // wrap mode), this is in info->scanline_extents.edge_sizes[1]). 
- ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels; - - // place a known, but numerically valid value in the decode buffer - info->split_info[i].decode_buffer[ ofs ] = 9999.0f; - - // if vertical filtering first, place a known, but numerically valid value in the all - // of the ring buffer accumulators - if ( vertical_first ) - { - int j; - for( j = 0; j < info->ring_buffer_num_entries ; j++ ) - { - stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f; - } - } - } } - #undef STBIR__NEXT_PTR // is this the first time through loop? if ( info == 0 ) { - alloced_total = (int) ( 15 + (size_t)advance_mem ); + alloced_total = ( 15 + (size_t)advance_mem ); alloced = STBIR_MALLOC( alloced_total, user_data ); if ( alloced == 0 ) return 0; @@ -7065,7 +7349,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample } } -static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) +static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) { stbir__per_split_info * split_info = info->split_info + split_start; @@ -7085,7 +7369,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r { static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= { - /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, + /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, }; static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= @@ -7148,7 +7432,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r stbir_datatype input_type, output_type; input_type = resize->input_data_type; - output_type = resize->output_data_type; + output_type = resize->output_data_type; info->input_data = resize->input_pixels; info->input_stride_bytes = resize->input_stride_in_bytes; info->output_stride_bytes = resize->output_stride_in_bytes; @@ -7156,7 +7440,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r // if we're completely point sampling, then we can turn off SRGB if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) ) { - if ( ( ( input_type == STBIR_TYPE_UINT8_SRGB ) || ( input_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && + if ( ( ( input_type == STBIR_TYPE_UINT8_SRGB ) || ( input_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) ) { input_type = STBIR_TYPE_UINT8; @@ -7164,7 +7448,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r } } - // recalc the output and input strides + // recalc the output and input strides if ( info->input_stride_bytes == 0 ) info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type]; @@ -7172,7 +7456,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type]; // calc offset - info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) 
resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] ); + info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] ); info->in_pixels_cb = resize->input_cb; info->user_data = resize->user_data; @@ -7205,7 +7489,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) ) { int non_scaled = 0; - + // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16) if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual) if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) ) @@ -7225,16 +7509,16 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r } info->input_type = input_type; - info->output_type = output_type; + info->output_type = output_type; info->decode_pixels = decode_pixels; - info->encode_pixels = encode_pixels; + info->encode_pixels = encode_pixels; } static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 ) { double per, adj; int over; - + // do left/top edge if ( *outx < 0 ) { @@ -7253,7 +7537,7 @@ static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, doubl *u1 += adj; // decrease u1 *outsubw = outw - *outx; } -} +} // converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so) static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0) @@ -7270,7 +7554,7 @@ static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 bot = 1 << 25; // keep refining, but usually stops in a few loops - usually 5 for bad cases - for(;;) + for(;;) { stbir_uint64 est, temp; @@ -7303,13 +7587,13 @@ static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 bot = temp; // move remainders - temp = est * denom_estimate + denom_last; - denom_last = denom_estimate; + temp = est * denom_estimate + denom_last; + denom_last = denom_estimate; denom_estimate = temp; // move remainders - temp = est * numer_estimate + numer_last; - numer_last = numer_estimate; + temp = est * numer_estimate + numer_last; + numer_last = numer_estimate; numer_estimate = temp; } @@ -7353,11 +7637,11 @@ static int stbir__calculate_region_transform( stbir__scale_info * scale_info, in output_s = ( (double)output_sub_range) / output_range; - // figure out the scaling to use - ratio = output_s / input_s; + // figure out the scaling to use + ratio = output_s / input_s; // save scale before clipping - scale = ( output_range / input_range ) * ratio; + scale = ( output_range / input_range ) * ratio; scale_info->scale = (float)scale; scale_info->inv_scale = (float)( 1.0 / scale ); @@ -7368,11 +7652,11 @@ static int stbir__calculate_region_transform( stbir__scale_info * scale_info, in input_s = input_s1 - input_s0; // after clipping do we have zero input area? 
- if ( input_s <= stbir__small_float ) + if ( input_s <= stbir__small_float ) return 0; - // calculate and store the starting source offsets in output pixel space - scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); + // calculate and store the starting source offsets in output pixel space + scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) ); @@ -7389,7 +7673,6 @@ static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layou resize->output_cb = 0; resize->user_data = resize; resize->samplers = 0; - resize->needs_rebuild = 1; resize->called_alloc = 0; resize->horizontal_filter = STBIR_FILTER_DEFAULT; resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0; @@ -7403,9 +7686,10 @@ static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layou resize->output_data_type = data_type; resize->input_pixel_layout_public = pixel_layout; resize->output_pixel_layout_public = pixel_layout; + resize->needs_rebuild = 1; } -STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, +STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero stbir_pixel_layout pixel_layout, stbir_datatype data_type ) @@ -7428,17 +7712,27 @@ STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_t { resize->input_data_type = input_type; resize->output_data_type = output_type; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + stbir__update_info_from_resize( resize->samplers, resize ); } STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ) // no callbacks by default { resize->input_cb = input_cb; resize->output_cb = output_cb; + + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + { + resize->samplers->in_pixels_cb = input_cb; + resize->samplers->out_pixels_cb = output_cb; + } } STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ) // pass back STBIR_RESIZE* by default { resize->user_data = user_data; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + resize->samplers->user_data = user_data; } STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes ) @@ -7447,6 +7741,8 @@ STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_p resize->input_stride_in_bytes = input_stride_in_bytes; resize->output_pixels = output_pixels; resize->output_stride_in_bytes = output_stride_in_bytes; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + stbir__update_info_from_resize( resize->samplers, resize ); } @@ -7549,7 +7845,7 @@ STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, return 1; } -static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) +static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) { stbir__contributors conservative = { 0, 0 }; stbir__sampler horizontal, vertical; @@ -7563,13 +7859,13 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) // have we already built the 
samplers? if ( resize->samplers ) return 0; - + #define STBIR_RETURN_ERROR_AND_ASSERT( exp ) STBIR_ASSERT( !(exp) ); if (exp) return 0; STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER) STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER) #undef STBIR_RETURN_ERROR_AND_ASSERT - if ( splits <= 0 ) + if ( splits <= 0 ) return 0; STBIR_PROFILE_BUILD_FIRST_START( build ); @@ -7591,11 +7887,11 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data ); stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data ); - stbir__set_sampler(&vertical, resize->vertical_filter, resize->horizontal_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); + stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); - if ( ( vertical.scale_info.output_sub_size / splits ) < 4 ) // each split should be a minimum of 4 scanlines (handwavey choice) + if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice) { - splits = vertical.scale_info.output_sub_size / 4; + splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS; if ( splits == 0 ) splits = 1; } @@ -7603,7 +7899,7 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO ); STBIR_PROFILE_BUILD_END( alloc ); STBIR_PROFILE_BUILD_END( build ); - + if ( out_info ) { resize->splits = splits; @@ -7612,6 +7908,10 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) #ifdef STBIR_PROFILE STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) ); #endif + + // update anything that can be changed without recalcing samplers + stbir__update_info_from_resize( out_info, resize ); + return splits; } @@ -7640,7 +7940,7 @@ STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits } STBIR_PROFILE_BUILD_CLEAR( resize->samplers ); - + return 1; } @@ -7652,7 +7952,7 @@ STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ) STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) { int result; - + if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) ) { int alloc_state = resize->called_alloc; // remember allocated state @@ -7665,10 +7965,10 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) if ( !stbir_build_samplers( resize ) ) return 0; - + resize->called_alloc = alloc_state; - // if build_samplers succeeded (above), but there are no samplers set, then + // if build_samplers succeeded (above), but there are no samplers set, then // the area to stretch into was zero pixels, so don't do anything and return // success if ( resize->samplers == 0 ) @@ -7680,10 +7980,6 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) STBIR_PROFILE_BUILD_CLEAR( 
resize->samplers ); } - - // update anything that can be changed without recalcing samplers - stbir__update_info_from_resize( resize->samplers, resize ); - // do resize result = stbir__perform_resize( resize->samplers, 0, resize->splits ); @@ -7692,7 +7988,7 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) { stbir_free_samplers( resize ); resize->samplers = 0; - } + } return result; } @@ -7707,150 +8003,73 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start // you **must** build samplers first when using split resize if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) ) - return 0; - + return 0; + if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) ) return 0; - - // update anything that can be changed without recalcing samplers - stbir__update_info_from_resize( resize->samplers, resize ); - + // do resize return stbir__perform_resize( resize->samplers, split_start, split_count ); } -static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * output_pixels, int type_size, int output_w, int output_h, int output_stride_in_bytes, stbir_internal_pixel_layout pixel_layout ) + +static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter ) { - size_t size; - int pitch; - void * ptr; + STBIR_RESIZE resize; + int scanline_output_in_bytes; + int positive_output_stride_in_bytes; + void * start_ptr; + void * free_ptr; - pitch = output_w * type_size * stbir__pixel_channels[ pixel_layout ]; - if ( pitch == 0 ) + scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ]; + if ( scanline_output_in_bytes == 0 ) return 0; + // if zero stride, use scanline output if ( output_stride_in_bytes == 0 ) - output_stride_in_bytes = pitch; + output_stride_in_bytes = scanline_output_in_bytes; - if ( output_stride_in_bytes < pitch ) + // abs value for inverted images (negative pitches) + positive_output_stride_in_bytes = output_stride_in_bytes; + if ( positive_output_stride_in_bytes < 0 ) + positive_output_stride_in_bytes = -positive_output_stride_in_bytes; + + // is the requested stride smaller than the scanline output? if so, just fail + if ( positive_output_stride_in_bytes < scanline_output_in_bytes ) return 0; - size = output_stride_in_bytes * output_h; - if ( size == 0 ) - return 0; - - *ret_ptr = 0; - *ret_pitch = output_stride_in_bytes; + start_ptr = output_pixels; + free_ptr = 0; // no free pointer, since they passed buffer to use + // did they pass a zero for the dest? 
if so, allocate the buffer if ( output_pixels == 0 ) { - ptr = STBIR_MALLOC( size, 0 ); + size_t size; + char * ptr; + + size = (size_t)positive_output_stride_in_bytes * (size_t)output_h; + if ( size == 0 ) + return 0; + + ptr = (char*) STBIR_MALLOC( size, 0 ); if ( ptr == 0 ) return 0; - *ret_ptr = ptr; - *ret_pitch = pitch; + free_ptr = ptr; + + // point at the last scanline, if they requested a flipped image + if ( output_stride_in_bytes < 0 ) + start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) ); + else + start_ptr = ptr; } - return 1; -} - - -STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - unsigned char * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_UINT8 ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? optr : output_pixels; -} - -STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - unsigned char * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_UINT8_SRGB ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? optr : output_pixels; -} - - -STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout ) -{ - STBIR_RESIZE resize; - float * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, opitch, - pixel_layout, STBIR_TYPE_FLOAT ); - - if ( !stbir_resize_extended( &resize ) ) - { - if ( optr ) - STBIR_FREE( optr, 0 ); - return 0; - } - - return (optr) ? 
optr : output_pixels; -} - - -STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, - void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, - stbir_pixel_layout pixel_layout, stbir_datatype data_type, - stbir_edge edge, stbir_filter filter ) -{ - STBIR_RESIZE resize; - float * optr; - int opitch; - - if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) ) - return 0; - - stbir_resize_init( &resize, - input_pixels, input_w, input_h, input_stride_in_bytes, - (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes, + // ok, now do the resize + stbir_resize_init( &resize, + input_pixels, input_w, input_h, input_stride_in_bytes, + start_ptr, output_w, output_h, output_stride_in_bytes, pixel_layout, data_type ); resize.horizontal_edge = edge; @@ -7860,12 +8079,53 @@ STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input if ( !stbir_resize_extended( &resize ) ) { - if ( optr ) - STBIR_FREE( optr, 0 ); + if ( free_ptr ) + STBIR_FREE( free_ptr, 0 ); return 0; } - return (optr) ? optr : output_pixels; + return (free_ptr) ? free_ptr : start_ptr; +} + + + +STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + +STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, + stbir_edge edge, stbir_filter filter ) +{ + return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, data_type, edge, filter ); } #ifdef STBIR_PROFILE @@ -7958,7 +8218,7 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB #else // 
STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS // we reinclude the header file to define all the horizontal functions -// specializing each function for the number of coeffs is 20-40% faster *OVERALL* +// specializing each function for the number of coeffs is 20-40% faster *OVERALL* // by including the header file again this way, we can still debug the functions @@ -7991,18 +8251,18 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB #define stbir__encode_order2 2 #define stbir__encode_order3 3 #define stbir__decode_simdf8_flip(reg) -#define stbir__decode_simdf4_flip(reg) +#define stbir__decode_simdf4_flip(reg) #define stbir__encode_simdf8_unflip(reg) -#define stbir__encode_simdf4_unflip(reg) +#define stbir__encode_simdf4_unflip(reg) #endif #ifdef STBIR_SIMD8 #define stbir__encode_simdfX_unflip stbir__encode_simdf8_unflip #else #define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip -#endif +#endif -static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8013,6 +8273,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco if ( width_times_channels >= 16 ) { decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { #ifdef STBIR_SIMD8 @@ -8054,20 +8315,21 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco #endif decode += 16; input += 16; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 16 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8083,6 +8345,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8097,6 +8360,8 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco input += stbir__coder_min_num; } #endif + + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode ) @@ -8109,6 +8374,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdfX e0, e1; @@ -8119,15 +8385,15 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu stbir__encode_simdfX_unflip( e0 ); stbir__encode_simdfX_unflip( e1 ); #ifdef STBIR_SIMD8 - stbir__simdf8_pack_to_16bytes( i, e0, e1 ); + stbir__simdf8_pack_to_16bytes( i, e0, e1 ); stbir__simdi_store( output, i ); #else - stbir__simdf_pack_to_8bytes( i, e0, e1 ); + stbir__simdf_pack_to_8bytes( i, e0, e1 ); stbir__simdi_store2( output, i ); #endif encode += stbir__simdfX_float_count*2; output += stbir__simdfX_float_count*2; - if ( output <= 
end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) break; @@ -8140,6 +8406,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -8158,9 +8425,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { - stbir__simdf e0; + stbir__simdf e0; STBIR_NO_UNROLL(encode); stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 ); #if stbir__coder_min_num >= 2 @@ -8173,7 +8441,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu encode += stbir__coder_min_num; } #endif - + #else // try to do blocks of 4 when you can @@ -8194,6 +8462,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8212,7 +8481,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu #endif } -static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8223,6 +8492,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int if ( width_times_channels >= 16 ) { decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { #ifdef STBIR_SIMD8 @@ -8258,20 +8528,21 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int #endif decode += 16; input += 16; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 16 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8287,6 +8558,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8301,6 +8573,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode ) @@ -8313,6 +8586,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdfX e0, e1; @@ -8323,15 +8597,15 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int 
stbir__encode_simdfX_unflip( e0 ); stbir__encode_simdfX_unflip( e1 ); #ifdef STBIR_SIMD8 - stbir__simdf8_pack_to_16bytes( i, e0, e1 ); + stbir__simdf8_pack_to_16bytes( i, e0, e1 ); stbir__simdi_store( output, i ); #else - stbir__simdf_pack_to_8bytes( i, e0, e1 ); + stbir__simdf_pack_to_8bytes( i, e0, e1 ); stbir__simdi_store2( output, i ); #endif encode += stbir__simdfX_float_count*2; output += stbir__simdfX_float_count*2; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) break; @@ -8344,6 +8618,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -8382,6 +8657,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8399,10 +8675,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int #endif } -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end = (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; // try to do blocks of 4 when you can @@ -8422,6 +8698,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8436,12 +8713,13 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi input += stbir__coder_min_num; } #endif + return decode_end; } #define stbir__min_max_shift20( i, f ) \ stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \ stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one )) ); \ - stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); + stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); #define stbir__scale_and_convert( i, f ) \ stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \ @@ -8468,7 +8746,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \ v0 = temp0.m128i_i128; \ v1 = temp1.m128i_i128; \ -} +} #define stbir__simdi_table_lookup3( v0,v1,v2, table ) \ { \ @@ -8499,7 +8777,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi v1 = temp1.m128i_i128; \ v2 = temp2.m128i_i128; \ v3 = temp3.m128i_i128; \ -} +} static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode ) { @@ -8507,16 +8785,16 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * 
to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdf f0, f1, f2, f3; - stbir__simdi i0, i1, i2, i3; + stbir__simdi i0, i1, i2, i3; STBIR_SIMD_NO_UNROLL(encode); stbir__simdf_load4_transposed( f0, f1, f2, f3, encode ); @@ -8525,9 +8803,9 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w stbir__min_max_shift20( i1, f1 ); stbir__min_max_shift20( i2, f2 ); stbir__min_max_shift20( i3, f3 ); - - stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb ); - + + stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i1, f1 ); stbir__linear_to_srgb_finish( i2, f2 ); @@ -8537,7 +8815,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w encode += 16; output += 16; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + 16 ) ) break; @@ -8551,6 +8829,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while ( output <= end_output ) { STBIR_SIMD_NO_UNROLL(encode); @@ -8568,7 +8847,8 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w // do the remnants #if stbir__coder_min_num < 4 - while( output < end_output ) + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) { STBIR_NO_UNROLL(encode); output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] ); @@ -8586,11 +8866,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w #if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end = (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; + do { decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ]; decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ]; @@ -8599,6 +8880,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * de input += 4; decode += 4; } while( decode < decode_end ); + return decode_end; } @@ -8608,12 +8890,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdf f0, f1, f2, f3; @@ -8625,10 +8907,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o stbir__min_max_shift20( i0, f0 ); stbir__min_max_shift20( i1, f1 ); stbir__min_max_shift20( i2, f2 ); - 
stbir__scale_and_convert( i3, f3 ); - - stbir__simdi_table_lookup3( i0, i1, i2, to_srgb ); - + stbir__scale_and_convert( i3, f3 ); + + stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i1, f1 ); stbir__linear_to_srgb_finish( i2, f2 ); @@ -8638,7 +8920,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o output += 16; encode += 16; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + 16 ) ) break; @@ -8649,9 +8931,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o } #endif + STBIR_SIMD_NO_UNROLL_LOOP_START do { float f; - STBIR_SIMD_NO_UNROLL(encode); + STBIR_SIMD_NO_UNROLL(encode); output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] ); output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] ); @@ -8670,11 +8953,12 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o #if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) -static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; - float const * decode_end = (float*) decode + width_times_channels; + float * decode_end = (float*) decode + width_times_channels; unsigned char const * input = (unsigned char const *)inputp; + decode += 4; while( decode <= decode_end ) { @@ -8686,11 +8970,12 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * de decode += 4; } decode -= 4; - if( decode < decode_end ) + if( decode < decode_end ) { decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ]; decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted; } + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode ) @@ -8699,16 +8984,16 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; #ifdef STBIR_SIMD - stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8; if ( width_times_channels >= 16 ) { float const * end_encode_m16 = encode + width_times_channels - 16; end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdf f0, f1, f2, f3; - stbir__simdi i0, i1, i2, i3; + stbir__simdi i0, i1, i2, i3; STBIR_SIMD_NO_UNROLL(encode); stbir__simdf_load4_transposed( f0, f1, f2, f3, encode ); @@ -8717,9 +9002,9 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o stbir__scale_and_convert( i1, f1 ); stbir__min_max_shift20( i2, f2 ); stbir__scale_and_convert( i3, f3 ); - - stbir__simdi_table_lookup2( i0, i2, to_srgb ); - + + stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + stbir__linear_to_srgb_finish( i0, f0 ); stbir__linear_to_srgb_finish( i2, f2 ); @@ -8727,7 +9012,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o output += 16; encode += 16; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + 16 ) ) break; @@ -8738,6 +9023,7 
@@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o } #endif + STBIR_SIMD_NO_UNROLL_LOOP_START do { float f; STBIR_SIMD_NO_UNROLL(encode); @@ -8755,7 +9041,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o #endif -static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8766,6 +9052,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod if ( width_times_channels >= 8 ) { decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { #ifdef STBIR_SIMD8 @@ -8793,22 +9080,23 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod stbir__simdf_store( decode + 0, of0 ); stbir__simdf_store( decode + 4, of1 ); #endif - decode += 8; + decode += 8; input += 8; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 8 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -8824,6 +9112,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -8838,6 +9127,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod input += stbir__coder_min_num; } #endif + return decode_end; } @@ -8852,6 +9142,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdfX e0, e1; @@ -8865,7 +9156,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output stbir__simdiX_store( output, i ); encode += stbir__simdfX_float_count*2; output += stbir__simdfX_float_count*2; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) break; @@ -8879,6 +9170,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e; @@ -8897,6 +9189,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { stbir__simdf e; @@ -8912,12 +9205,13 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output encode += stbir__coder_min_num; } #endif - + #else // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float f; @@ -8934,6 +9228,7 @@ static void 
STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -8952,7 +9247,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output #endif } -static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -8963,6 +9258,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int if ( width_times_channels >= 8 ) { decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { #ifdef STBIR_SIMD8 @@ -8989,20 +9285,21 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int #endif decode += 8; input += 8; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 8 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9018,6 +9315,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -9032,6 +9330,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode ) @@ -9045,6 +9344,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdfX e0, e1; @@ -9058,7 +9358,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int stbir__simdiX_store( output, i ); encode += stbir__simdfX_float_count*2; output += stbir__simdfX_float_count*2; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) break; @@ -9072,6 +9372,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e; @@ -9093,6 +9394,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float f; @@ -9111,6 +9413,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float f; @@ -9128,7 +9431,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * 
outputp, int #endif } -static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp ) { float STBIR_STREAMOUT_PTR( * ) decode = decodep; float * decode_end = (float*) decode + width_times_channels; @@ -9139,6 +9442,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, { stbir__FP16 const * end_input_m8 = input + width_times_channels - 8; decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { STBIR_NO_UNROLL(decode); @@ -9166,20 +9470,21 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, #endif decode += 8; input += 8; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 8 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m8; } - return; + return decode_end + 8; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9195,6 +9500,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -9209,6 +9515,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, input += stbir__coder_min_num; } #endif + return decode_end; } static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode ) @@ -9221,6 +9528,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp { float const * end_encode_m8 = encode + width_times_channels - 8; end_output -= 8; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { STBIR_SIMD_NO_UNROLL(encode); @@ -9247,7 +9555,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp #endif encode += 8; output += 8; - if ( output <= end_output ) + if ( output <= end_output ) continue; if ( output == ( end_output + 8 ) ) break; @@ -9261,6 +9569,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { STBIR_SIMD_NO_UNROLL(output); @@ -9276,6 +9585,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { STBIR_NO_UNROLL(output); @@ -9292,7 +9602,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp #endif } -static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp ) { #ifdef stbir__decode_swizzle float STBIR_STREAMOUT_PTR( * ) decode = decodep; @@ -9304,6 +9614,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int { float const * end_input_m16 = input + width_times_channels - 16; decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR for(;;) { STBIR_NO_UNROLL(decode); @@ 
-9338,20 +9649,21 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int #endif decode += 16; input += 16; - if ( decode <= decode_end ) + if ( decode <= decode_end ) continue; if ( decode == ( decode_end + 16 ) ) break; decode = decode_end; // backup and do last couple input = end_input_m16; } - return; + return decode_end + 16; } #endif // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( decode <= decode_end ) { STBIR_SIMD_NO_UNROLL(decode); @@ -9367,6 +9679,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( decode < decode_end ) { STBIR_NO_UNROLL(decode); @@ -9381,12 +9694,15 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int input += stbir__coder_min_num; } #endif + return decode_end; #else - + if ( (void*)decodep != inputp ) STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) ); - + + return decodep + width_times_channels; + #endif } @@ -9426,6 +9742,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int { float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 ); end_output -= ( stbir__simdfX_float_count * 2 ); + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR for(;;) { stbir__simdfX e0, e1; @@ -9435,18 +9752,18 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int #ifdef STBIR_FLOAT_HIGH_CLAMP stbir__simdfX_min( e0, e0, high_clamp ); stbir__simdfX_min( e1, e1, high_clamp ); -#endif +#endif #ifdef STBIR_FLOAT_LOW_CLAMP stbir__simdfX_max( e0, e0, low_clamp ); stbir__simdfX_max( e1, e1, low_clamp ); -#endif +#endif stbir__encode_simdfX_unflip( e0 ); stbir__encode_simdfX_unflip( e1 ); stbir__simdfX_store( output, e0 ); stbir__simdfX_store( output+stbir__simdfX_float_count, e1 ); encode += stbir__simdfX_float_count * 2; output += stbir__simdfX_float_count * 2; - if ( output < end_output ) + if ( output < end_output ) continue; if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) ) break; @@ -9459,6 +9776,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_NO_UNROLL_LOOP_START while( output <= end_output ) { stbir__simdf e0; @@ -9466,10 +9784,10 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int stbir__simdf_load( e0, encode ); #ifdef STBIR_FLOAT_HIGH_CLAMP stbir__simdf_min( e0, e0, high_clamp ); -#endif +#endif #ifdef STBIR_FLOAT_LOW_CLAMP stbir__simdf_max( e0, e0, low_clamp ); -#endif +#endif stbir__encode_simdf4_unflip( e0 ); stbir__simdf_store( output-4, e0 ); output += 4; @@ -9483,6 +9801,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int // try to do blocks of 4 when you can #if stbir__coder_min_num != 3 // doesn't divide cleanly by four output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START while( output <= end_output ) { float e; @@ -9502,6 +9821,7 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int // do the remnants #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START while( output < end_output ) { float e; @@ -9517,18 +9837,18 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int encode += 
stbir__coder_min_num; } #endif - + #endif } -#undef stbir__decode_suffix +#undef stbir__decode_suffix #undef stbir__decode_simdf8_flip #undef stbir__decode_simdf4_flip -#undef stbir__decode_order0 +#undef stbir__decode_order0 #undef stbir__decode_order1 #undef stbir__decode_order2 #undef stbir__decode_order3 -#undef stbir__encode_order0 +#undef stbir__encode_order0 #undef stbir__encode_order1 #undef stbir__encode_order2 #undef stbir__encode_order3 @@ -9612,7 +9932,8 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); ) stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) - while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) { stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; STBIR_SIMD_NO_UNROLL(output0); @@ -9621,52 +9942,53 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE stbIF0( stbir__simdfX_load( o0, output0 ); stbir__simdfX_load( o1, output0+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); + stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); ) stbIF1( stbir__simdfX_load( o0, output1 ); stbir__simdfX_load( o1, output1+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); + stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); ) stbIF2( stbir__simdfX_load( o0, output2 ); stbir__simdfX_load( o1, output2+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); + stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); ) stbIF3( stbir__simdfX_load( o0, output3 ); stbir__simdfX_load( o1, 
output3+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); + stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); ) stbIF4( stbir__simdfX_load( o0, output4 ); stbir__simdfX_load( o1, output4+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); + stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); ) stbIF5( stbir__simdfX_load( o0, output5 ); stbir__simdfX_load( o1, output5+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count)); stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); + stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); ) stbIF6( stbir__simdfX_load( o0, output6 ); stbir__simdfX_load( o1, output6+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); + stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); ) stbIF7( stbir__simdfX_load( o0, output7 ); stbir__simdfX_load( o1, output7+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) ); - stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 ); + stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( 
o3, o3, r3, c7 ); stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); ) #else - stbIF0( stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); + stbIF0( stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); ) - stbIF1( stbir__simdfX_mult( o0, r0, c1 ); stbir__simdfX_mult( o1, r1, c1 ); stbir__simdfX_mult( o2, r2, c1 ); stbir__simdfX_mult( o3, r3, c1 ); + stbIF1( stbir__simdfX_mult( o0, r0, c1 ); stbir__simdfX_mult( o1, r1, c1 ); stbir__simdfX_mult( o2, r2, c1 ); stbir__simdfX_mult( o3, r3, c1 ); stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); ) - stbIF2( stbir__simdfX_mult( o0, r0, c2 ); stbir__simdfX_mult( o1, r1, c2 ); stbir__simdfX_mult( o2, r2, c2 ); stbir__simdfX_mult( o3, r3, c2 ); + stbIF2( stbir__simdfX_mult( o0, r0, c2 ); stbir__simdfX_mult( o1, r1, c2 ); stbir__simdfX_mult( o2, r2, c2 ); stbir__simdfX_mult( o3, r3, c2 ); stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); ) - stbIF3( stbir__simdfX_mult( o0, r0, c3 ); stbir__simdfX_mult( o1, r1, c3 ); stbir__simdfX_mult( o2, r2, c3 ); stbir__simdfX_mult( o3, r3, c3 ); + stbIF3( stbir__simdfX_mult( o0, r0, c3 ); stbir__simdfX_mult( o1, r1, c3 ); stbir__simdfX_mult( o2, r2, c3 ); stbir__simdfX_mult( o3, r3, c3 ); stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); ) - stbIF4( stbir__simdfX_mult( o0, r0, c4 ); stbir__simdfX_mult( o1, r1, c4 ); stbir__simdfX_mult( o2, r2, c4 ); stbir__simdfX_mult( o3, r3, c4 ); + stbIF4( stbir__simdfX_mult( o0, r0, c4 ); stbir__simdfX_mult( o1, r1, c4 ); stbir__simdfX_mult( o2, r2, c4 ); stbir__simdfX_mult( o3, r3, c4 ); stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); ) - stbIF5( stbir__simdfX_mult( o0, r0, c5 ); stbir__simdfX_mult( o1, r1, c5 ); stbir__simdfX_mult( o2, r2, c5 ); stbir__simdfX_mult( o3, r3, c5 ); + stbIF5( stbir__simdfX_mult( o0, r0, c5 ); stbir__simdfX_mult( o1, r1, c5 ); stbir__simdfX_mult( o2, r2, c5 ); stbir__simdfX_mult( o3, r3, c5 ); stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); ) - stbIF6( stbir__simdfX_mult( o0, r0, c6 ); stbir__simdfX_mult( o1, r1, c6 ); stbir__simdfX_mult( o2, r2, c6 ); stbir__simdfX_mult( o3, r3, c6 ); + 
stbIF6( stbir__simdfX_mult( o0, r0, c6 ); stbir__simdfX_mult( o1, r1, c6 ); stbir__simdfX_mult( o2, r2, c6 ); stbir__simdfX_mult( o3, r3, c6 ); stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); ) - stbIF7( stbir__simdfX_mult( o0, r0, c7 ); stbir__simdfX_mult( o1, r1, c7 ); stbir__simdfX_mult( o2, r2, c7 ); stbir__simdfX_mult( o3, r3, c7 ); + stbIF7( stbir__simdfX_mult( o0, r0, c7 ); stbir__simdfX_mult( o1, r1, c7 ); stbir__simdfX_mult( o2, r2, c7 ); stbir__simdfX_mult( o3, r3, c7 ); stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); ) #endif input += (4*stbir__simdfX_float_count); stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); ) } - while ( ( (char*)input_end - (char*) input ) >= 16 ) + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= 16 ) { stbir__simdf o0, r0; STBIR_SIMD_NO_UNROLL(output0); @@ -9692,13 +10014,14 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); ) stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); ) #endif - + input += 4; stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; ) } } #else - while ( ( (char*)input_end - (char*) input ) >= 16 ) + STBIR_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= 16 ) { float r0, r1, r2, r3; STBIR_NO_UNROLL(input); @@ -9729,7 +10052,8 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; ) } #endif - while ( input < input_end ) + STBIR_NO_UNROLL_LOOP_START + while ( input < input_end ) { float r = input[0]; STBIR_NO_UNROLL(output0); @@ -9779,7 +10103,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 ); return; } -#endif +#endif #ifdef STBIR_SIMD { @@ -9791,14 +10115,15 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); ) stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) - - while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) + + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) { stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; STBIR_SIMD_NO_UNROLL(output); // prefetch four loop 
iterations ahead (doesn't affect much for small resizes, but helps with big ones) - stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) + stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); ) stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); ) stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); ) @@ -9836,7 +10161,8 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); ) } - while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) { stbir__simdf o0, r0; STBIR_SIMD_NO_UNROLL(output); @@ -9860,7 +10186,8 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, } } #else - while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) + STBIR_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) { float o0, o1, o2, o3; STBIR_NO_UNROLL(output); @@ -9881,7 +10208,8 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; ) } #endif - while ( input0 < input0_end ) + STBIR_NO_UNROLL_LOOP_START + while ( input0 < input0_end ) { float o0; STBIR_NO_UNROLL(output); @@ -9897,7 +10225,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, stbIF5( o0 += input5[0] * c5s; ) stbIF6( o0 += input6[0] * c6s; ) stbIF7( o0 += input7[0] * c7s; ) - output[0] = o0; + output[0] = o0; ++output; stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; ) } @@ -9928,25 +10256,25 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, #ifndef stbir__2_coeff_only #define stbir__2_coeff_only() \ stbir__1_coeff_only(); \ - stbir__1_coeff_remnant(1); + stbir__1_coeff_remnant(1); #endif #ifndef stbir__2_coeff_remnant #define stbir__2_coeff_remnant( ofs ) \ stbir__1_coeff_remnant(ofs); \ - stbir__1_coeff_remnant((ofs)+1); + stbir__1_coeff_remnant((ofs)+1); #endif - + #ifndef stbir__3_coeff_only #define stbir__3_coeff_only() \ stbir__2_coeff_only(); \ - stbir__1_coeff_remnant(2); + stbir__1_coeff_remnant(2); #endif - + #ifndef stbir__3_coeff_remnant #define stbir__3_coeff_remnant( ofs ) \ stbir__2_coeff_remnant(ofs); \ - stbir__1_coeff_remnant((ofs)+2); + stbir__1_coeff_remnant((ofs)+2); #endif #ifndef stbir__3_coeff_setup @@ -9956,13 +10284,13 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, #ifndef stbir__4_coeff_start #define stbir__4_coeff_start() \ stbir__2_coeff_only(); \ - stbir__2_coeff_remnant(2); + stbir__2_coeff_remnant(2); #endif - + #ifndef stbir__4_coeff_continue_from_4 #define stbir__4_coeff_continue_from_4( ofs ) \ stbir__2_coeff_remnant(ofs); \ - stbir__2_coeff_remnant((ofs)+2); + stbir__2_coeff_remnant((ofs)+2); #endif 
#ifndef stbir__store_output_tiny @@ -9973,8 +10301,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( floa { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__1_coeff_only(); stbir__store_output_tiny(); @@ -9985,8 +10314,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__2_coeff_only(); stbir__store_output_tiny(); @@ -9997,8 +10327,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__3_coeff_only(); stbir__store_output_tiny(); @@ -10009,8 +10340,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__store_output(); @@ -10021,8 +10353,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__1_coeff_remnant(4); @@ -10034,8 +10367,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__2_coeff_remnant(4); 
@@ -10048,10 +10382,11 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( flo float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; - + stbir__4_coeff_start(); stbir__3_coeff_remnant(4); stbir__store_output(); @@ -10062,8 +10397,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__4_coeff_continue_from_4(4); @@ -10075,8 +10411,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( flo { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__4_coeff_continue_from_4(4); @@ -10089,8 +10426,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( fl { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__4_coeff_continue_from_4(4); @@ -10104,8 +10442,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( fl float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); stbir__4_coeff_continue_from_4(4); @@ -10118,8 +10457,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( fl { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; float const * hc 
= horizontal_coefficients; stbir__4_coeff_start(); stbir__4_coeff_continue_from_4(4); @@ -10132,12 +10472,14 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; - int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; @@ -10152,19 +10494,21 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; - int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; stbir__4_coeff_continue_from_4( 0 ); --n; } while ( n > 0 ); - stbir__1_coeff_remnant( 4 ); + stbir__1_coeff_remnant( 4 ); stbir__store_output(); } while ( output < output_end ); } @@ -10173,19 +10517,21 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 { float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; - int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; stbir__4_coeff_continue_from_4( 0 ); --n; } while ( n > 0 ); - stbir__2_coeff_remnant( 4 ); + stbir__2_coeff_remnant( 4 ); stbir__store_output(); } while ( output < output_end ); @@ -10196,19 +10542,21 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { - float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; - int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; + float const * decode = decode_buffer + horizontal_contributors->n0 * 
STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; float const * hc = horizontal_coefficients; stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START do { hc += 4; decode += STBIR__horizontal_channels * 4; stbir__4_coeff_continue_from_4( 0 ); --n; } while ( n > 0 ); - stbir__3_coeff_remnant( 4 ); + stbir__3_coeff_remnant( 4 ); stbir__store_output(); } while ( output < output_end ); @@ -10216,26 +10564,26 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]= { - STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0), - STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1), - STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2), - STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3), }; static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]= { - STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff), - STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff), + STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs), STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs), STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs), - STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs), }; #undef STBIR__horizontal_channels @@ -10266,38 +10614,38 @@ This software is available under 2 licenses -- choose whichever you prefer. 
------------------------------------------------------------------------------ ALTERNATIVE A - MIT License Copyright (c) 2017 Sean Barrett -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------ ALTERNATIVE B - Public Domain (www.unlicense.org) This is free and unencumbered software released into the public domain. -Anyone is free to copy, modify, publish, use, compile, sell, or distribute this -software, either in source code form or as a compiled binary, for any purpose, +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. -In jurisdictions that recognize copyright laws, the author or authors of this -software dedicate any and all copyright interest in the software to the public -domain. We make this dedication for the benefit of the public at large and to -the detriment of our heirs and successors. We intend this dedication to be an -overt act of relinquishment in perpetuity of all present and future rights to +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------ */ diff --git a/Engine/source/gfx/gfxAPI.cpp b/Engine/source/gfx/gfxAPI.cpp index be9564732..be05be46f 100644 --- a/Engine/source/gfx/gfxAPI.cpp +++ b/Engine/source/gfx/gfxAPI.cpp @@ -180,3 +180,37 @@ ImplementEnumType( GFXBlendOp, { GFXBlendOpMax, "GFXBlendOpMax" } EndImplementEnumType; + +ImplementEnumType(GFXShaderConstType, + "The shader const types.\n" + "@ingroup GFX") + + { GFXSCT_Uknown, "GFXSCT_Uknown" }, + { GFXSCT_ConstBuffer, "GFXSCT_ConstBuffer" }, + { GFXSCT_Float, "GFXSCT_Float" }, + { GFXSCT_Float2, "GFXSCT_Float2" }, + { GFXSCT_Float3, "GFXSCT_Float3" }, + { GFXSCT_Float4, "GFXSCT_Float4" }, + { GFXSCT_Float2x2, "GFXSCT_Float2x2" }, + { GFXSCT_Float3x3, "GFXSCT_Float3x3" }, + { GFXSCT_Float3x4, "GFXSCT_Float3x4" }, + { GFXSCT_Float4x3, "GFXSCT_Float4x3" }, + { GFXSCT_Float4x4, "GFXSCT_Float4x4" }, + { GFXSCT_Int, "GFXSCT_Int" }, + { GFXSCT_Int2, "GFXSCT_Int2" }, + { GFXSCT_Int3, "GFXSCT_Int3" }, + { GFXSCT_Int4, "GFXSCT_Int4" }, + { GFXSCT_UInt, "GFXSCT_UInt" }, + { GFXSCT_UInt2, "GFXSCT_UInt2" }, + { GFXSCT_UInt3, "GFXSCT_UInt3" }, + { GFXSCT_UInt4, "GFXSCT_UInt4" }, + { GFXSCT_Bool, "GFXSCT_Bool" }, + { GFXSCT_Bool2, "GFXSCT_Bool2" }, + { GFXSCT_Bool3, "GFXSCT_Bool3" }, + { GFXSCT_Bool4, "GFXSCT_Bool4" }, + { GFXSCT_Sampler, "GFXSCT_Sampler" }, + { GFXSCT_SamplerCube, "GFXSCT_SamplerCube" }, + { GFXSCT_SamplerCubeArray, "GFXSCT_SamplerCubeArray" }, + { GFXSCT_SamplerTextureArray, "GFXSCT_SamplerTextureArray" } + +EndImplementEnumType; diff --git a/Engine/source/gfx/gfxAPI.h b/Engine/source/gfx/gfxAPI.h index 2c7f420d0..00f219166 100644 --- a/Engine/source/gfx/gfxAPI.h +++ b/Engine/source/gfx/gfxAPI.h @@ -44,6 +44,7 @@ DefineEnumType( GFXTextureFilterType ); DefineEnumType( GFXCullMode ); DefineEnumType( GFXStencilOp ); DefineEnumType( GFXBlendOp ); +DefineEnumType(GFXShaderConstType); DefineEnumType( GFXAdapterType ); DECLARE_STRUCT( GFXVideoMode ); @@ -57,5 +58,6 @@ DefineConsoleType( TypeGFXTextureFilterType, GFXTextureFilterType ); DefineConsoleType( TypeGFXCullMode, GFXCullMode ); DefineConsoleType( TypeGFXStencilOp, GFXStencilOp ); DefineConsoleType( TypeGFXBlendOp, GFXBlendOp ); +DefineConsoleType( TypeGFXShaderConstType, GFXShaderConstType); #endif // !_GFXAPI_H_ diff --git a/Engine/source/gfx/gfxShader.cpp b/Engine/source/gfx/gfxShader.cpp index 62f6aff96..af4930d63 100644 --- a/Engine/source/gfx/gfxShader.cpp +++ b/Engine/source/gfx/gfxShader.cpp @@ -42,8 +42,12 @@ GFXShader::GFXShader() GFXShader::~GFXShader() { - Torque::FS::RemoveChangeNotification( mVertexFile, this, &GFXShader::_onFileChanged ); - Torque::FS::RemoveChangeNotification( mPixelFile, this, 
&GFXShader::_onFileChanged ); + if (!mVertexFile.isEmpty()) + Torque::FS::RemoveChangeNotification( mVertexFile, this, &GFXShader::_onFileChanged ); + if (!mPixelFile.isEmpty()) + Torque::FS::RemoveChangeNotification( mPixelFile, this, &GFXShader::_onFileChanged ); + if (!mGeometryFile.isEmpty()) + Torque::FS::RemoveChangeNotification(mGeometryFile, this, &GFXShader::_onFileChanged); SAFE_DELETE(mInstancingFormat); } diff --git a/Engine/source/gfx/gfxShader.h b/Engine/source/gfx/gfxShader.h index edb20a127..720d76290 100644 --- a/Engine/source/gfx/gfxShader.h +++ b/Engine/source/gfx/gfxShader.h @@ -77,7 +77,9 @@ enum GFXShaderStage GEOMETRY_SHADER = BIT(2), DOMAIN_SHADER = BIT(3), HULL_SHADER = BIT(4), - COMPUTE_SHADER = BIT(5) + COMPUTE_SHADER = BIT(5), + ALL_STAGES = VERTEX_SHADER | PIXEL_SHADER | GEOMETRY_SHADER | + DOMAIN_SHADER | HULL_SHADER | COMPUTE_SHADER }; /// Instances of this struct are returned GFXShaderConstBuffer diff --git a/Engine/source/gfx/gfxTarget.h b/Engine/source/gfx/gfxTarget.h index 52dfcff1e..27975ed53 100644 --- a/Engine/source/gfx/gfxTarget.h +++ b/Engine/source/gfx/gfxTarget.h @@ -178,7 +178,7 @@ public: /// @param mipLevel What level of this texture are we rendering to? /// @param zOffset If this is a depth texture, what z level are we /// rendering to? - virtual void attachTexture(RenderSlot slot, GFXTextureObject *tex, U32 mipLevel=0, U32 zOffset = 0) = 0; + virtual void attachTexture(RenderSlot slot, GFXTextureObject *tex, U32 mipLevel=0, U32 zOffset = 0, U32 faceIndex = 0) = 0; /// Support binding to cubemaps. /// diff --git a/Engine/source/gfx/gfxTextureHandle.cpp b/Engine/source/gfx/gfxTextureHandle.cpp index 53b56ff10..f76ad05b5 100644 --- a/Engine/source/gfx/gfxTextureHandle.cpp +++ b/Engine/source/gfx/gfxTextureHandle.cpp @@ -137,19 +137,19 @@ bool GFXTexHandle::set( DDSFile *dds, GFXTextureProfile *profile, bool deleteDDS return isValid(); } -GFXTexHandle::GFXTexHandle( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels, S32 antialiasLevel) +GFXTexHandle::GFXTexHandle( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels, S32 antialiasLevel, U32 arraySize) { - set( width, height, format, profile, desc, numMipLevels, antialiasLevel ); + set( width, height, format, profile, desc, numMipLevels, antialiasLevel, arraySize ); } -bool GFXTexHandle::set( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels, S32 antialiasLevel) +bool GFXTexHandle::set( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels, S32 antialiasLevel, U32 arraySize) { // Clear the existing texture first, so that // its memory is free for the new allocation. free(); // Create and set the new texture. 
- StrongObjectRef::set( TEXMGR->createTexture( width, height, format, profile, numMipLevels, antialiasLevel ) ); + StrongObjectRef::set( TEXMGR->createTexture( width, height, format, profile, numMipLevels, antialiasLevel, arraySize) ); #ifdef TORQUE_DEBUG if ( getPointer() ) @@ -159,14 +159,14 @@ bool GFXTexHandle::set( U32 width, U32 height, GFXFormat format, GFXTextureProfi return isValid(); } -bool GFXTexHandle::set(U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, const String& desc, U32 numMipLevels) +bool GFXTexHandle::set(U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, const String& desc, U32 numMipLevels, U32 arraySize) { // Clear the existing texture first, so that // its memory is free for the new allocation. free(); // Create and set the new texture. - StrongObjectRef::set(TEXMGR->createTexture(width, height, depth, format, profile, numMipLevels)); + StrongObjectRef::set(TEXMGR->createTexture(width, height, depth, format, profile, numMipLevels, arraySize)); #ifdef TORQUE_DEBUG if ( getPointer() ) diff --git a/Engine/source/gfx/gfxTextureHandle.h b/Engine/source/gfx/gfxTextureHandle.h index 08ff40f4f..61940d899 100644 --- a/Engine/source/gfx/gfxTextureHandle.h +++ b/Engine/source/gfx/gfxTextureHandle.h @@ -58,9 +58,9 @@ public: bool set( DDSFile *bmp, GFXTextureProfile *profile, bool deleteDDS, const String &desc ); // Sized bitmap - GFXTexHandle( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels = 1, S32 antialiasLevel = 0); - bool set( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels = 1, S32 antialiasLevel = 0); - bool set( U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, const String& desc, U32 numMipLevels = 1); + GFXTexHandle( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels = 1, S32 antialiasLevel = 0, U32 arraySize = 1); + bool set( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, const String &desc, U32 numMipLevels = 1, S32 antialiasLevel = 0, U32 arraySize = 1); + bool set( U32 width, U32 height, U32 depth, GFXFormat format, GFXTextureProfile* profile, const String& desc, U32 numMipLevels = 1, U32 arraySize = 1); /// Returns the width and height as a point. Point2I getWidthHeight() const { return getPointer() ? Point2I( getPointer()->getWidth(), getPointer()->getHeight() ) : Point2I::Zero; } @@ -68,6 +68,7 @@ public: U32 getWidth() const { return getPointer() ? getPointer()->getWidth() : 0; } U32 getHeight() const { return getPointer() ? getPointer()->getHeight() : 0; } U32 getDepth() const { return getPointer() ? getPointer()->getDepth() : 0; } + U32 getArraySize() const { return getPointer() ? getPointer()->getArraySize() : 0; } GFXFormat getFormat() const { return getPointer() ? getPointer()->getFormat() : GFXFormat_COUNT; } /// Reloads the texture. 
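Usage sketch for the widened GFXTexHandle::set() above. The profile, format, sizes and description string are illustrative placeholders, not part of this patch; getArraySize() is the accessor added to the handle in this hunk:

   // Allocate a 6-slice array texture through the extended handle API.
   GFXTexHandle arrayTex;
   arrayTex.set( 256, 256, GFXFormatR8G8B8A8, &GFXStaticTextureProfile,
                 String( "example array" ),
                 1 /*numMipLevels*/, 0 /*antialiasLevel*/, 6 /*arraySize*/ );
   const U32 slices = arrayTex.getArraySize(); // 6 on success, 0 if the handle is empty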
diff --git a/Engine/source/gfx/gfxTextureManager.cpp b/Engine/source/gfx/gfxTextureManager.cpp index c37c3fb96..177978ff7 100644 --- a/Engine/source/gfx/gfxTextureManager.cpp +++ b/Engine/source/gfx/gfxTextureManager.cpp @@ -361,7 +361,7 @@ GFXTextureObject *GFXTextureManager::_createTexture( GBitmap *bmp, if ( inObj->getWidth() != realWidth || inObj->getHeight() != realHeight || inObj->getFormat() != realFmt ) - ret = _createTextureObject( realHeight, realWidth, 0, realFmt, profile, numMips, false, 0, inObj ); + ret = _createTextureObject( realHeight, realWidth, 0, realFmt, profile, numMips, false, 0, 1, inObj ); else ret = inObj; } @@ -569,7 +569,7 @@ GFXTextureObject *GFXTextureManager::_createTexture( DDSFile *dds, inObj->getMipLevels() != numMips ) ret = _createTextureObject( dds->getHeight(), dds->getWidth(), 0, fmt, profile, numMips, - true, 0, inObj ); + true, 0, 1, inObj ); else ret = inObj; } @@ -744,7 +744,7 @@ GFXTextureObject *GFXTextureManager::createTexture( U32 width, U32 height, void return createTexture( bmp, String::EmptyString, profile, true ); } -GFXTextureObject *GFXTextureManager::createTexture( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, S32 antialiasLevel ) +GFXTextureObject *GFXTextureManager::createTexture( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, S32 antialiasLevel, U32 arraySize) { // Deal with sizing issues... U32 localWidth = width; @@ -783,7 +783,7 @@ GFXTextureObject *GFXTextureManager::createTexture( U32 width, U32 height, GFXFo // Create the texture if we didn't get one from the pool. if ( !outTex ) { - outTex = _createTextureObject( localHeight, localWidth, 0, format, profile, numMips, false, antialiasLevel ); + outTex = _createTextureObject( localHeight, localWidth, 0, format, profile, numMips, false, antialiasLevel, arraySize ); // Make sure we add it to the pool. if ( outTex && profile->isPooled() ) @@ -814,12 +814,13 @@ GFXTextureObject *GFXTextureManager::createTexture( U32 width, U32 depth, GFXFormat format, GFXTextureProfile *profile, - U32 numMipLevels) + U32 numMipLevels, + U32 arraySize) { PROFILE_SCOPE( GFXTextureManager_CreateTexture_3D ); // Create texture... - GFXTextureObject *ret = _createTextureObject( height, width, depth, format, profile, numMipLevels ); + GFXTextureObject *ret = _createTextureObject( height, width, depth, format, profile, numMipLevels, arraySize ); if(!ret) { diff --git a/Engine/source/gfx/gfxTextureManager.h b/Engine/source/gfx/gfxTextureManager.h index 3ea716bac..09c90335a 100644 --- a/Engine/source/gfx/gfxTextureManager.h +++ b/Engine/source/gfx/gfxTextureManager.h @@ -120,14 +120,16 @@ public: U32 depth, GFXFormat format, GFXTextureProfile *profile, - U32 numMipLevels = 1); + U32 numMipLevels = 1, + U32 arraySize = 1); virtual GFXTextureObject *createTexture( U32 width, U32 height, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, - S32 antialiasLevel); + S32 antialiasLevel, + U32 arraySize = 1); Torque::Path validatePath(const Torque::Path &path); GBitmap *loadUncompressedTexture(const Torque::Path& path, GFXTextureProfile* profile, U32 width, U32 height, bool genMips = false); @@ -319,7 +321,8 @@ protected: GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false, - S32 antialiasLevel = 0, + S32 antialiasLevel = 0, + U32 arraySize = 1, GFXTextureObject *inTex = NULL ) = 0; /// Load a texture from a proper DDSFile instance. 
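The public createTexture overloads above now forward an arraySize value through to _createTextureObject, so texture arrays are requested the same way plain 2D textures are. A hedged sketch with illustrative size, format, and layer count, using the existing GFXRenderTargetProfile (a non-cubemap profile with arraySize > 1 maps to a 2D array target in the GL backend below):

// Sketch only: request a four-layer 2D texture array from the manager.
GFXTextureObject* layeredTarget =
   TEXMGR->createTexture( 1024, 1024,              // width, height of each layer
                          GFXFormatR8G8B8A8,       // format (illustrative)
                          &GFXRenderTargetProfile, // existing render-target profile
                          1,                       // numMipLevels
                          0,                       // antialiasLevel
                          4 );                     // arraySize: four layers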
diff --git a/Engine/source/gfx/gfxTextureObject.cpp b/Engine/source/gfx/gfxTextureObject.cpp index fc265291d..2b515f8bc 100644 --- a/Engine/source/gfx/gfxTextureObject.cpp +++ b/Engine/source/gfx/gfxTextureObject.cpp @@ -99,6 +99,8 @@ GFXTextureObject::GFXTextureObject(GFXDevice *aDevice, GFXTextureProfile *aProfi mHasTransparency = false; + mArraySize = 1; + #if defined(TORQUE_DEBUG) // Active object tracking. smActiveTOCount++; diff --git a/Engine/source/gfx/gfxTextureObject.h b/Engine/source/gfx/gfxTextureObject.h index 049b0d1c0..e0cd6a46a 100644 --- a/Engine/source/gfx/gfxTextureObject.h +++ b/Engine/source/gfx/gfxTextureObject.h @@ -45,6 +45,7 @@ class GFXTextureProfile; class GBitmap; struct DDSFile; class RectI; +class GFXTexHandle; /// Contains information on a locked region of a texture. /// @@ -92,6 +93,8 @@ public: bool mDead; + U32 mArraySize; + /// The device this texture belongs to. GFXDevice *mDevice; @@ -150,10 +153,13 @@ public: U32 getBitmapHeight() const { return mBitmapSize.y; } U32 getBitmapDepth() const { return mBitmapSize.z; } GFXFormat getFormat() const { return mFormat; } + U32 getArraySize() const { return mArraySize; } /// Returns true if this texture is a render target. bool isRenderTarget() const { return mProfile->isRenderTarget(); } + bool isCubeMap() const { return mProfile->isCubeMap(); } + /// Returns the file path to the texture if /// it was loaded from disk. const String& getPath() const { return mPath; } @@ -167,11 +173,11 @@ public: /// Acquire a lock on part of the texture. The GFXLockedRect returned /// is managed by the GFXTextureObject and does not need to be freed. - virtual GFXLockedRect * lock( U32 mipLevel = 0, RectI *inRect = NULL ) = 0; + virtual GFXLockedRect * lock( U32 mipLevel = 0, RectI *inRect = NULL, U32 faceIndex = 0) = 0; /// Releases a lock previously acquired. Note that the mipLevel parameter /// must match the corresponding lock! - virtual void unlock( U32 mipLevel = 0) = 0; + virtual void unlock( U32 mipLevel = 0, U32 faceIndex = 0) = 0; // copy the texture data into the specified bitmap. // - this texture object must be a render target. the function will assert if this is not the case. @@ -182,6 +188,10 @@ public: // - this process is not fast. 
virtual bool copyToBmp(GBitmap* bmp) = 0; + virtual void updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 face = -1) = 0; + virtual void copyTo(GFXTextureObject* dstTex) = 0; + virtual void generateMipMaps() = 0; + #ifdef TORQUE_DEBUG // It is important for any derived objects to define this method diff --git a/Engine/source/gfx/gfxTextureProfile.cpp b/Engine/source/gfx/gfxTextureProfile.cpp index 98a63439c..cf3658be3 100644 --- a/Engine/source/gfx/gfxTextureProfile.cpp +++ b/Engine/source/gfx/gfxTextureProfile.cpp @@ -83,6 +83,21 @@ GFX_ImplementTextureProfile(GFXDynamicTextureSRGBProfile, GFXTextureProfile::DiffuseMap, GFXTextureProfile::Dynamic | GFXTextureProfile::SRGB, GFXTextureProfile::NONE); +GFX_ImplementTextureProfile(GFXDynamicCubemapTextureProfile, + GFXTextureProfile::DiffuseMap, + GFXTextureProfile::Dynamic | GFXTextureProfile::CubeMap, + GFXTextureProfile::NONE); +GFX_ImplementTextureProfile(GFXCubemapRenderTargetProfile, + GFXTextureProfile::DiffuseMap, + GFXTextureProfile::PreserveSize | GFXTextureProfile::RenderTarget | GFXTextureProfile::CubeMap, + GFXTextureProfile::NONE); +GFX_ImplementTextureProfile(GFXCubemapStaticTextureProfile, GFXTextureProfile::DiffuseMap, + GFXTextureProfile::Static | GFXTextureProfile::CubeMap, + GFXTextureProfile::NONE); +GFX_ImplementTextureProfile(GFXCubemapTexturePersistentProfile, + GFXTextureProfile::DiffuseMap, + GFXTextureProfile::PreserveSize | GFXTextureProfile::Static | GFXTextureProfile::KeepBitmap | GFXTextureProfile::CubeMap, + GFXTextureProfile::NONE); //----------------------------------------------------------------------------- diff --git a/Engine/source/gfx/gfxTextureProfile.h b/Engine/source/gfx/gfxTextureProfile.h index 46f2b690b..2c1170b73 100644 --- a/Engine/source/gfx/gfxTextureProfile.h +++ b/Engine/source/gfx/gfxTextureProfile.h @@ -88,7 +88,7 @@ public: KeepBitmap = BIT(7), ///< Always keep a copy of this texture's bitmap. (Potentially in addition to the API managed copy?) ZTarget = BIT(8), ///< This texture will be used as a Z target. SRGB = BIT(9), ///< sRGB texture - + CubeMap = BIT(10), ///< Cubemap texture /// Track and pool textures of this type for reuse. /// /// You should use this profile flag sparingly. Odd @@ -96,16 +96,15 @@ public: /// the pool to contain unused textures which will remain /// in memory until a flush occurs. /// - Pooled = BIT(10), + Pooled = BIT(11), /// A hint that the device is not allowed to discard the content /// of a target texture after presentation or deactivated. /// /// This is mainly a depth buffer optimization. - NoDiscard = BIT(11), + NoDiscard = BIT(12), - - NoModify = BIT(11) + NoModify = BIT(12) }; @@ -173,6 +172,7 @@ public: inline bool isPooled() const { return testFlag(Pooled); } inline bool canDiscard() const { return !testFlag(NoDiscard); } inline bool isSRGB() const { return testFlag(SRGB); } + inline bool isCubeMap() const { return testFlag(CubeMap); } //compare profile flags for equality inline bool compareFlags(const GFXTextureProfile& in_Cmp) const{ return (mProfile == in_Cmp.mProfile); } private: @@ -209,7 +209,7 @@ private: #define GFX_DeclareTextureProfile(name) extern GFXTextureProfile name #define GFX_ImplementTextureProfile(name, type, flags, compression) GFXTextureProfile name(#name, type, flags, compression) -// Default Texture profiles +// Default 2D Texture profiles // Texture we can render to. 
GFX_DeclareTextureProfile(GFXRenderTargetProfile); GFX_DeclareTextureProfile(GFXRenderTargetSRGBProfile); @@ -231,4 +231,14 @@ GFX_DeclareTextureProfile(GFXZTargetProfile); GFX_DeclareTextureProfile(GFXDynamicTextureProfile); GFX_DeclareTextureProfile(GFXDynamicTextureSRGBProfile); +// Default Cubemap Texture profiles +// Dynamic Texure +GFX_DeclareTextureProfile(GFXDynamicCubemapTextureProfile); +// Texture we can render to. +GFX_DeclareTextureProfile(GFXCubemapRenderTargetProfile); +// Standard static diffuse textures +GFX_DeclareTextureProfile(GFXCubemapStaticTextureProfile); +// Standard static diffuse textures that are persistent in memory +GFX_DeclareTextureProfile(GFXCubemapTexturePersistentProfile); + #endif diff --git a/Engine/source/gfx/gl/gfxGLTextureManager.cpp b/Engine/source/gfx/gl/gfxGLTextureManager.cpp index ae6ce898c..ca0a958f3 100644 --- a/Engine/source/gfx/gl/gfxGLTextureManager.cpp +++ b/Engine/source/gfx/gl/gfxGLTextureManager.cpp @@ -55,6 +55,7 @@ GFXTextureObject *GFXGLTextureManager::_createTextureObject( U32 height, U32 numMipLevels, bool forceMips, S32 antialiasLevel, + U32 arraySize, GFXTextureObject *inTex ) { AssertFatal(format >= 0 && format < GFXFormat_COUNT, "GFXGLTextureManager::_createTexture - invalid format!"); @@ -73,7 +74,7 @@ GFXTextureObject *GFXGLTextureManager::_createTextureObject( U32 height, retTex->registerResourceWithDevice( GFX ); } - innerCreateTexture(retTex, height, width, depth, format, profile, numMipLevels, forceMips); + innerCreateTexture(retTex, height, width, depth, format, profile, numMipLevels, forceMips, arraySize); return retTex; } @@ -89,19 +90,40 @@ void GFXGLTextureManager::innerCreateTexture( GFXGLTextureObject *retTex, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, - bool forceMips) + bool forceMips, + U32 arraySize) { // No 24 bit formats. They trigger various oddities because hardware (and Apple's drivers apparently...) don't natively support them. if (format == GFXFormatR8G8B8) format = GFXFormatR8G8B8A8; else if (format == GFXFormatR8G8B8_SRGB) format = GFXFormatR8G8B8A8_SRGB; - + + retTex->mProfile = profile; retTex->mFormat = format; retTex->mIsZombie = false; retTex->mIsNPoT2 = false; - GLenum binding = ( (height == 1 || width == 1) && ( height != width ) ) ? GL_TEXTURE_1D : ( (depth == 0) ? GL_TEXTURE_2D : GL_TEXTURE_3D ); + const bool isCube = profile->isCubeMap(); + GLenum binding; + + if (isCube) + { + binding = (arraySize > 1) ? GL_TEXTURE_CUBE_MAP_ARRAY : GL_TEXTURE_CUBE_MAP; + } + else + { + const bool is3D = (depth > 1); + const bool is1D = (height == 1 && width > 1); + + if (is3D) + binding = GL_TEXTURE_3D; + else if (is1D) + binding = (arraySize > 1) ? GL_TEXTURE_1D_ARRAY : GL_TEXTURE_1D; + else + binding = (arraySize > 1) ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D; + } + if((profile->testFlag(GFXTextureProfile::RenderTarget) || profile->testFlag(GFXTextureProfile::ZTarget)) && (!isPow2(width) || !isPow2(height)) && !depth) retTex->mIsNPoT2 = true; retTex->mBinding = binding; @@ -155,55 +177,155 @@ void GFXGLTextureManager::innerCreateTexture( GFXGLTextureObject *retTex, retTex->mMipLevels = getMaxMipmaps(width, height, 1); glTexParameteri(binding, GL_TEXTURE_MAX_LEVEL, retTex->mMipLevels-1 ); - - if( GFXGL->mCapabilities.textureStorage ) + + bool hasTexStorage = false; + // not supported when creating these. 
+ if (arraySize > 1 || isCube || profile->isDynamic()) + hasTexStorage = false; + + const bool isCompressed = ImageUtil::isCompressedFormat(format); + + // --- Allocation by binding --- + if (binding == GL_TEXTURE_CUBE_MAP) { - if(binding == GL_TEXTURE_2D) - glTexStorage2D( retTex->getBinding(), retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height ); - else if(binding == GL_TEXTURE_1D) - glTexStorage1D( retTex->getBinding(), retTex->mMipLevels, GFXGLTextureInternalFormat[format], getMax(width, height) ); - else - glTexStorage3D( retTex->getBinding(), retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height, depth ); + // Single cubemap: prefer glTexStorage2D if available, else per-face texImage2D + if (hasTexStorage) + { + // Some drivers accept texStorage2D with GL_TEXTURE_CUBE_MAP + glTexStorage2D(GL_TEXTURE_CUBE_MAP, retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height); + } + else + { + // Explicitly allocate each face/level + for (U32 face = 0; face < 6; ++face) + { + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, width >> mip); + U32 mipH = getMax(1u, height >> mip); + + if (isCompressed) + { + U32 size = getCompressedSurfaceSize(format, width, height, mip); + glCompressedTexImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face, mip, GFXGLTextureInternalFormat[format], mipW, mipH, 0, size, nullptr); + } + else + { + glTexImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + face, mip, GFXGLTextureInternalFormat[format], mipW, mipH, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], nullptr); + } + } + } + } } - else + else if (binding == GL_TEXTURE_CUBE_MAP_ARRAY) { - //If it wasn't for problems on amd drivers this next part could be really simplified and we wouldn't need to go through manually creating our - //mipmap pyramid and instead just use glGenerateMipmap - if(ImageUtil::isCompressedFormat(format)) - { - AssertFatal(binding == GL_TEXTURE_2D, - "GFXGLTextureManager::innerCreateTexture - Only compressed 2D textures are supported"); - - U32 tempWidth = width; - U32 tempHeight = height; - U32 size = getCompressedSurfaceSize(format,height,width); - //Fill compressed images with 0's - U8 *pTemp = (U8*)dMalloc(sizeof(U8)*size); - dMemset(pTemp,0,size); - - for(U32 i=0;i< retTex->mMipLevels;i++) - { - tempWidth = getMax( U32(1), width >> i ); - tempHeight = getMax( U32(1), height >> i ); - size = getCompressedSurfaceSize(format,width,height,i); - glCompressedTexImage2D(binding,i,GFXGLTextureInternalFormat[format],tempWidth,tempHeight,0,size,pTemp); - } - - dFree(pTemp); - } - else - { - if(binding == GL_TEXTURE_2D) - glTexImage2D(binding, 0, GFXGLTextureInternalFormat[format], width, height, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); - else if(binding == GL_TEXTURE_1D) - glTexImage1D(binding, 0, GFXGLTextureInternalFormat[format], (width > 1 ? 
width : height), 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); - else - glTexImage3D(GL_TEXTURE_3D, 0, GFXGLTextureInternalFormat[format], width, height, depth, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); - - if(retTex->mMipLevels > 1) - glGenerateMipmap(binding); - } + // cube-map array: layers = arraySize * 6 + U32 layers = getMax(1u, arraySize) * 6u; + if (hasTexStorage) + { + glTexStorage3D(GL_TEXTURE_CUBE_MAP_ARRAY, retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height, layers); + } + else + { + // fallback to glTexImage3D with NULL data + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, width >> mip); + U32 mipH = getMax(1u, height >> mip); + glTexImage3D(GL_TEXTURE_CUBE_MAP_ARRAY, mip, GFXGLTextureInternalFormat[format], mipW, mipH, layers, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } } + else if (binding == GL_TEXTURE_2D_ARRAY) + { + // 2D texture array: depth = arraySize (layers) + U32 layers = getMax(1u, arraySize); + if (hasTexStorage) + { + glTexStorage3D(GL_TEXTURE_2D_ARRAY, retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height, layers); + } + else + { + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, width >> mip); + U32 mipH = getMax(1u, height >> mip); + glTexImage3D(GL_TEXTURE_2D_ARRAY, mip, GFXGLTextureInternalFormat[format], mipW, mipH, layers, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } + } + else if (binding == GL_TEXTURE_1D_ARRAY) + { + // 1D array stored as GL_TEXTURE_1D_ARRAY. glTexStorage2D can be used for 1D arrays with height=layers on many drivers. + U32 layers = getMax(1u, arraySize); + if (hasTexStorage) + { + // glTexStorage2D works for GL_TEXTURE_1D_ARRAY (width, layers) + glTexStorage2D(GL_TEXTURE_1D_ARRAY, retTex->mMipLevels, GFXGLTextureInternalFormat[format], getMax(width, height), layers); + } + else + { + // fallback: allocate as 2D where the "height" dimension is layers via glTexImage2D? Not ideal. 
+ // Safer: use glTexImage2D with target GL_TEXTURE_1D_ARRAY is invalid; instead use glTexImage3D with depth=layers + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, getMax(width, height) >> mip); + glTexImage3D(GL_TEXTURE_1D_ARRAY, mip, GFXGLTextureInternalFormat[format], mipW, layers, 1, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } + } + else if (binding == GL_TEXTURE_1D) + { + if (hasTexStorage) + glTexStorage1D(GL_TEXTURE_1D, retTex->mMipLevels, GFXGLTextureInternalFormat[format], getMax(width, height)); + else + { + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, getMax(width, height) >> mip); + glTexImage1D(GL_TEXTURE_1D, mip, GFXGLTextureInternalFormat[format], mipW, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } + } + else if (binding == GL_TEXTURE_3D) + { + if (hasTexStorage) + glTexStorage3D(GL_TEXTURE_3D, retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height, depth); + else + { + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, width >> mip); + U32 mipH = getMax(1u, height >> mip); + U32 mipD = getMax(1u, depth >> mip); + glTexImage3D(GL_TEXTURE_3D, mip, GFXGLTextureInternalFormat[format], mipW, mipH, mipD, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } + } + else // GL_TEXTURE_2D (default) + { + if (hasTexStorage) + glTexStorage2D(GL_TEXTURE_2D, retTex->mMipLevels, GFXGLTextureInternalFormat[format], width, height); + else + { + for (U32 mip = 0; mip < retTex->mMipLevels; ++mip) + { + U32 mipW = getMax(1u, width >> mip); + U32 mipH = getMax(1u, height >> mip); + + if (isCompressed) + { + U32 size = getCompressedSurfaceSize(format, width, height, mip); + glCompressedTexImage2D(GL_TEXTURE_2D, mip, GFXGLTextureInternalFormat[format], mipW, mipH, 0, size, nullptr); + } + else + { + glTexImage2D(GL_TEXTURE_2D, mip, GFXGLTextureInternalFormat[format], mipW, mipH, 0, GFXGLTextureFormat[format], GFXGLTextureType[format], NULL); + } + } + } + } + // Complete the texture // Complete the texture - this does get changed later but we need to complete the texture anyway @@ -221,14 +343,20 @@ void GFXGLTextureManager::innerCreateTexture( GFXGLTextureObject *retTex, if(GFXGLTextureSwizzle[format]) glTexParameteriv(binding, GL_TEXTURE_SWIZZLE_RGBA, GFXGLTextureSwizzle[format]); - // Get the size from GL (you never know...) - GLint texHeight, texWidth, texDepth = 0; - - glGetTexLevelParameteriv(binding, 0, GL_TEXTURE_WIDTH, &texWidth); - glGetTexLevelParameteriv(binding, 0, GL_TEXTURE_HEIGHT, &texHeight); - if(binding == GL_TEXTURE_3D) - glGetTexLevelParameteriv(binding, 0, GL_TEXTURE_DEPTH, &texDepth); - + GLint texHeight = 0, texWidth = 0, texDepth = 0; + + GLenum queryTarget = binding; + if (binding == GL_TEXTURE_CUBE_MAP) + { + // Query a specific face, e.g. 
+X + queryTarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X; + } + + glGetTexLevelParameteriv(queryTarget, 0, GL_TEXTURE_WIDTH, &texWidth); + glGetTexLevelParameteriv(queryTarget, 0, GL_TEXTURE_HEIGHT, &texHeight); + if (binding == GL_TEXTURE_3D) + glGetTexLevelParameteriv(GL_TEXTURE_3D, 0, GL_TEXTURE_DEPTH, &texDepth); + retTex->mTextureSize.set(texWidth, texHeight, texDepth); } @@ -236,7 +364,7 @@ void GFXGLTextureManager::innerCreateTexture( GFXGLTextureObject *retTex, // loadTexture - GBitmap //----------------------------------------------------------------------------- -static void _textureUpload(const S32 width, const S32 height,const S32 bytesPerPixel,const GFXGLTextureObject* texture, const GFXFormat fmt, const U8* data,const S32 mip=0, Swizzle *pSwizzle = NULL) +static void _textureUpload(const S32 width, const S32 height,const S32 bytesPerPixel,const GFXGLTextureObject* texture, const GFXFormat fmt, const U8* data,const S32 mip=0, const U32 face = 0, Swizzle *pSwizzle = NULL) { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture->getBuffer()); U32 bufSize = width * height * bytesPerPixel; @@ -256,7 +384,9 @@ static void _textureUpload(const S32 width, const S32 height,const S32 bytesPerP glBufferSubData(GL_PIXEL_UNPACK_BUFFER, 0, bufSize, data); } - if (texture->getBinding() == GL_TEXTURE_2D) + if(texture->getBinding() == GL_TEXTURE_CUBE_MAP) + glTexSubImage2D(GFXGLFaceType[face], mip, 0, 0, width, height, GFXGLTextureFormat[fmt], GFXGLTextureType[fmt], NULL); + else if (texture->getBinding() == GL_TEXTURE_2D) glTexSubImage2D(texture->getBinding(), mip, 0, 0, width, height, GFXGLTextureFormat[fmt], GFXGLTextureType[fmt], NULL); else glTexSubImage1D(texture->getBinding(), mip, 0, (width > 1 ? width : height), GFXGLTextureFormat[fmt], GFXGLTextureType[fmt], NULL); @@ -266,76 +396,125 @@ static void _textureUpload(const S32 width, const S32 height,const S32 bytesPerP bool GFXGLTextureManager::_loadTexture(GFXTextureObject *aTexture, GBitmap *pDL) { - PROFILE_SCOPE(GFXGLTextureManager_loadTexture); + PROFILE_SCOPE(GFXGLTextureManager_loadTextureGBitmap); GFXGLTextureObject *texture = static_cast(aTexture); - - AssertFatal(texture->getBinding() == GL_TEXTURE_1D || texture->getBinding() == GL_TEXTURE_2D, - "GFXGLTextureManager::_loadTexture(GBitmap) - This method can only be used with 1D/2D textures"); + + const GLenum target = texture->getBinding(); + + AssertFatal(target == GL_TEXTURE_1D || target == GL_TEXTURE_2D || target == GL_TEXTURE_CUBE_MAP, + "GFXGLTextureManager::_loadTexture(GBitmap) - This method can only be used with 1D/2D and CubeMap textures"); if(texture->getBinding() == GL_TEXTURE_3D) return false; - - // No 24bit formats. - if(pDL->getFormat() == GFXFormatR8G8B8) - pDL->setFormat(GFXFormatR8G8B8A8); - else if (pDL->getFormat() == GFXFormatR8G8B8_SRGB) - pDL->setFormat(GFXFormatR8G8B8A8_SRGB); + // + //// No 24bit formats. + //if(pDL->getFormat() == GFXFormatR8G8B8) + // pDL->setFormat(GFXFormatR8G8B8A8); + //else if (pDL->getFormat() == GFXFormatR8G8B8_SRGB) + // pDL->setFormat(GFXFormatR8G8B8A8_SRGB); + // Bind to edit PRESERVE_TEXTURE(texture->getBinding()); glBindTexture(texture->getBinding(), texture->getHandle()); - _textureUpload(pDL->getWidth(),pDL->getHeight(),pDL->getBytesPerPixel(),texture,pDL->getFormat(), pDL->getBits(), 0); + const U32 mipLevels = texture->getMipLevels(); + const bool isCubemap = (target == GL_TEXTURE_CUBE_MAP) && pDL->getNumFaces() > 1; + U32 faceCount = isCubemap ? 
6 : 1; - if(!ImageUtil::isCompressedFormat(pDL->getFormat())) - glGenerateMipmap(texture->getBinding()); - + for (U32 mip = 0; mip < mipLevels; mip++) + { + const GLsizei width = getMax(1u, pDL->getWidth(mip)); + const GLsizei height = getMax(1u, pDL->getHeight(mip)); + for (U32 face = 0; face < faceCount; ++face) + { + _textureUpload(width, height, pDL->getBytesPerPixel(), texture, pDL->getFormat(), pDL->getBits(mip,face), mip, face); + } + } + + if(!ImageUtil::isCompressedFormat(pDL->getFormat())) + glGenerateMipmap(texture->getBinding()); + + glBindTexture(target, 0); return true; } bool GFXGLTextureManager::_loadTexture(GFXTextureObject *aTexture, DDSFile *dds) { + PROFILE_SCOPE(GFXGLTextureManager_loadTextureDDS); GFXGLTextureObject* texture = static_cast(aTexture); - - AssertFatal(texture->getBinding() == GL_TEXTURE_2D, - "GFXGLTextureManager::_loadTexture(DDSFile) - This method can only be used with 2D textures"); - - if(texture->getBinding() != GL_TEXTURE_2D) - return false; - - PRESERVE_TEXTURE(texture->getBinding()); - glBindTexture(texture->getBinding(), texture->getHandle()); - U32 numMips = dds->mSurfaces[0]->mMips.size(); + + const GLenum target = texture->getBinding(); + + const bool isCube = texture->getBinding() == GL_TEXTURE_CUBE_MAP && dds->isCubemap(); + const bool isCompressed = ImageUtil::isCompressedFormat(texture->mFormat); + + AssertFatal(target == GL_TEXTURE_1D || target == GL_TEXTURE_2D || target == GL_TEXTURE_CUBE_MAP, + "GFXGLTextureManager::_loadTexture(DDS) - This method can only be used with 1D/2D and CubeMap textures"); + + if (texture->getBinding() == GL_TEXTURE_3D) + return false; + + PRESERVE_TEXTURE(target); + glBindTexture(target, texture->getHandle()); + + const U32 numFaces = isCube ? 6 : 1; + const U32 numMips = dds->mSurfaces[0]->mMips.size(); const GFXFormat fmt = texture->mFormat; - for(U32 i = 0; i < numMips; i++) + for (U32 face = 0; face < numFaces; ++face) { - PROFILE_SCOPE(GFXGLTexMan_loadSurface); + // Skip empty surfaces + if (!dds->mSurfaces[face]) + continue; - if(ImageUtil::isCompressedFormat(texture->mFormat)) + for (U32 mip = 0; mip < numMips; ++mip) { - if((!isPow2(dds->getWidth()) || !isPow2(dds->getHeight())) && GFX->getCardProfiler()->queryProfile("GL::Workaround::noCompressedNPoTTextures")) + const U32 mipWidth = getMax(1u, dds->getWidth(mip)); + const U32 mipHeight = getMax(1u, dds->getHeight(mip)); + + GLenum uploadTarget = target; + if (isCube) + uploadTarget = GFXGLFaceType[face]; + + if (isCompressed) { - U8* uncompressedTex = new U8[dds->getWidth(i) * dds->getHeight(i) * 4]; - ImageUtil::decompress(dds->mSurfaces[0]->mMips[i],uncompressedTex, dds->getWidth(i), dds->getHeight(i), fmt); - glTexSubImage2D(texture->getBinding(), i, 0, 0, dds->getWidth(i), dds->getHeight(i), GL_RGBA, GL_UNSIGNED_BYTE, uncompressedTex); - delete[] uncompressedTex; + // Handle NPOT workaround + if ((!isPow2(mipWidth) || !isPow2(mipHeight)) && GFX->getCardProfiler()->queryProfile("GL::Workaround::noCompressedNPoTTextures")) + { + U8* uncompressedTex = new U8[mipWidth * mipHeight * 4]; + ImageUtil::decompress(dds->mSurfaces[face]->mMips[mip], uncompressedTex, mipWidth, mipHeight, fmt); + glTexSubImage2D(uploadTarget, + mip, 0, 0, mipWidth, mipHeight, GL_RGBA, GL_UNSIGNED_BYTE, uncompressedTex + ); + delete[] uncompressedTex; + } + else + { + glCompressedTexImage2D(uploadTarget, + mip, GFXGLTextureInternalFormat[fmt], mipWidth, mipHeight, 0, + dds->getSurfaceSize(mip), dds->mSurfaces[face]->mMips[mip] + ); + } } else - 
glCompressedTexSubImage2D(texture->getBinding(), i, 0, 0, dds->getWidth(i), dds->getHeight(i), GFXGLTextureInternalFormat[fmt], dds->getSurfaceSize(dds->getHeight(), dds->getWidth(), i), dds->mSurfaces[0]->mMips[i]); - } - else - { - Swizzle *pSwizzle = NULL; - if (fmt == GFXFormatR8G8B8A8 || fmt == GFXFormatR8G8B8X8 || fmt == GFXFormatR8G8B8A8_SRGB || fmt == GFXFormatR8G8B8A8_LINEAR_FORCE || fmt == GFXFormatB8G8R8A8) - pSwizzle = &Swizzles::bgra; + { + Swizzle* pSwizzle = nullptr; + if (fmt == GFXFormatR8G8B8A8 || fmt == GFXFormatR8G8B8X8 || fmt == GFXFormatR8G8B8A8_SRGB || + fmt == GFXFormatR8G8B8A8_LINEAR_FORCE || fmt == GFXFormatB8G8R8A8) + pSwizzle = &Swizzles::bgra; + + _textureUpload( + mipWidth, mipHeight, dds->mBytesPerPixel, texture, fmt, + dds->mSurfaces[face]->mMips[mip], mip, face, pSwizzle); + } - _textureUpload(dds->getWidth(i), dds->getHeight(i),dds->mBytesPerPixel, texture, fmt, dds->mSurfaces[0]->mMips[i],i, pSwizzle); } } - if(numMips !=1 && !ImageUtil::isCompressedFormat(texture->mFormat)) + if (numMips != 1 && !isCompressed) glGenerateMipmap(texture->getBinding()); - + + glBindTexture(target, 0); return true; } diff --git a/Engine/source/gfx/gl/gfxGLTextureManager.h b/Engine/source/gfx/gl/gfxGLTextureManager.h index 5e5aaf529..2c5f1e09e 100644 --- a/Engine/source/gfx/gl/gfxGLTextureManager.h +++ b/Engine/source/gfx/gl/gfxGLTextureManager.h @@ -45,6 +45,7 @@ protected: U32 numMipLevels, bool forceMips = false, S32 antialiasLevel = 0, + U32 arraySize = 1, GFXTextureObject *inTex = NULL ) override; bool _loadTexture(GFXTextureObject *texture, DDSFile *dds) override; bool _loadTexture(GFXTextureObject *texture, GBitmap *bmp) override; @@ -56,7 +57,7 @@ private: friend class GFXGLTextureObject; /// Creates internal GL texture - void innerCreateTexture(GFXGLTextureObject *obj, U32 height, U32 width, U32 depth, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false); + void innerCreateTexture(GFXGLTextureObject *obj, U32 height, U32 width, U32 depth, GFXFormat format, GFXTextureProfile *profile, U32 numMipLevels, bool forceMips = false, U32 arraySize = 1); }; -#endif \ No newline at end of file +#endif diff --git a/Engine/source/gfx/gl/gfxGLTextureObject.cpp b/Engine/source/gfx/gl/gfxGLTextureObject.cpp index 06ee43dc0..651059d34 100644 --- a/Engine/source/gfx/gl/gfxGLTextureObject.cpp +++ b/Engine/source/gfx/gl/gfxGLTextureObject.cpp @@ -64,7 +64,7 @@ GFXGLTextureObject::~GFXGLTextureObject() kill(); } -GFXLockedRect* GFXGLTextureObject::lock(U32 mipLevel, RectI *inRect) +GFXLockedRect* GFXGLTextureObject::lock(U32 mipLevel /*= 0*/, RectI* inRect /*= NULL*/, U32 faceIndex /*= 0*/) { //AssertFatal(mBinding != GL_TEXTURE_3D, "GFXGLTextureObject::lock - We don't support locking 3D textures yet"); U32 width = mTextureSize.x >> mipLevel; @@ -100,7 +100,7 @@ GFXLockedRect* GFXGLTextureObject::lock(U32 mipLevel, RectI *inRect) return &mLockedRect; } -void GFXGLTextureObject::unlock(U32 mipLevel) +void GFXGLTextureObject::unlock(U32 mipLevel /*= 0*/, U32 faceIndex /*= 0*/) { if(!mLockedRect.bits) return; @@ -175,38 +175,231 @@ bool GFXGLTextureObject::copyToBmp(GBitmap * bmp) FrameAllocatorMarker mem; + const bool isCubemap = (mBinding == GL_TEXTURE_CUBE_MAP); + const U32 numFaces = isCubemap ? 
6 : 1; - U32 mipLevels = getMipLevels(); - for (U32 mip = 0; mip < mipLevels; mip++) + for (U32 mip = 0; mip < getMipLevels(); mip++) { - U32 srcPixelCount = bmp->getSurfaceSize(mip)/ srcBytesPerPixel; + U32 width = getWidth() >> mip; + U32 height = getHeight() >> mip; + if (width == 0) width = 1; + if (height == 0) height = 1; - U8* dest = bmp->getWritableBits(mip); - U8* orig = (U8*)mem.alloc(srcPixelCount * srcBytesPerPixel); + // Check if multisampled + GLint samples = 0; + GLenum target = mBinding; + if (mBinding == GL_TEXTURE_CUBE_MAP) + target = GL_TEXTURE_CUBE_MAP_POSITIVE_X; - glGetTexImage(mBinding, mip, GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], orig); - if (mFormat == GFXFormatR16G16B16A16F) + glGetTexLevelParameteriv(target, mip, GL_TEXTURE_SAMPLES, &samples); + if (samples > 0) { - dMemcpy(dest, orig, srcPixelCount * srcBytesPerPixel); + Con::warnf("GFXGLTextureObject::copyToBmp - Texture is multisampled (%d samples) at mip %d; resolve first.", samples, mip); + return false; } - else - { - for (int i = 0; i < srcPixelCount; ++i) - { - dest[0] = orig[0]; - dest[1] = orig[1]; - dest[2] = orig[2]; - if (dstBytesPerPixel == 4) - dest[3] = orig[3]; - orig += srcBytesPerPixel; - dest += dstBytesPerPixel; + for (U32 face = 0; face < numFaces; face++) + { + GLenum faceTarget = isCubemap ? GFXGLFaceType[face] : mBinding; + + U32 pixelCount = width * height; + U8* srcPixels = (U8*)mem.alloc(pixelCount * srcBytesPerPixel); + U8* dest = bmp->getWritableBits(mip, face); + + if (!dest) + { + Con::errorf("GFXGLTextureObject::copyToBmp - No destination bits for mip=%u face=%u", mip, face); + continue; + } + + // Read texture data + glGetTexImage(faceTarget, mip, GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], srcPixels); + + if (mFormat == GFXFormatR16G16B16A16F) + { + dMemcpy(dest, srcPixels, pixelCount * srcBytesPerPixel); + } + else + { + // Simple 8-bit per channel copy (RGBA) + U8* src = srcPixels; + for (U32 i = 0; i < pixelCount; ++i) + { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + if (dstBytesPerPixel == 4) + dest[3] = src[3]; + + src += srcBytesPerPixel; + dest += dstBytesPerPixel; + } + } + } // face + } // mip + + glBindTexture(mBinding, 0); + return true; +} + +void GFXGLTextureObject::updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 face) +{ + if (!texHandle.isValid()) + return; + + GFXGLTextureObject* srcTex = static_cast(texHandle.getPointer()); + if (!srcTex || srcTex->getHandle() == 0) + return; + + const GLenum dstTarget = mBinding; // destination binding (this) + const GLenum srcTarget = srcTex->getBinding(); // source binding + const bool srcIsCube = (srcTarget == GL_TEXTURE_CUBE_MAP || srcTarget == GL_TEXTURE_CUBE_MAP_ARRAY); + + // Determine list of faces to copy from source + U32 firstFace = 0; + U32 faceCount = 1; + if (face >= 0) + { + firstFace = (U32)face; + faceCount = 1; + } + else if (srcIsCube) + { + firstFace = 0; + faceCount = 6; + } + else + { + firstFace = 0; + faceCount = 1; + } + + // Ensure textures are valid + if (!glIsTexture(mHandle) || !glIsTexture(srcTex->getHandle())) + { + Con::errorf("updateTextureSlot: invalid GL texture handle src=%u dst=%u", srcTex->getHandle(), mHandle); + return; + } + + // If copyImage supported, prefer that. 
We'll copy face-by-face (one-layer depth = 1) + if (GFXGL->mCapabilities.copyImage) + { + for (U32 mip = 0; mip < getMipLevels(); ++mip) + { + const GLsizei mipW = getMax(1u, srcTex->getWidth() >> mip); + const GLsizei mipH = getMax(1u, srcTex->getHeight() >> mip); + + for (U32 f = firstFace; f < firstFace + faceCount; ++f) + { + // Compute source z offset (for cube arrays it's layer index; for cubemap it's face index) + GLint srcZ = 0; + if (srcTarget == GL_TEXTURE_CUBE_MAP_ARRAY) + { + srcZ = f; + } + else if (srcTarget == GL_TEXTURE_CUBE_MAP) + { + srcZ = f; + } + else + { + srcZ = 0; // 2D source + } + + // Compute destination layer (z offset) depending on destination type + GLint dstZ = 0; + if (dstTarget == GL_TEXTURE_CUBE_MAP_ARRAY) + { + // each slot is a whole cubemap => slot * 6 + faceIndex + dstZ = (GLint)(slot * 6 + f); + } + else if (dstTarget == GL_TEXTURE_2D_ARRAY) + { + dstZ = (GLint)slot; // each slot is a single layer + } + else if (dstTarget == GL_TEXTURE_CUBE_MAP) + { + dstZ = (GLint)f; + } + else + { + dstZ = 0; // 2D texture target + } + + // Copy single layer/face at this mip + glCopyImageSubData( + srcTex->getHandle(), srcTarget, mip, 0, 0, srcZ, + mHandle, dstTarget, mip, 0, 0, dstZ, + mipW, mipH, 1 + ); + + GLenum err = glGetError(); + if (err != GL_NO_ERROR) + Con::errorf("glCopyImageSubData failed with 0x%X (mip=%u face=%u)", err, mip, f); } } - } - glBindTexture(mBinding, 0); - return true; + return; + } + + // Fallback: CPU-side copy using glGetTexImage + glTexSubImage + for (U32 mip = 0; mip < getMipLevels() && mip < srcTex->getMipLevels(); ++mip) + { + const GLsizei mipW = getMax(1u, srcTex->getWidth() >> mip); + const GLsizei mipH = getMax(1u, srcTex->getHeight() >> mip); + const U32 pixelSize = GFXFormat_getByteSize(mFormat); // assuming same fmt for src/dst + const U32 dataSize = mipW * mipH * pixelSize; + + FrameAllocatorMarker mem; + U8* buffer = (U8*)mem.alloc(dataSize); + + glBindTexture(srcTarget, srcTex->getHandle()); + glBindTexture(dstTarget, mHandle); + + for (U32 f = firstFace; f < firstFace + faceCount; ++f) + { + GLenum srcFaceTarget = srcTarget; + if (srcTarget == GL_TEXTURE_CUBE_MAP) + srcFaceTarget = GFXGLFaceType[f]; + + // read pixels from source + glGetTexImage(srcFaceTarget, mip, GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], buffer); + + GLint dstLayer = 0; + if (dstTarget == GL_TEXTURE_CUBE_MAP_ARRAY) + dstLayer = (GLint)(slot * 6 + f); + else if (dstTarget == GL_TEXTURE_2D_ARRAY) + dstLayer = (GLint)slot; + else if (dstTarget == GL_TEXTURE_CUBE_MAP) + dstLayer = (GLint)f; + else + dstLayer = 0; + + if (dstTarget == GL_TEXTURE_CUBE_MAP) + { + GLenum dstFaceTarget = GFXGLFaceType[f]; + glTexSubImage2D(dstFaceTarget, mip, 0, 0, mipW, mipH, + GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], buffer); + } + else if (dstTarget == GL_TEXTURE_2D) + { + glTexSubImage2D(GL_TEXTURE_2D, mip, 0, 0, mipW, mipH, + GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], buffer); + } + else if (dstTarget == GL_TEXTURE_2D_ARRAY || dstTarget == GL_TEXTURE_CUBE_MAP_ARRAY) + { + glTexSubImage3D(dstTarget, mip, 0, 0, dstLayer, mipW, mipH, 1, + GFXGLTextureFormat[mFormat], GFXGLTextureType[mFormat], buffer); + } + } + + glBindTexture(dstTarget, 0); + glBindTexture(srcTarget, 0); + } +} + +void GFXGLTextureObject::copyTo(GFXTextureObject* dstTex) +{ } void GFXGLTextureObject::initSamplerState(const GFXSamplerStateDesc &ssd) diff --git a/Engine/source/gfx/gl/gfxGLTextureObject.h b/Engine/source/gfx/gl/gfxGLTextureObject.h index 
2bd7b1b7f..b62f6da40 100644 --- a/Engine/source/gfx/gl/gfxGLTextureObject.h +++ b/Engine/source/gfx/gl/gfxGLTextureObject.h @@ -26,7 +26,9 @@ #include "gfx/gfxTextureObject.h" #include "gfx/gl/tGL/tGL.h" #include "gfx/gfxStateBlock.h" - +#ifndef _MRECT_H_ +#include "math/mRect.h" +#endif class GFXGLDevice; class GFXGLTextureObject : public GFXTextureObject @@ -64,11 +66,13 @@ public: /// Get/set data from texture (for dynamic textures and render targets) /// @attention DO NOT READ FROM THE RETURNED RECT! It is not guaranteed to work and may incur significant performance penalties. - GFXLockedRect* lock(U32 mipLevel = 0, RectI *inRect = NULL) override; - void unlock(U32 mipLevel = 0 ) override; + GFXLockedRect* lock(U32 mipLevel = 0, RectI *inRect = NULL, U32 faceIndex = 0) override; + void unlock(U32 mipLevel = 0, U32 faceIndex = 0) override; bool copyToBmp(GBitmap *) override; ///< Not implemented - + void updateTextureSlot(const GFXTexHandle& texHandle, const U32 slot, const S32 face = -1) override; + void copyTo(GFXTextureObject* dstTex) override; + void generateMipMaps() override {}; bool mIsNPoT2; // GFXResource interface diff --git a/Engine/source/gfx/gl/gfxGLTextureTarget.cpp b/Engine/source/gfx/gl/gfxGLTextureTarget.cpp index 9a5661b3b..6d90faedb 100644 --- a/Engine/source/gfx/gl/gfxGLTextureTarget.cpp +++ b/Engine/source/gfx/gl/gfxGLTextureTarget.cpp @@ -38,9 +38,9 @@ public: mipLevel(_mipLevel), zOffset(_zOffset) { } - + virtual ~_GFXGLTargetDesc() {} - + virtual U32 getHandle() = 0; virtual U32 getWidth() = 0; virtual U32 getHeight() = 0; @@ -49,10 +49,10 @@ public: virtual GLenum getBinding() = 0; virtual GFXFormat getFormat() = 0; virtual bool isCompatible(const GFXGLTextureObject* tex) = 0; - + U32 getMipLevel() { return mipLevel; } U32 getZOffset() { return zOffset; } - + private: U32 mipLevel; U32 zOffset; @@ -62,19 +62,21 @@ private: class _GFXGLTextureTargetDesc : public _GFXGLTargetDesc { public: - _GFXGLTextureTargetDesc(GFXGLTextureObject* tex, U32 _mipLevel, U32 _zOffset) - : _GFXGLTargetDesc(_mipLevel, _zOffset), mTex(tex) + + _GFXGLTextureTargetDesc(GFXGLTextureObject* tex, U32 _mipLevel, U32 _zOffset, U32 _face = 0, bool isCube = false) + : _GFXGLTargetDesc(_mipLevel, _zOffset), mTex(tex), mFace(_face), mIsCube(isCube) { } - + virtual ~_GFXGLTextureTargetDesc() {} - + U32 getHandle() override { return mTex->getHandle(); } U32 getWidth() override { return mTex->getWidth(); } U32 getHeight() override { return mTex->getHeight(); } U32 getDepth() override { return mTex->getDepth(); } + U32 getFace() { return mFace; } bool hasMips() override { return mTex->mMipLevels != 1; } - GLenum getBinding() override { return mTex->getBinding(); } + GLenum getBinding() override { return mIsCube ? 
GFXGLFaceType[mFace] : mTex->getBinding(); } GFXFormat getFormat() override { return mTex->getFormat(); } bool isCompatible(const GFXGLTextureObject* tex) override { @@ -82,40 +84,12 @@ public: && mTex->getWidth() == tex->getWidth() && mTex->getHeight() == tex->getHeight(); } - GFXGLTextureObject* getTextureObject() const {return mTex; } - + GFXGLTextureObject* getTextureObject() const { return mTex; } + private: StrongRefPtr mTex; -}; - -/// Internal struct used to track Cubemap texture information for FBO attachment -class _GFXGLCubemapTargetDesc : public _GFXGLTargetDesc -{ -public: - _GFXGLCubemapTargetDesc(GFXGLCubemap* tex, U32 _face, U32 _mipLevel, U32 _zOffset) - : _GFXGLTargetDesc(_mipLevel, _zOffset), mTex(tex), mFace(_face) - { - } - - virtual ~_GFXGLCubemapTargetDesc() {} - - U32 getHandle() override { return mTex->getHandle(); } - U32 getWidth() override { return mTex->getWidth(); } - U32 getHeight() override { return mTex->getHeight(); } - U32 getDepth() override { return 0; } - bool hasMips() override { return mTex->getMipMapLevels() != 1; } - GLenum getBinding() override { return GFXGLCubemap::getEnumForFaceNumber(mFace); } - GFXFormat getFormat() override { return mTex->getFormat(); } - bool isCompatible(const GFXGLTextureObject* tex) override - { - return mTex->getFormat() == tex->getFormat() - && mTex->getWidth() == tex->getWidth() - && mTex->getHeight() == tex->getHeight(); - } - -private: - StrongRefPtr mTex; U32 mFace; + bool mIsCube; }; // Internal implementations @@ -123,9 +97,9 @@ class _GFXGLTextureTargetImpl // TODO OPENGL remove and implement on GFXGLTextur { public: GFXGLTextureTarget* mTarget; - + virtual ~_GFXGLTextureTargetImpl() {} - + virtual void applyState() = 0; virtual void makeActive() = 0; virtual void finish() = 0; @@ -137,10 +111,10 @@ class _GFXGLTextureTargetFBOImpl : public _GFXGLTextureTargetImpl public: GLuint mFramebuffer; bool mGenMips; - + _GFXGLTextureTargetFBOImpl(GFXGLTextureTarget* target); virtual ~_GFXGLTextureTargetFBOImpl(); - + void applyState() override; void makeActive() override; void finish() override; @@ -159,42 +133,42 @@ _GFXGLTextureTargetFBOImpl::~_GFXGLTextureTargetFBOImpl() } void _GFXGLTextureTargetFBOImpl::applyState() -{ +{ // REMINDER: When we implement MRT support, check against GFXGLDevice::getNumRenderTargets() - + PRESERVE_FRAMEBUFFER(); glBindFramebuffer(GL_FRAMEBUFFER, mFramebuffer); glEnable(GL_FRAMEBUFFER_SRGB); bool drawbufs[16]; int bufsize = 0; for (int i = 0; i < 16; i++) - drawbufs[i] = false; + drawbufs[i] = false; bool hasColor = false; - for(int i = 0; i < GFXGL->getNumRenderTargets(); ++i) - { - _GFXGLTargetDesc* color = mTarget->getTargetDesc( static_cast(GFXTextureTarget::Color0+i )); - if(color) + for (int i = 0; i < GFXGL->getNumRenderTargets(); ++i) + { + _GFXGLTargetDesc* color = mTarget->getTargetDesc(static_cast(GFXTextureTarget::Color0 + i)); + if (color) { hasColor = true; const GLenum binding = color->getBinding(); - if( binding == GL_TEXTURE_2D || (binding >= GL_TEXTURE_CUBE_MAP_POSITIVE_X && binding <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z) ) - glFramebufferTexture2D( GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding( ), color->getHandle( ), color->getMipLevel( ) ); - else if( binding == GL_TEXTURE_1D ) - glFramebufferTexture1D( GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding( ), color->getHandle( ), color->getMipLevel( ) ); - else if( binding == GL_TEXTURE_3D ) - glFramebufferTexture3D( GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding( ), color->getHandle( ), 
color->getMipLevel( ), color->getZOffset( ) ); + if (binding == GL_TEXTURE_2D || (binding >= GL_TEXTURE_CUBE_MAP_POSITIVE_X && binding <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z)) + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding(), color->getHandle(), color->getMipLevel()); + else if (binding == GL_TEXTURE_1D) + glFramebufferTexture1D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding(), color->getHandle(), color->getMipLevel()); + else if (binding == GL_TEXTURE_3D) + glFramebufferTexture3D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, color->getBinding(), color->getHandle(), color->getMipLevel(), color->getZOffset()); else - Con::errorf("_GFXGLTextureTargetFBOImpl::applyState - Bad binding"); + Con::errorf("_GFXGLTextureTargetFBOImpl::applyState - Bad binding"); } else { // Clears the texture (note that the binding is irrelevent) - glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0+i, GL_TEXTURE_2D, 0, 0); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, GL_TEXTURE_2D, 0, 0); } } - + _GFXGLTargetDesc* depthStecil = mTarget->getTargetDesc(GFXTextureTarget::DepthStencil); - if(depthStecil) + if (depthStecil) { // Certain drivers have issues with depth only FBOs. That and the next two asserts assume we have a color target. AssertFatal(hasColor, "GFXGLTextureTarget::applyState() - Cannot set DepthStencil target without Color0 target!"); @@ -206,40 +180,40 @@ void _GFXGLTextureTargetFBOImpl::applyState() glFramebufferTexture2D(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, 0, 0); } - GLenum *buf = new GLenum[bufsize]; + GLenum* buf = new GLenum[bufsize]; int count = 0; for (int i = 0; i < bufsize; i++) { - if (drawbufs[i]) - { - buf[count] = GL_COLOR_ATTACHMENT0 + i; - count++; - } + if (drawbufs[i]) + { + buf[count] = GL_COLOR_ATTACHMENT0 + i; + count++; + } } - + glDrawBuffers(bufsize, buf); - + delete[] buf; CHECK_FRAMEBUFFER_STATUS(); } void _GFXGLTextureTargetFBOImpl::makeActive() { - glBindFramebuffer(GL_FRAMEBUFFER, mFramebuffer); - GFXGL->getOpenglCache()->setCacheBinded(GL_FRAMEBUFFER, mFramebuffer); + glBindFramebuffer(GL_FRAMEBUFFER, mFramebuffer); + GFXGL->getOpenglCache()->setCacheBinded(GL_FRAMEBUFFER, mFramebuffer); - int i = 0; - GLenum draws[16]; - for( i = 0; i < GFXGL->getNumRenderTargets(); ++i) - { - _GFXGLTargetDesc* color = mTarget->getTargetDesc( static_cast(GFXTextureTarget::Color0+i )); - if(color) - draws[i] = GL_COLOR_ATTACHMENT0 + i; - else - break; - } + int i = 0; + GLenum draws[16]; + for (i = 0; i < GFXGL->getNumRenderTargets(); ++i) + { + _GFXGLTargetDesc* color = mTarget->getTargetDesc(static_cast(GFXTextureTarget::Color0 + i)); + if (color) + draws[i] = GL_COLOR_ATTACHMENT0 + i; + else + break; + } - glDrawBuffers( i, draws ); + glDrawBuffers(i, draws); } void _GFXGLTextureTargetFBOImpl::finish() @@ -250,20 +224,20 @@ void _GFXGLTextureTargetFBOImpl::finish() if (!mGenMips) return; - for(int i = 0; i < GFXGL->getNumRenderTargets(); ++i) - { - _GFXGLTargetDesc* color = mTarget->getTargetDesc( static_cast(GFXTextureTarget::Color0+i ) ); - if(!color || !(color->hasMips())) + for (int i = 0; i < GFXGL->getNumRenderTargets(); ++i) + { + _GFXGLTargetDesc* color = mTarget->getTargetDesc(static_cast(GFXTextureTarget::Color0 + i)); + if (!color || !(color->hasMips())) continue; - + // Generate mips if necessary // Assumes a 2D texture. GLenum binding = color->getBinding(); binding = (binding >= GL_TEXTURE_CUBE_MAP_POSITIVE_X && binding <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z) ? 
GL_TEXTURE_CUBE_MAP : binding; - PRESERVE_TEXTURE( binding ); - glBindTexture( binding, color->getHandle() ); - glGenerateMipmap( binding ); + PRESERVE_TEXTURE(binding); + glBindTexture(binding, color->getHandle()); + glGenerateMipmap(binding); } } @@ -271,13 +245,13 @@ void _GFXGLTextureTargetFBOImpl::finish() GFXGLTextureTarget::GFXGLTextureTarget(bool genMips) : mCopyFboSrc(0), mCopyFboDst(0) { mGenMips = genMips; - for(U32 i=0; igetWidth(), mTargets[Color0]->getHeight()); return Point2I(0, 0); @@ -300,61 +274,50 @@ const Point2I GFXGLTextureTarget::getSize() GFXFormat GFXGLTextureTarget::getFormat() { - if(mTargets[Color0].isValid()) + if (mTargets[Color0].isValid()) return mTargets[Color0]->getFormat(); return GFXFormatR8G8B8A8; } -void GFXGLTextureTarget::attachTexture( RenderSlot slot, GFXTextureObject *tex, U32 mipLevel/*=0*/, U32 zOffset /*= 0*/ ) +void GFXGLTextureTarget::attachTexture(RenderSlot slot, GFXTextureObject* tex, U32 mipLevel/*=0*/, U32 zOffset /*= 0*/, U32 face /*= 0*/) { - if( tex == GFXTextureTarget::sDefaultDepthStencil ) + if (tex == GFXTextureTarget::sDefaultDepthStencil) tex = GFXGL->getDefaultDepthTex(); + // are we readding the same thing, face and all? _GFXGLTextureTargetDesc* mTex = static_cast<_GFXGLTextureTargetDesc*>(mTargets[slot].ptr()); - if( (!tex && !mTex) || (mTex && mTex->getTextureObject() == tex) ) + if ((!tex && !mTex) || (mTex && mTex->getTextureObject() == tex && mTex->getFace() == face)) return; - + // Triggers an update when we next render invalidateState(); // We stash the texture and info into an internal struct. GFXGLTextureObject* glTexture = static_cast(tex); - if(tex && tex != GFXTextureTarget::sDefaultDepthStencil) - mTargets[slot] = new _GFXGLTextureTargetDesc(glTexture, mipLevel, zOffset); + if (tex && tex != GFXTextureTarget::sDefaultDepthStencil) + { + mTargets[slot] = new _GFXGLTextureTargetDesc(glTexture, mipLevel, zOffset, face, glTexture->isCubeMap()); + } else mTargets[slot] = NULL; } -void GFXGLTextureTarget::attachTexture( RenderSlot slot, GFXCubemap *tex, U32 face, U32 mipLevel/*=0*/ ) +void GFXGLTextureTarget::attachTexture(RenderSlot slot, GFXCubemap* tex, U32 face, U32 mipLevel/*=0*/) { - // No depth cubemaps, sorry - AssertFatal(slot != DepthStencil, "GFXGLTextureTarget::attachTexture (cube) - Cube depth textures not supported!"); - if(slot == DepthStencil) - return; - - // Triggers an update when we next render - invalidateState(); - - // We stash the texture and info into an internal struct. - GFXGLCubemap* glTexture = static_cast(tex); - if(tex) - mTargets[slot] = new _GFXGLCubemapTargetDesc(glTexture, face, mipLevel, 0); - else - mTargets[slot] = NULL; } void GFXGLTextureTarget::clearAttachments() { deactivate(); - for(S32 i=1; iapplyState(); } @@ -394,7 +357,7 @@ _GFXGLTargetDesc* GFXGLTextureTarget::getTargetDesc(RenderSlot slot) const return mTargets[slot].ptr(); } -void GFXGLTextureTarget::_onTextureEvent( GFXTexCallbackCode code ) +void GFXGLTextureTarget::_onTextureEvent(GFXTexCallbackCode code) { invalidateState(); } @@ -403,7 +366,7 @@ const String GFXGLTextureTarget::describeSelf() const { String ret = String::ToString(" Color0 Attachment: %i", mTargets[Color0].isValid() ? mTargets[Color0]->getHandle() : 0); ret += String::ToString(" Depth Attachment: %i", mTargets[DepthStencil].isValid() ? 
mTargets[DepthStencil]->getHandle() : 0); - + return ret; } @@ -416,27 +379,27 @@ void GFXGLTextureTarget::resolveTo(GFXTextureObject* obj) AssertFatal(dynamic_cast(obj), "GFXGLTextureTarget::resolveTo - Incorrect type of texture, expected a GFXGLTextureObject"); GFXGLTextureObject* glTexture = static_cast(obj); - if( GFXGL->mCapabilities.copyImage && mTargets[Color0]->isCompatible(glTexture) ) + if (GFXGL->mCapabilities.copyImage && mTargets[Color0]->isCompatible(glTexture)) { - GLenum binding = mTargets[Color0]->getBinding(); + GLenum binding = mTargets[Color0]->getBinding(); binding = (binding >= GL_TEXTURE_CUBE_MAP_POSITIVE_X && binding <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z) ? GL_TEXTURE_CUBE_MAP : binding; U32 srcStartDepth = binding == GL_TEXTURE_CUBE_MAP ? mTargets[Color0]->getBinding() - GL_TEXTURE_CUBE_MAP_POSITIVE_X : 0; glCopyImageSubData( - mTargets[Color0]->getHandle(), binding, 0, 0, 0, srcStartDepth, - glTexture->getHandle(), glTexture->getBinding(), 0, 0, 0, 0, - mTargets[Color0]->getWidth(), mTargets[Color0]->getHeight(), 1); + mTargets[Color0]->getHandle(), binding, 0, 0, 0, srcStartDepth, + glTexture->getHandle(), glTexture->getBinding(), 0, 0, 0, 0, + mTargets[Color0]->getWidth(), mTargets[Color0]->getHeight(), 1); return; } PRESERVE_FRAMEBUFFER(); - + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, mCopyFboDst); glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, glTexture->getBinding(), glTexture->getHandle(), 0); - + glBindFramebuffer(GL_READ_FRAMEBUFFER, mCopyFboSrc); glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, mTargets[Color0]->getBinding(), mTargets[Color0]->getHandle(), 0); - + glBlitFramebuffer(0, 0, mTargets[Color0]->getWidth(), mTargets[Color0]->getHeight(), 0, 0, glTexture->getWidth(), glTexture->getHeight(), GL_COLOR_BUFFER_BIT, GL_NEAREST); } diff --git a/Engine/source/gfx/gl/gfxGLTextureTarget.h b/Engine/source/gfx/gl/gfxGLTextureTarget.h index 605d7b29e..7a327d900 100644 --- a/Engine/source/gfx/gl/gfxGLTextureTarget.h +++ b/Engine/source/gfx/gl/gfxGLTextureTarget.h @@ -50,35 +50,35 @@ public: const Point2I getSize() override; GFXFormat getFormat() override; - void attachTexture(RenderSlot slot, GFXTextureObject *tex, U32 mipLevel=0, U32 zOffset = 0) override; - void attachTexture(RenderSlot slot, GFXCubemap *tex, U32 face, U32 mipLevel=0) override; + void attachTexture(RenderSlot slot, GFXTextureObject* tex, U32 mipLevel = 0, U32 zOffset = 0, U32 face = 0) override; + void attachTexture(RenderSlot slot, GFXCubemap* tex, U32 face, U32 mipLevel = 0) override; virtual void clearAttachments(); /// Functions to query internal state /// @{ - + /// Returns the internal structure for the given slot. This should only be called by our internal implementations. _GFXGLTargetDesc* getTargetDesc(RenderSlot slot) const; /// @} - + void deactivate() override; void zombify() override; void resurrect() override; const String describeSelf() const override; - + void resolve() override; - + void resolveTo(GFXTextureObject* obj) override; - + protected: friend class GFXGLDevice; /// The callback used to get texture events. 
/// @see GFXTextureManager::addEventDelegate - void _onTextureEvent( GFXTexCallbackCode code ); - + void _onTextureEvent(GFXTexCallbackCode code); + /// Pointer to our internal implementation AutoPtr<_GFXGLTextureTargetImpl> _impl; @@ -87,10 +87,10 @@ protected: /// These redirect to our internal implementation /// @{ - + void applyState(); void makeActive(); - + /// @} //copy FBO diff --git a/Engine/source/gfx/gl/sdl/gfxGLDevice.sdl.cpp b/Engine/source/gfx/gl/sdl/gfxGLDevice.sdl.cpp index b886094c7..1bb34d4f2 100644 --- a/Engine/source/gfx/gl/sdl/gfxGLDevice.sdl.cpp +++ b/Engine/source/gfx/gl/sdl/gfxGLDevice.sdl.cpp @@ -103,8 +103,6 @@ void GFXGLDevice::enumerateAdapters( Vector &adapterList ) } SDL_ClearError(); - SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3); - SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 3); SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); SDL_GL_SetAttribute(SDL_GL_FRAMEBUFFER_SRGB_CAPABLE, 1); @@ -140,6 +138,10 @@ void GFXGLDevice::enumerateAdapters( Vector &adapterList ) return; } + // Set our sdl attribute to use this version. + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, major); + SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, minor); + //check for required extensions if (!gglHasExtension(ARB_texture_cube_map_array)) { @@ -168,7 +170,24 @@ void GFXGLDevice::enumerateAdapters( Vector &adapterList ) dStrcpy(toAdd->mName, "OpenGL", GFXAdapter::MaxAdapterNameLen); toAdd->mType = OpenGL; - toAdd->mShaderModel = 0.f; + F32 shaderModel = 3.3f; + if (major == 4) + { + if (minor == 0) + shaderModel = 4.00f; // GLSL 4.00 + else if (minor == 1) + shaderModel = 4.10f; // GLSL 4.10 + else if (minor == 2) + shaderModel = 4.20f; // GLSL 4.20 + else if (minor == 3) + shaderModel = 4.30f; // GLSL 4.30 + else if (minor == 4) + shaderModel = 4.40f; // GLSL 4.40 + else if (minor == 5) + shaderModel = 4.50f; // GLSL 4.50 + else if (minor == 6) + shaderModel = 4.60f; // GLSL 4.60 + } toAdd->mCreateDeviceInstanceDelegate = mCreateDeviceInstance; // Enumerate all available resolutions: diff --git a/Engine/source/gfx/sim/cubemapData.cpp b/Engine/source/gfx/sim/cubemapData.cpp index d257027b2..d1dae8e16 100644 --- a/Engine/source/gfx/sim/cubemapData.cpp +++ b/Engine/source/gfx/sim/cubemapData.cpp @@ -45,7 +45,15 @@ CubemapData::CubemapData() CubemapData::~CubemapData() { - mCubemap = NULL; + if (mCubeMapAsset.notNull()) + { + mCubeMapAsset.clear(); + } + + if (mCubemap) + { + mCubemap.free(); + } } ConsoleDocClass( CubemapData, @@ -101,7 +109,7 @@ void CubemapData::createMap() //check mCubeMapFile first if (mCubeMapAsset.notNull()) { - mCubemap = TEXMGR->createCubemap(mCubeMapAsset->getImageFile()); + mCubemap = mCubeMapAsset->getTexture(&GFXCubemapStaticTextureProfile); return; } else @@ -125,11 +133,14 @@ void CubemapData::createMap() if( initSuccess ) { - mCubemap = GFX->createCubemap(); - if (mCubeMapFaceAsset->isNull()) - return; + if (mCubeMapFaceAsset->isNull()) + return; - mCubemap->initStatic(mCubeMapFaceTex); + mCubemap.set(mCubeMapFaceTex->getWidth(), mCubeMapFaceTex->getHeight(), mCubeMapFaceTex->getFormat(), &GFXCubemapStaticTextureProfile, "CubemapData-InitTexture", mCubeMapFaceTex->getPointer()->getMipLevels()); + for (U32 i = 0; i < 6; i++) + { + mCubemap->updateTextureSlot(mCubeMapFaceTex[i],0, i); + } } } } @@ -141,7 +152,7 @@ void CubemapData::updateFaces() //check mCubeMapFile first if (mCubeMapAsset.notNull()) { - mCubemap = TEXMGR->createCubemap(mCubeMapAsset->getImageFile()); + mCubemap = 
mCubeMapAsset->getTexture(&GFXCubemapStaticTextureProfile); return; } else @@ -166,11 +177,14 @@ void CubemapData::updateFaces() if( initSuccess ) { mCubemap = NULL; - mCubemap = GFX->createCubemap(); if (mCubeMapFaceAsset->isNull()) return; - mCubemap->initStatic(mCubeMapFaceTex); + mCubemap.set(mCubeMapFaceTex->getWidth(), mCubeMapFaceTex->getHeight(), GFXFormatR16G16B16A16F, &GFXCubemapStaticTextureProfile, "CubemapData-InitTexture", mCubeMapFaceTex->getPointer()->getMipLevels()); + for (U32 i = 0; i < 6; i++) + { + mCubemap->updateTextureSlot(mCubeMapFaceTex[i], 0, i); + } } } diff --git a/Engine/source/gfx/sim/cubemapData.h b/Engine/source/gfx/sim/cubemapData.h index 2d2385deb..54facf7a4 100644 --- a/Engine/source/gfx/sim/cubemapData.h +++ b/Engine/source/gfx/sim/cubemapData.h @@ -48,7 +48,7 @@ class CubemapData : public SimObject public: - GFXCubemapHandle mCubemap; + GFXTexHandle mCubemap; CubemapData(); ~CubemapData(); diff --git a/Engine/source/lighting/shadowMap/cubeLightShadowMap.cpp b/Engine/source/lighting/shadowMap/cubeLightShadowMap.cpp index df8466bd3..468e61e95 100644 --- a/Engine/source/lighting/shadowMap/cubeLightShadowMap.cpp +++ b/Engine/source/lighting/shadowMap/cubeLightShadowMap.cpp @@ -35,26 +35,11 @@ #include "gfx/util/gfxFrustumSaver.h" #include "math/mathUtils.h" - CubeLightShadowMap::CubeLightShadowMap( LightInfo *light ) : Parent( light ) { } -bool CubeLightShadowMap::setTextureStage( U32 currTexFlag, LightingShaderConstants* lsc ) -{ - if ( currTexFlag == Material::DynamicLight ) - { - S32 reg = lsc->mShadowMapSC->getSamplerRegister(); - if ( reg != -1 ) - GFX->setCubeTexture( reg, mCubemap ); - - return true; - } - - return false; -} - void CubeLightShadowMap::setShaderParameters( GFXShaderConstBuffer *params, LightingShaderConstants *lsc ) { @@ -77,12 +62,6 @@ void CubeLightShadowMap::setShaderParameters( GFXShaderConstBuffer *params, params->setSafe( lsc->mShadowSoftnessConst, p->shadowSoftness * ( 1.0f / mTexSize ) ); } -void CubeLightShadowMap::releaseTextures() -{ - Parent::releaseTextures(); - mCubemap = NULL; -} - void CubeLightShadowMap::_render( RenderPassManager* renderPass, const SceneRenderState *diffuseState ) { @@ -92,15 +71,16 @@ void CubeLightShadowMap::_render( RenderPassManager* renderPass, const bool bUseLightmappedGeometry = lmParams ? !lmParams->representedInLightmap || lmParams->includeLightmappedGeometryInShadow : true; const U32 texSize = getBestTexSize(); - - if ( mCubemap.isNull() || - mTexSize != texSize ) + if (mShadowMapTex.isNull() || + mTexSize != texSize) { mTexSize = texSize; - mCubemap = GFX->createCubemap(); - mCubemap->initDynamic( mTexSize, LightShadowMap::ShadowMapFormat ); - } + mShadowMapTex.set(mTexSize, mTexSize, + ShadowMapFormat, &CubeShadowMapProfile, + "CubeLightShadowMap"); + mShadowMapDepth = _getDepthTarget(mShadowMapTex->getWidth(), mShadowMapTex->getHeight()); + } // Setup the world to light projection which is used // in the shader to transform the light vector for the // shadow lookup.
@@ -155,20 +135,14 @@ void CubeLightShadowMap::_render( RenderPassManager* renderPass, GFXDEBUGEVENT_START( CubeLightShadowMap_Render_Face, ColorI::RED ); // create camera matrix - VectorF cross = mCross(vUpVec, vLookatPt); - cross.normalizeSafe(); - MatrixF lightMatrix(true); - lightMatrix.setColumn(0, cross); - lightMatrix.setColumn(1, vLookatPt); - lightMatrix.setColumn(2, vUpVec); - lightMatrix.setPosition( mLight->getPosition() ); + lightMatrix.LookAt(mLight->getPosition(), vLookatPt, vUpVec); lightMatrix.inverse(); GFX->setWorldMatrix( lightMatrix ); - mTarget->attachTexture(GFXTextureTarget::Color0, mCubemap, i); - mTarget->attachTexture(GFXTextureTarget::DepthStencil, _getDepthTarget( mTexSize, mTexSize )); + mTarget->attachTexture(GFXTextureTarget::Color0, mShadowMapTex,0,0, i); + mTarget->attachTexture(GFXTextureTarget::DepthStencil, _getDepthTarget(mShadowMapTex->getWidth(), mShadowMapTex->getHeight())); GFX->setActiveRenderTarget(mTarget); GFX->clear( GFXClearTarget | GFXClearStencil | GFXClearZBuffer, ColorI(255,255,255,255), 0.0f, 0 ); diff --git a/Engine/source/lighting/shadowMap/cubeLightShadowMap.h b/Engine/source/lighting/shadowMap/cubeLightShadowMap.h index 083182f33..462ec28e4 100644 --- a/Engine/source/lighting/shadowMap/cubeLightShadowMap.h +++ b/Engine/source/lighting/shadowMap/cubeLightShadowMap.h @@ -39,18 +39,9 @@ public: CubeLightShadowMap( LightInfo *light ); - // LightShadowMap - bool hasShadowTex() const override { return mCubemap.isValid(); } ShadowType getShadowType() const override { return ShadowType_CubeMap; } void _render( RenderPassManager* renderPass, const SceneRenderState *diffuseState ) override; void setShaderParameters( GFXShaderConstBuffer* params, LightingShaderConstants* lsc ) override; - void releaseTextures() override; - bool setTextureStage( U32 currTexFlag, LightingShaderConstants* lsc ) override; - -protected: - - /// The shadow cubemap. - GFXCubemapHandle mCubemap; }; diff --git a/Engine/source/lighting/shadowMap/lightShadowMap.cpp b/Engine/source/lighting/shadowMap/lightShadowMap.cpp index 6f10bf6a1..d3ecbdd08 100644 --- a/Engine/source/lighting/shadowMap/lightShadowMap.cpp +++ b/Engine/source/lighting/shadowMap/lightShadowMap.cpp @@ -72,6 +72,13 @@ GFX_ImplementTextureProfile( ShadowMapProfile, GFXTextureProfile::Pooled, GFXTextureProfile::NONE ); +GFX_ImplementTextureProfile(CubeShadowMapProfile, + GFXTextureProfile::DiffuseMap, + GFXTextureProfile::PreserveSize | + GFXTextureProfile::RenderTarget | + GFXTextureProfile::Pooled | GFXTextureProfile::CubeMap, + GFXTextureProfile::NONE); + GFX_ImplementTextureProfile( ShadowMapZProfile, GFXTextureProfile::DiffuseMap, GFXTextureProfile::PreserveSize | diff --git a/Engine/source/lighting/shadowMap/lightShadowMap.h b/Engine/source/lighting/shadowMap/lightShadowMap.h index c8f566d50..35d2f5821 100644 --- a/Engine/source/lighting/shadowMap/lightShadowMap.h +++ b/Engine/source/lighting/shadowMap/lightShadowMap.h @@ -266,6 +266,7 @@ protected: }; GFX_DeclareTextureProfile( ShadowMapProfile ); +GFX_DeclareTextureProfile( CubeShadowMapProfile ); GFX_DeclareTextureProfile( ShadowMapZProfile ); diff --git a/Engine/source/materials/materialDefinition.h b/Engine/source/materials/materialDefinition.h index cecbdc7b3..832898949 100644 --- a/Engine/source/materials/materialDefinition.h +++ b/Engine/source/materials/materialDefinition.h @@ -150,7 +150,7 @@ public: TextureTable mTextures; /// The cubemap for this stage. 
- GFXCubemap* mCubemap; + GFXTexHandle mCubemap; public: @@ -204,10 +204,10 @@ public: void getFeatureSet(FeatureSet* outFeatures) const; /// Returns the stage cubemap. - GFXCubemap* getCubemap() const { return mCubemap; } + GFXTexHandle getCubemap() const { return mCubemap; } /// Set the stage cubemap. - void setCubemap(GFXCubemap* cubemap) { mCubemap = cubemap; } + void setCubemap(GFXTexHandle cubemap) { mCubemap = cubemap; } }; diff --git a/Engine/source/materials/processedCustomMaterial.cpp b/Engine/source/materials/processedCustomMaterial.cpp index 43d21ec8f..8450022b2 100644 --- a/Engine/source/materials/processedCustomMaterial.cpp +++ b/Engine/source/materials/processedCustomMaterial.cpp @@ -377,12 +377,12 @@ void ProcessedCustomMaterial::setTextureStages( SceneRenderState *state, const S } case Material::Cube: { - GFX->setCubeTexture( samplerRegister, rpd->mCubeMap ); + GFX->setTexture( samplerRegister, rpd->mCubeMap ); break; } case Material::SGCube: { - GFX->setCubeTexture( samplerRegister, sgData.cubemap ); + GFX->setTexture( samplerRegister, sgData.cubemap ); break; } case Material::BackBuff: diff --git a/Engine/source/materials/processedMaterial.h b/Engine/source/materials/processedMaterial.h index 4efc338bc..e93c3558d 100644 --- a/Engine/source/materials/processedMaterial.h +++ b/Engine/source/materials/processedMaterial.h @@ -77,7 +77,7 @@ public: /// The cubemap to use when the texture type is /// set to Material::Cube. /// @see mTexType - GFXCubemapHandle mCubeMap; + GFXTexHandle mCubeMap; U32 mNumTex; diff --git a/Engine/source/materials/processedShaderMaterial.cpp b/Engine/source/materials/processedShaderMaterial.cpp index 5c543ee5f..2494fc759 100644 --- a/Engine/source/materials/processedShaderMaterial.cpp +++ b/Engine/source/materials/processedShaderMaterial.cpp @@ -854,11 +854,11 @@ void ProcessedShaderMaterial::setTextureStages( SceneRenderState *state, const S break; case Material::Cube: - GFX->setCubeTexture( i, rpd->mCubeMap ); + GFX->setTexture( i, rpd->mCubeMap ); break; case Material::SGCube: - GFX->setCubeTexture( i, sgData.cubemap ); + GFX->setTexture( i, sgData.cubemap ); break; case Material::BackBuff: @@ -1333,7 +1333,7 @@ void ProcessedShaderMaterial::setSceneInfo(SceneRenderState * state, const Scene } } if (sgData.cubemap) - shaderConsts->setSafe(handles->mCubeMipsSC, (F32)sgData.cubemap->getMipMapLevels()); + shaderConsts->setSafe(handles->mCubeMipsSC, (F32)sgData.cubemap->getMipLevels()); else shaderConsts->setSafe(handles->mCubeMipsSC, (F32)getBinLog2(PROBEMGR->getProbeTexSize())); diff --git a/Engine/source/materials/sceneData.h b/Engine/source/materials/sceneData.h index bd99e3e00..cdaaf97e5 100644 --- a/Engine/source/materials/sceneData.h +++ b/Engine/source/materials/sceneData.h @@ -83,7 +83,7 @@ struct SceneData // misc const MatrixF *objTrans; - GFXCubemap *cubemap; + GFXTexHandle cubemap; F32 visibility; /// Enables wireframe rendering for the object. 
diff --git a/Engine/source/platformWin32/winAsmBlit.cpp b/Engine/source/platformWin32/winAsmBlit.cpp index 8dd6dc413..c6b96488b 100644 --- a/Engine/source/platformWin32/winAsmBlit.cpp +++ b/Engine/source/platformWin32/winAsmBlit.cpp @@ -195,7 +195,7 @@ void bitmapConvertRGB_to_5551_mmx(U8 *src, U32 pixels) void PlatformBlitInit() { bitmapExtrude5551 = bitmapExtrude5551_asm; - bitmapExtrudeRGB = bitmapExtrudeRGB_c; + //bitmapExtrudeRGB = bitmapExtrudeRGB_c; if (Platform::SystemInfo.processor.properties & CPU_PROP_MMX) { diff --git a/Engine/source/renderInstance/renderDeferredMgr.cpp b/Engine/source/renderInstance/renderDeferredMgr.cpp index e7fd4f907..b394f6b31 100644 --- a/Engine/source/renderInstance/renderDeferredMgr.cpp +++ b/Engine/source/renderInstance/renderDeferredMgr.cpp @@ -374,7 +374,7 @@ void RenderDeferredMgr::render( SceneRenderState *state ) // init loop data GFXTextureObject *lastLM = NULL; - GFXCubemap *lastCubemap = NULL; + GFXTexHandle lastCubemap = NULL; GFXTextureObject *lastReflectTex = NULL; GFXTextureObject *lastAccuTex = NULL; diff --git a/Engine/source/renderInstance/renderMeshMgr.cpp b/Engine/source/renderInstance/renderMeshMgr.cpp index 14de5b838..dc5be1934 100644 --- a/Engine/source/renderInstance/renderMeshMgr.cpp +++ b/Engine/source/renderInstance/renderMeshMgr.cpp @@ -113,7 +113,7 @@ void RenderMeshMgr::render(SceneRenderState * state) // init loop data GFXTextureObject *lastLM = NULL; - GFXCubemap *lastCubemap = NULL; + GFXTexHandle lastCubemap = NULL; GFXTextureObject *lastReflectTex = NULL; GFXTextureObject *lastMiscTex = NULL; GFXTextureObject *lastAccuTex = NULL; diff --git a/Engine/source/renderInstance/renderPassManager.h b/Engine/source/renderInstance/renderPassManager.h index 2d4810ece..af30ea74e 100644 --- a/Engine/source/renderInstance/renderPassManager.h +++ b/Engine/source/renderInstance/renderPassManager.h @@ -382,7 +382,7 @@ struct MeshRenderInst : public RenderInst GFXTextureObject *reflectTex; GFXTextureObject *miscTex; GFXTextureObject *accuTex; - GFXCubemap *cubemap; + GFXTexHandle cubemap; /// @name Hardware Skinning /// { diff --git a/Engine/source/renderInstance/renderProbeMgr.cpp b/Engine/source/renderInstance/renderProbeMgr.cpp index 3eda07362..99c4e979e 100644 --- a/Engine/source/renderInstance/renderProbeMgr.cpp +++ b/Engine/source/renderInstance/renderProbeMgr.cpp @@ -221,6 +221,9 @@ RenderProbeMgr::~RenderProbeMgr() SAFE_DELETE(i->value); } mConstantLookup.clear(); + + mIrradianceArray.free(); + mPrefilterArray.free(); } bool RenderProbeMgr::onAdd() @@ -228,13 +231,11 @@ bool RenderProbeMgr::onAdd() if (!Parent::onAdd()) return false; - mIrradianceArray = GFXCubemapArrayHandle(GFX->createCubemapArray()); - mPrefilterArray = GFXCubemapArrayHandle(GFX->createCubemapArray()); - U32 scaledSize = getProbeTexSize(); //pre-allocate a few slots - mIrradianceArray->init(PROBE_ARRAY_SLOT_BUFFER_SIZE, scaledSize, PROBE_FORMAT); - mPrefilterArray->init(PROBE_ARRAY_SLOT_BUFFER_SIZE, scaledSize, PROBE_FORMAT); + mIrradianceArray.set(scaledSize, scaledSize, PROBE_FORMAT, &GFXCubemapStaticTextureProfile, "RenderProbeMgr::mIrradianceArray", 0,0, PROBE_ARRAY_SLOT_BUFFER_SIZE); + mPrefilterArray.set(scaledSize, scaledSize, PROBE_FORMAT, &GFXCubemapStaticTextureProfile, "RenderProbeMgr::mPrefilterArray", 0,0, PROBE_ARRAY_SLOT_BUFFER_SIZE); + mCubeSlotCount = PROBE_ARRAY_SLOT_BUFFER_SIZE; String brdfTexturePath = GFXTextureManager::getBRDFTexturePath(); @@ -376,12 +377,12 @@ void RenderProbeMgr::registerProbe(ReflectionProbe::ProbeInfo* newProbe) if 
(cubeIndex >= mCubeSlotCount) { //alloc temp array handles - GFXCubemapArrayHandle irr = GFXCubemapArrayHandle(GFX->createCubemapArray()); - GFXCubemapArrayHandle prefilter = GFXCubemapArrayHandle(GFX->createCubemapArray()); + GFXTexHandle irr; + GFXTexHandle prefilter; U32 scaledSize = getProbeTexSize(); - irr->init(mCubeSlotCount + PROBE_ARRAY_SLOT_BUFFER_SIZE, scaledSize, PROBE_FORMAT); - prefilter->init(mCubeSlotCount + PROBE_ARRAY_SLOT_BUFFER_SIZE, scaledSize, PROBE_FORMAT); + irr.set(scaledSize, scaledSize, PROBE_FORMAT, &GFXCubemapStaticTextureProfile, "RenderProbeMgr::mIrradianceArray_temp_expansion", 0, 0, mCubeSlotCount + PROBE_ARRAY_SLOT_BUFFER_SIZE); + prefilter.set(scaledSize, scaledSize, PROBE_FORMAT, &GFXCubemapStaticTextureProfile, "RenderProbeMgr::mPrefilterArray_temp_expansion", 0, 0, mCubeSlotCount + PROBE_ARRAY_SLOT_BUFFER_SIZE); mIrradianceArray->copyTo(irr); mPrefilterArray->copyTo(prefilter); @@ -390,6 +391,9 @@ void RenderProbeMgr::registerProbe(ReflectionProbe::ProbeInfo* newProbe) mIrradianceArray = irr; mPrefilterArray = prefilter; + irr.free(); + prefilter.free(); + mCubeSlotCount += PROBE_ARRAY_SLOT_BUFFER_SIZE; } @@ -466,15 +470,13 @@ void RenderProbeMgr::updateProbeTexture(ReflectionProbe::ProbeInfo* probeInfo) return; U32 scaledSize = getProbeTexSize(); //Some basic sanity checking that we have valid cubemaps to work with - if (probeInfo->mIrradianceCubemap.isNull() || !probeInfo->mIrradianceCubemap->isInitialized() || - probeInfo->mIrradianceCubemap->getSize() != scaledSize) + if (probeInfo->mIrradianceCubemap.isNull() || probeInfo->mIrradianceCubemap->getWidth() != scaledSize) { Con::errorf("RenderProbeMgr::updateProbeTexture() - tried to update a probe's texture with an invalid or uninitialized irradiance map!"); return; } - if (probeInfo->mPrefilterCubemap.isNull() || !probeInfo->mPrefilterCubemap->isInitialized() || - probeInfo->mPrefilterCubemap->getSize() != scaledSize) + if (probeInfo->mPrefilterCubemap.isNull() || probeInfo->mPrefilterCubemap->getWidth() != scaledSize) { Con::errorf("RenderProbeMgr::updateProbeTexture() - tried to update a probe's texture with an invalid or uninitialized specular map!"); return; @@ -482,12 +484,13 @@ void RenderProbeMgr::updateProbeTexture(ReflectionProbe::ProbeInfo* probeInfo) //Run the update on the array pair with the probe's cubemaps and index const U32 cubeIndex = probe->mCubemapIndex; - mIrradianceArray->updateTexture(probeInfo->mIrradianceCubemap, cubeIndex); - mPrefilterArray->updateTexture(probeInfo->mPrefilterCubemap, cubeIndex); + mIrradianceArray->updateTextureSlot(probeInfo->mIrradianceCubemap, cubeIndex); + + mPrefilterArray->updateTextureSlot(probeInfo->mPrefilterCubemap, cubeIndex); #ifdef TORQUE_DEBUG Con::warnf("UpdatedProbeTexture - probe id: %u on cubeIndex %u, Irrad validity: %d, Prefilter validity: %d", probeInfo->mObject->getId(), cubeIndex, - probeInfo->mIrradianceCubemap->isInitialized(), probeInfo->mPrefilterCubemap->isInitialized()); + probeInfo->mIrradianceCubemap.isValid(), probeInfo->mPrefilterCubemap.isValid()); #endif } @@ -616,8 +619,8 @@ void RenderProbeMgr::bakeProbe(ReflectionProbe* probe) clientProbe->createClientResources(); //Prep it with whatever resolution we've dictated for our bake - clientProbe->mIrridianceMap->mCubemap->initDynamic(resolution, reflectFormat); - clientProbe->mPrefilterMap->mCubemap->initDynamic(resolution, reflectFormat); + clientProbe->mIrridianceMap->mCubemap.set(resolution, resolution, reflectFormat, &GFXCubemapRenderTargetProfile, 
"ReflectionProbe::mIrridianceMap", 0); + clientProbe->mPrefilterMap->mCubemap.set(resolution, resolution, reflectFormat, &GFXCubemapRenderTargetProfile, "ReflectionProbe::mPrefilterMap", 0); GFXTextureTargetRef renderTarget = GFX->allocRenderToTextureTarget(false); clientProbe->mPrefilterMap->mCubemap = cubeRefl.getCubemap(); @@ -807,9 +810,9 @@ void RenderProbeMgr::_update4ProbeConsts(const SceneData& sgData, shaderConsts->setSafe(probeShaderConsts->mSkylightDampSC, (int)probeSet.skyLightDamp); if (probeShaderConsts->mProbeSpecularCubemapArraySC->getSamplerRegister() != -1) - GFX->setCubeArrayTexture(probeShaderConsts->mProbeSpecularCubemapArraySC->getSamplerRegister(), mPrefilterArray); + GFX->setTexture(probeShaderConsts->mProbeSpecularCubemapArraySC->getSamplerRegister(), mPrefilterArray); if (probeShaderConsts->mProbeIrradianceCubemapArraySC->getSamplerRegister() != -1) - GFX->setCubeArrayTexture(probeShaderConsts->mProbeIrradianceCubemapArraySC->getSamplerRegister(), mIrradianceArray); + GFX->setTexture(probeShaderConsts->mProbeIrradianceCubemapArraySC->getSamplerRegister(), mIrradianceArray); shaderConsts->setSafe(probeShaderConsts->mMaxProbeDrawDistanceSC, smMaxProbeDrawDistance); } @@ -875,8 +878,8 @@ void RenderProbeMgr::render( SceneRenderState *state ) mProbeArrayEffect->setShaderMacro("MAX_PROBES", probePerFrame); mProbeArrayEffect->setTexture(3, mBRDFTexture); - mProbeArrayEffect->setCubemapArrayTexture(4, mPrefilterArray); - mProbeArrayEffect->setCubemapArrayTexture(5, mIrradianceArray); + mProbeArrayEffect->setTexture(4, mPrefilterArray); + mProbeArrayEffect->setTexture(5, mIrradianceArray); mProbeArrayEffect->setTexture(6, mWetnessTexture); //ssao mask if (AdvancedLightBinManager::smUseSSAOMask) @@ -898,7 +901,7 @@ void RenderProbeMgr::render( SceneRenderState *state ) mProbeArrayEffect->setShaderConst("$skylightCubemapIdx", (S32)mProbeData.skyLightIdx); mProbeArrayEffect->setShaderConst(ShaderGenVars::skylightDamp, mProbeData.skyLightDamp); - mProbeArrayEffect->setShaderConst("$cubeMips", (float)mPrefilterArray->getMipMapLevels()); + mProbeArrayEffect->setShaderConst("$cubeMips", (float)mPrefilterArray->getMipLevels()); //also set up some colors Vector contribColors; diff --git a/Engine/source/renderInstance/renderProbeMgr.h b/Engine/source/renderInstance/renderProbeMgr.h index 55dbd4315..3f953d295 100644 --- a/Engine/source/renderInstance/renderProbeMgr.h +++ b/Engine/source/renderInstance/renderProbeMgr.h @@ -184,7 +184,6 @@ public: static F32 smMaxProbeDrawDistance; static S32 smMaxProbesPerFrame; static S32 smProbeBakeResolution; - SceneRenderState *mState; private: /// /// List of registered probes. These are not necessarily rendered in a given frame @@ -246,12 +245,12 @@ private: /// /// The prefilter cubemap array /// - GFXCubemapArrayHandle mPrefilterArray; + GFXTexHandle mPrefilterArray; /// /// The irradiance cubemap array /// - GFXCubemapArrayHandle mIrradianceArray; + GFXTexHandle mIrradianceArray; //Utilized in forward rendering @@ -291,11 +290,6 @@ private: /// bool mUseHDRCaptures; - /// - /// holds the normal render state for light fade so we can capture them before and restore them after baking - /// - S32 mRenderMaximumNumOfLights; - bool mRenderUseLightFade; protected: /// The current active light manager. 
static RenderProbeMgr* smProbeManager; diff --git a/Engine/source/renderInstance/renderTranslucentMgr.cpp b/Engine/source/renderInstance/renderTranslucentMgr.cpp index 8c4b0cc8c..31816ac5f 100644 --- a/Engine/source/renderInstance/renderTranslucentMgr.cpp +++ b/Engine/source/renderInstance/renderTranslucentMgr.cpp @@ -142,7 +142,7 @@ void RenderTranslucentMgr::render( SceneRenderState *state ) // init loop data GFXTextureObject *lastLM = NULL; - GFXCubemap *lastCubemap = NULL; + GFXTexHandle lastCubemap = NULL; GFXTextureObject *lastReflectTex = NULL; GFXTextureObject *lastAccuTex = NULL; diff --git a/Engine/source/scene/reflector.cpp b/Engine/source/scene/reflector.cpp index ba05abf3c..76a42d0e3 100644 --- a/Engine/source/scene/reflector.cpp +++ b/Engine/source/scene/reflector.cpp @@ -27,6 +27,7 @@ #include "gfx/gfxCubemap.h" #include "gfx/gfxDebugEvent.h" #include "gfx/gfxTransformSaver.h" +#include "gfx/util/gfxFrustumSaver.h" #include "scene/sceneManager.h" #include "scene/sceneRenderState.h" #include "core/stream/bitStream.h" @@ -319,15 +320,12 @@ void CubeReflector::updateReflection( const ReflectParams &params, Point3F expli mCubemap.isNull() || mCubemap->getFormat() != reflectFormat ) { - mCubemap = GFX->createCubemap(); - mCubemap->initDynamic( texDim, reflectFormat); + mCubemap.set(texDim, texDim, reflectFormat, &GFXCubemapRenderTargetProfile, "CubeReflector::updateReflection", 0); } if ( mRenderTarget.isNull() ) mRenderTarget = GFX->allocRenderToTextureTarget(); - mDepthBuff = LightShadowMap::_getDepthTarget(texDim, texDim); - mRenderTarget->attachTexture(GFXTextureTarget::DepthStencil, mDepthBuff); F32 oldVisibleDist = gClientSceneGraph->getVisibleDistance(); gClientSceneGraph->setVisibleDistance( mDesc->farDist ); @@ -335,15 +333,17 @@ void CubeReflector::updateReflection( const ReflectParams &params, Point3F expli TSShapeInstance::smDetailAdjust *= mDesc->detailAdjust; // store current matrices + GFXFrustumSaver fsaver; GFXTransformSaver saver; - - // set projection to 90 degrees vertical and horizontal - F32 left, right, top, bottom; - MathUtils::makeFrustum(&left, &right, &top, &bottom, M_HALFPI_F, 1.0f, mDesc->nearDist); - GFX->setFrustum(left, right, bottom, top, mDesc->nearDist, mDesc->farDist); + { + // set projection to 90 degrees vertical and horizontal + F32 left, right, top, bottom; + MathUtils::makeFrustum(&left, &right, &top, &bottom, M_HALFPI_F, 1.0f, mDesc->nearDist); + GFX->setFrustum(left, right, bottom, top, mDesc->nearDist, mDesc->farDist); + } GFX->pushActiveRenderTarget(); - for (S32 i = 5; i >= 0; i--) { + for (S32 i = 0; i < 6; i++) { updateFace(params, i, explicitPostion); } GFX->popActiveRenderTarget(); @@ -352,7 +352,6 @@ void CubeReflector::updateReflection( const ReflectParams &params, Point3F expli mCubemap->generateMipMaps(); - gClientSceneGraph->setVisibleDistance(oldVisibleDist); mIsRendering = false; @@ -413,19 +412,19 @@ void CubeReflector::updateFace( const ReflectParams &params, U32 faceidx, Point3 GFX->setWorldMatrix(lightMatrix); GFX->clearTextureStateImmediate(0); - mRenderTarget->attachTexture( GFXTextureTarget::Color0, mCubemap, faceidx ); // Setup textures and targets... S32 texDim = mDesc->texSize; texDim = getMax(texDim, 32); + mRenderTarget->attachTexture(GFXTextureTarget::Color0, mCubemap, 0, 0, faceidx); // Setup textures and targets...
mRenderTarget->attachTexture(GFXTextureTarget::DepthStencil, LightShadowMap::_getDepthTarget(texDim, texDim)); - GFX->setActiveRenderTarget(mRenderTarget); + GFX->setActiveRenderTarget(mRenderTarget, true); GFX->clear( GFXClearStencil | GFXClearTarget | GFXClearZBuffer, gCanvasClearColor, 1.0f, 0); SceneRenderState reflectRenderState ( gClientSceneGraph, SPT_Reflect, - SceneCameraState::fromGFX() + SceneCameraState::fromGFXWithViewport(GFX->getViewport()) ); reflectRenderState.getMaterialDelegate().bind( REFLECTMGR, &ReflectionManager::getReflectionMaterial ); diff --git a/Engine/source/scene/reflector.h b/Engine/source/scene/reflector.h index 77c90cb4f..ed9051bac 100644 --- a/Engine/source/scene/reflector.h +++ b/Engine/source/scene/reflector.h @@ -145,7 +145,7 @@ class CubeReflector : public ReflectorBase public: CubeReflector(); - virtual ~CubeReflector() {} + virtual ~CubeReflector() { mCubemap.free(); } void registerReflector( SceneObject *inObject, ReflectorDesc *inDesc ); @@ -153,7 +153,7 @@ public: void unregisterReflector() override; void updateReflection( const ReflectParams &params, Point3F explicitPostion = Point3F::Max) override; - GFXCubemap* getCubemap() const { return mCubemap; } + GFXTexHandle getCubemap() const { return mCubemap; } void updateFace( const ReflectParams &params, U32 faceidx, Point3F explicitPostion = Point3F::Max); F32 calcFaceScore( const ReflectParams &params, U32 faceidx ); @@ -162,7 +162,7 @@ protected: GFXTexHandle mDepthBuff; GFXTextureTargetRef mRenderTarget; - GFXCubemapHandle mCubemap; + GFXTexHandle mCubemap; U32 mLastTexSize; class CubeFaceReflector : public ReflectorBase diff --git a/Engine/source/ts/tsRenderState.h b/Engine/source/ts/tsRenderState.h index eda8d5b3a..e30c50b6e 100644 --- a/Engine/source/ts/tsRenderState.h +++ b/Engine/source/ts/tsRenderState.h @@ -74,7 +74,7 @@ protected: const SceneRenderState *mState; - GFXCubemap *mCubemap; + GFXTexHandle mCubemap; /// Used to override the normal /// fade value of an object. @@ -134,8 +134,8 @@ public: void setSceneState( const SceneRenderState *state ) { mState = state; } ///@see mCubemap - GFXCubemap* getCubemap() const { return mCubemap; } - void setCubemap( GFXCubemap *cubemap ) { mCubemap = cubemap; } + GFXTexHandle getCubemap() const { return mCubemap; } + void setCubemap(GFXTexHandle cubemap ) { mCubemap = cubemap; } ///@see mFadeOverride F32 getFadeOverride() const { return mFadeOverride; }
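
Usage sketch (illustrative, not part of the patch): the hunks above converge on a single GFXTexHandle-based cubemap path. Cube textures are created through profiles flagged with GFXTextureProfile::CubeMap, filled per face with updateTextureSlot(), bound through the generic GFX->setTexture() call, and attached to render targets by (mip, zOffset, face). The fragment below strings those calls together in one place; the face-texture array faceTex, samplerRegister, renderTarget and face are placeholders, and the meaning of the trailing set() arguments is inferred from the call sites in this patch rather than from a verified API reference.

// Illustrative only: the unified GFXTexHandle cubemap flow used throughout this patch.
// faceTex, samplerRegister, renderTarget and face are assumed placeholders.
GFXTexHandle cubeTex;
cubeTex.set(faceTex[0]->getWidth(), faceTex[0]->getHeight(),   // per-face dimensions
            faceTex[0]->getFormat(),                           // face format
            &GFXCubemapStaticTextureProfile,                   // cube-flagged profile
            "ExampleCubemap",                                  // debug name
            faceTex[0]->getPointer()->getMipLevels());         // mip count, as in CubemapData::createMap()

// Faces are written slot by slot instead of via GFXCubemap::initStatic().
for (U32 i = 0; i < 6; i++)
   cubeTex->updateTextureSlot(faceTex[i], 0, i);

// Sampling binds through the generic texture path; setCubeTexture()/setCubeArrayTexture() are gone.
GFX->setTexture(samplerRegister, cubeTex);

// Render-to-face uses the common attachTexture(slot, tex, mip, zOffset, face) overload.
renderTarget->attachTexture(GFXTextureTarget::Color0, cubeTex, 0, 0, face);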