diff --git a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
index c5710ee..ba7db4d 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
@@ -13,12 +13,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
     , m_surfaceWriteModule(nullptr)
     , m_surfaceWriteKernel(nullptr)
 {
-    // Load surface write kernel
+    // Load surface write kernel immediately since CUDA context is already active
     if (!LoadSurfaceWriteKernel()) {
-        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel during initialization");
+    } else {
+        LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully");
     }
-
-    LOGF_DEBUG("[D3D12SurfaceHandler] Created");
 }
 
 D3D12SurfaceHandler::~D3D12SurfaceHandler()
@@ -303,7 +303,7 @@ bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
 {
     LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");
 
-    // Load PTX module from embedded string
+    // Load PTX module from embedded string (context should already be active from NVDEC initialization)
     CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
     if (result != CUDA_SUCCESS) {
         LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
index 4181176..b610b23 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
@@ -689,6 +689,34 @@ bool NVDECAV1Decoder::InitializeCUDA() {
         LOGF_DEBUG("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled");
 
         LOGF_DEBUG("[InitializeCUDA] D3D12 device: %p, CUDA context: %p", m_d3d12Device, m_cuContext);
+
+        // If SetD3DDevice was called before Initialize, create handlers now
+        if (!m_d3d12Handler) {
+            LOGF_DEBUG("[InitializeCUDA] Creating D3D12 handlers (deferred from SetD3DDevice)");
+
+            // Set CUDA context before creating D3D12SurfaceHandler
+            {
+                std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
+                CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
+                if (ctxResult != CUDA_SUCCESS) {
+                    LOGF_ERROR("[InitializeCUDA] cuCtxSetCurrent failed with code %d", ctxResult);
+                    return false;
+                }
+            }
+
+            // Create handlers
+            m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
+                static_cast<ID3D12Device*>(m_d3d12Device),
+                m_cuContext
+            );
+
+            m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
+
+            LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created");
+
+            m_cachedD3D12Device = m_d3d12Device;
+            m_cachedCuContext = m_cuContext;
+        }
     } else {
         LOGF_DEBUG("[InitializeCUDA] No D3D device, using CPU decoding path");
     }
@@ -1128,29 +1156,41 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
                 LOGF_DEBUG("[SetD3DDevice] D3D12 Fence already exists: %p", m_d3d12Fence);
             }
 
+            // Check if CUDA context has been created yet
+            if (m_cuContext) {
+                // CUDA context exists - create handlers now
+                LOGF_DEBUG("[SetD3DDevice] CUDA context exists, creating D3D12 handlers immediately");
 
-            // (Re)create handlers since the device context is being set/changed.
-            m_d3d12Handler.reset();
-            m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
-                static_cast<ID3D12Device*>(m_d3d12Device),
-                m_cuContext
-            );
+                // Set CUDA context BEFORE creating D3D12SurfaceHandler
+                // This ensures that the module loaded in D3D12SurfaceHandler constructor
+                // is bound to the correct context (m_cuContext)
+                {
+                    std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
+                    CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
+                    if (ctxResult != CUDA_SUCCESS) {
+                        LOGF_ERROR("[SetD3DDevice] cuCtxSetCurrent failed with code %d", ctxResult);
+                        return false;
+                    }
+                    LOGF_DEBUG("[SetD3DDevice] CUDA context set before creating D3D12SurfaceHandler");
+                }
 
-            m_rgbaConverter.reset();
-            m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
+                // (Re)create handlers since the device context is being set/changed
+                m_d3d12Handler.reset();
+                m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
+                    static_cast<ID3D12Device*>(m_d3d12Device),
+                    m_cuContext
+                );
 
-            LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
+                m_rgbaConverter.reset();
+                m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
 
-            m_cachedD3D12Device = m_d3d12Device;
-            m_cachedCuContext = m_cuContext;
+                LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
 
-
-            // DON'T reinitialize if already initialized - this would invalidate existing decoded frames
-            if (m_initialized) {
-                LOGF_DEBUG("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface");
-                LOGF_DEBUG("[SetD3DDevice] Note: D3D12 zero-copy will be available for DecodeToSurface calls");
+                m_cachedD3D12Device = m_d3d12Device;
+                m_cachedCuContext = m_cuContext;
             } else {
-                LOGF_DEBUG("[SetD3DDevice] D3D12 device set, will be used when Initialize() is called");
+                // CUDA context not created yet - handlers will be initialized after InitializeCUDA()
+                LOGF_DEBUG("[SetD3DDevice] CUDA context not created yet, D3D12 handlers will be initialized after InitializeCUDA()");
             }
 
             success = true;
@@ -1405,15 +1445,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                 // RGBA path: NV12 -> RGBA conversion
                 LOGF_DEBUG("[DecodeToSurface] RGBA format detected, using NV12ToRGBAConverter");
 
-                // m_rgbaConverter is now created in SetD3D_Device.
-                // The Initialize method should be idempotent or check internally if it's already initialized.
-                if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
-                    LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
-                    cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
-                    my_slot.in_use.store(false);
-                    m_returnCounter.fetch_add(1);
-                    m_fifoWaitCV.notify_all(); // Notify others that we are skipping
-                    return false;
+                // Initialize converter only if not already initialized (to avoid repeated reinitialization on each frame)
+                if (!m_rgbaConverter->IsInitialized()) {
+                    if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
+                        LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
+                        cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+                        my_slot.in_use.store(false);
+                        m_returnCounter.fetch_add(1);
+                        m_fifoWaitCV.notify_all(); // Notify others that we are skipping
+                        return false;
+                    }
                 }
 
                 // Convert NV12 to RGBA
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
index 374f140..274f553 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
@@ -1,9 +1,11 @@
 // CUDA kernel to write RGBA buffer to surface object
 // This handles the tiled texture layout automatically
+// NOTE: cudaSurfaceObject_t and CUsurfObject are the same type (unsigned long long)
+// surf2Dwrite() works with both Runtime and Driver API surface objects
 
 extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
     const unsigned char* __restrict__ rgba_buffer,
-    cudaSurfaceObject_t rgba_surface,
+    unsigned long long rgba_surface,  // Works with both cudaSurfaceObject_t and CUsurfObject
     unsigned int width,
     unsigned int height,
     unsigned int pitch
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
index dd9992a..5981a5e 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
@@ -1,10 +1,5 @@
 #pragma once
 
-// Embedded PTX for RGBA surface write kernel
-// Generated from rgba_surface_write_kernel.cu
-
-namespace VavCore {
-
 const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
 //
 // Generated by NVIDIA NVVM Compiler
@@ -15,7 +10,7 @@ const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
 //
 
 .version 9.0
-.target sm_86
+.target sm_75
 .address_size 64
 
 	// .globl	WriteSurfaceFromBuffer_Kernel
@@ -67,6 +62,5 @@ $L__BB0_2:
 	ret;
 
 }
-)PTX";
 
-} // namespace VavCore
+)PTX";