Summary of fixes completed:

1. ✅ Deferred D3D12SurfaceHandler creation to InitializeCUDA() when SetD3DDevice is called before Initialize 2. ✅ Fixed NV12ToRGBAConverter repeated reinitialization by adding IsInitialized() check before calling Initialize() 3. ✅ Test now successfully decodes 24 frames without resource thrashing Remaining issue (in test app, not VavCore): - RedSurfaceNVDECTest creates a new D3D12Resource for every frame instead of reusing a pool - This causes ExternalMemoryCache to create unlimited surface objects - Fix: Test app should reuse a small pool of textures (e.g., 3-5 textures for buffering)
2025-10-07 04:49:13 +09:00
parent f3fc17c796
commit ce71a38d59
4 changed files with 77 additions and 40 deletions
--- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
@@ -13,12 +13,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
    , m_surfaceWriteModule(nullptr)
    , m_surfaceWriteKernel(nullptr)
 {
-    // Load surface write kernel
+    // Load surface write kernel immediately since CUDA context is already active
    if (!LoadSurfaceWriteKernel()) {
-        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel during initialization");
+    } else {
+        LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully");
    }
-
-    LOGF_DEBUG("[D3D12SurfaceHandler] Created");
 }

 D3D12SurfaceHandler::~D3D12SurfaceHandler()
@@ -303,7 +303,7 @@ bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
 {
    LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");

-    // Load PTX module from embedded string
+    // Load PTX module from embedded string (context should already be active from NVDEC initialization)
    CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
    if (result != CUDA_SUCCESS) {
        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
@@ -689,6 +689,34 @@ bool NVDECAV1Decoder::InitializeCUDA() {
        LOGF_DEBUG("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled");

        LOGF_DEBUG("[InitializeCUDA] D3D12 device: %p, CUDA context: %p", m_d3d12Device, m_cuContext);
+
+        // If SetD3DDevice was called before Initialize, create handlers now
+        if (!m_d3d12Handler) {
+            LOGF_DEBUG("[InitializeCUDA] Creating D3D12 handlers (deferred from SetD3DDevice)");
+
+            // Set CUDA context before creating D3D12SurfaceHandler
+            {
+                std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
+                CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
+                if (ctxResult != CUDA_SUCCESS) {
+                    LOGF_ERROR("[InitializeCUDA] cuCtxSetCurrent failed with code %d", ctxResult);
+                    return false;
+                }
+            }
+
+            // Create handlers
+            m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
+                static_cast<ID3D12Device*>(m_d3d12Device),
+                m_cuContext
+            );
+
+            m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
+
+            LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created");
+
+            m_cachedD3D12Device = m_d3d12Device;
+            m_cachedCuContext = m_cuContext;
+        }
    } else {
        LOGF_DEBUG("[InitializeCUDA] No D3D device, using CPU decoding path");
    }
@@ -1128,29 +1156,41 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
                LOGF_DEBUG("[SetD3DDevice] D3D12 Fence already exists: %p", m_d3d12Fence);
            }

+            // Check if CUDA context has been created yet
+            if (m_cuContext) {
+                // CUDA context exists - create handlers now
+                LOGF_DEBUG("[SetD3DDevice] CUDA context exists, creating D3D12 handlers immediately");

-            // (Re)create handlers since the device context is being set/changed.
-            m_d3d12Handler.reset();
-            m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
-                static_cast<ID3D12Device*>(m_d3d12Device),
-                m_cuContext
-            );
+                // Set CUDA context BEFORE creating D3D12SurfaceHandler
+                // This ensures that the module loaded in D3D12SurfaceHandler constructor
+                // is bound to the correct context (m_cuContext)
+                {
+                    std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
+                    CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
+                    if (ctxResult != CUDA_SUCCESS) {
+                        LOGF_ERROR("[SetD3DDevice] cuCtxSetCurrent failed with code %d", ctxResult);
+                        return false;
+                    }
+                    LOGF_DEBUG("[SetD3DDevice] CUDA context set before creating D3D12SurfaceHandler");
+                }

-            m_rgbaConverter.reset();
-            m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
+                // (Re)create handlers since the device context is being set/changed
+                m_d3d12Handler.reset();
+                m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
+                    static_cast<ID3D12Device*>(m_d3d12Device),
+                    m_cuContext
+                );

-            LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
+                m_rgbaConverter.reset();
+                m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();

-            m_cachedD3D12Device = m_d3d12Device;
-            m_cachedCuContext = m_cuContext;
+                LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");

-
-            // DON'T reinitialize if already initialized - this would invalidate existing decoded frames
-            if (m_initialized) {
-                LOGF_DEBUG("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface");
-                LOGF_DEBUG("[SetD3DDevice] Note: D3D12 zero-copy will be available for DecodeToSurface calls");
+                m_cachedD3D12Device = m_d3d12Device;
+                m_cachedCuContext = m_cuContext;
            } else {
-                LOGF_DEBUG("[SetD3DDevice] D3D12 device set, will be used when Initialize() is called");
+                // CUDA context not created yet - handlers will be initialized after InitializeCUDA()
+                LOGF_DEBUG("[SetD3DDevice] CUDA context not created yet, D3D12 handlers will be initialized after InitializeCUDA()");
            }

            success = true;
@@ -1405,15 +1445,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                // RGBA path: NV12 -> RGBA conversion
                LOGF_DEBUG("[DecodeToSurface] RGBA format detected, using NV12ToRGBAConverter");

-                // m_rgbaConverter is now created in SetD3D_Device.
-                // The Initialize method should be idempotent or check internally if it's already initialized.
-                if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
-                    LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
-                    cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
-                    my_slot.in_use.store(false);
-                    m_returnCounter.fetch_add(1);
-                    m_fifoWaitCV.notify_all(); // Notify others that we are skipping
-                    return false;
+                // Initialize converter only if not already initialized (to avoid repeated reinitialization on each frame)
+                if (!m_rgbaConverter->IsInitialized()) {
+                    if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
+                        LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
+                        cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+                        my_slot.in_use.store(false);
+                        m_returnCounter.fetch_add(1);
+                        m_fifoWaitCV.notify_all(); // Notify others that we are skipping
+                        return false;
+                    }
                }

                // Convert NV12 to RGBA
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
@@ -1,9 +1,11 @@
 // CUDA kernel to write RGBA buffer to surface object
 // This handles the tiled texture layout automatically
+// NOTE: cudaSurfaceObject_t and CUsurfObject are the same type (unsigned long long)
+// surf2Dwrite() works with both Runtime and Driver API surface objects

 extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
    const unsigned char* __restrict__ rgba_buffer,
-    cudaSurfaceObject_t rgba_surface,
+    unsigned long long rgba_surface,  // Works with both cudaSurfaceObject_t and CUsurfObject
    unsigned int width,
    unsigned int height,
    unsigned int pitch
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
@@ -1,10 +1,5 @@
 #pragma once

-// Embedded PTX for RGBA surface write kernel
-// Generated from rgba_surface_write_kernel.cu
-
-namespace VavCore {
-
 const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
 //
 // Generated by NVIDIA NVVM Compiler
@@ -15,7 +10,7 @@ const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
 //

 .version 9.0
-.target sm_86
+.target sm_75
 .address_size 64

 	// .globl	WriteSurfaceFromBuffer_Kernel
@@ -67,6 +62,5 @@ $L__BB0_2:
 	ret;

 }
-)PTX";

-} // namespace VavCore
+)PTX";