Summary of fixes completed:

1.  Deferred D3D12SurfaceHandler creation to InitializeCUDA() when SetD3DDevice is called before Initialize
  2.  Fixed NV12ToRGBAConverter repeated reinitialization by adding IsInitialized() check before calling
  Initialize()
  3.  Test now successfully decodes 24 frames without resource thrashing

  Remaining issue (in test app, not VavCore):
  - RedSurfaceNVDECTest creates a new D3D12Resource for every frame instead of reusing a pool
  - This causes ExternalMemoryCache to create unlimited surface objects
  - Fix: Test app should reuse a small pool of textures (e.g., 3-5 textures for buffering)
This commit is contained in:
2025-10-07 04:49:13 +09:00
parent f3fc17c796
commit ce71a38d59
4 changed files with 77 additions and 40 deletions

View File

@@ -13,12 +13,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
, m_surfaceWriteModule(nullptr)
, m_surfaceWriteKernel(nullptr)
{
// Load surface write kernel
// Load surface write kernel immediately since CUDA context is already active
if (!LoadSurfaceWriteKernel()) {
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel during initialization");
} else {
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully");
}
LOGF_DEBUG("[D3D12SurfaceHandler] Created");
}
D3D12SurfaceHandler::~D3D12SurfaceHandler()
@@ -303,7 +303,7 @@ bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
{
LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");
// Load PTX module from embedded string
// Load PTX module from embedded string (context should already be active from NVDEC initialization)
CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
if (result != CUDA_SUCCESS) {
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);

View File

@@ -689,6 +689,34 @@ bool NVDECAV1Decoder::InitializeCUDA() {
LOGF_DEBUG("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled");
LOGF_DEBUG("[InitializeCUDA] D3D12 device: %p, CUDA context: %p", m_d3d12Device, m_cuContext);
// If SetD3DDevice was called before Initialize, create handlers now
if (!m_d3d12Handler) {
LOGF_DEBUG("[InitializeCUDA] Creating D3D12 handlers (deferred from SetD3DDevice)");
// Set CUDA context before creating D3D12SurfaceHandler
{
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
if (ctxResult != CUDA_SUCCESS) {
LOGF_ERROR("[InitializeCUDA] cuCtxSetCurrent failed with code %d", ctxResult);
return false;
}
}
// Create handlers
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
static_cast<ID3D12Device*>(m_d3d12Device),
m_cuContext
);
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created");
m_cachedD3D12Device = m_d3d12Device;
m_cachedCuContext = m_cuContext;
}
} else {
LOGF_DEBUG("[InitializeCUDA] No D3D device, using CPU decoding path");
}
@@ -1128,29 +1156,41 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
LOGF_DEBUG("[SetD3DDevice] D3D12 Fence already exists: %p", m_d3d12Fence);
}
// Check if CUDA context has been created yet
if (m_cuContext) {
// CUDA context exists - create handlers now
LOGF_DEBUG("[SetD3DDevice] CUDA context exists, creating D3D12 handlers immediately");
// (Re)create handlers since the device context is being set/changed.
m_d3d12Handler.reset();
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
static_cast<ID3D12Device*>(m_d3d12Device),
m_cuContext
);
// Set CUDA context BEFORE creating D3D12SurfaceHandler
// This ensures that the module loaded in D3D12SurfaceHandler constructor
// is bound to the correct context (m_cuContext)
{
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
if (ctxResult != CUDA_SUCCESS) {
LOGF_ERROR("[SetD3DDevice] cuCtxSetCurrent failed with code %d", ctxResult);
return false;
}
LOGF_DEBUG("[SetD3DDevice] CUDA context set before creating D3D12SurfaceHandler");
}
m_rgbaConverter.reset();
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
// (Re)create handlers since the device context is being set/changed
m_d3d12Handler.reset();
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
static_cast<ID3D12Device*>(m_d3d12Device),
m_cuContext
);
LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
m_rgbaConverter.reset();
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
m_cachedD3D12Device = m_d3d12Device;
m_cachedCuContext = m_cuContext;
LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
// DON'T reinitialize if already initialized - this would invalidate existing decoded frames
if (m_initialized) {
LOGF_DEBUG("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface");
LOGF_DEBUG("[SetD3DDevice] Note: D3D12 zero-copy will be available for DecodeToSurface calls");
m_cachedD3D12Device = m_d3d12Device;
m_cachedCuContext = m_cuContext;
} else {
LOGF_DEBUG("[SetD3DDevice] D3D12 device set, will be used when Initialize() is called");
// CUDA context not created yet - handlers will be initialized after InitializeCUDA()
LOGF_DEBUG("[SetD3DDevice] CUDA context not created yet, D3D12 handlers will be initialized after InitializeCUDA()");
}
success = true;
@@ -1405,15 +1445,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
// RGBA path: NV12 -> RGBA conversion
LOGF_DEBUG("[DecodeToSurface] RGBA format detected, using NV12ToRGBAConverter");
// m_rgbaConverter is now created in SetD3D_Device.
// The Initialize method should be idempotent or check internally if it's already initialized.
if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
my_slot.in_use.store(false);
m_returnCounter.fetch_add(1);
m_fifoWaitCV.notify_all(); // Notify others that we are skipping
return false;
// Initialize converter only if not already initialized (to avoid repeated reinitialization on each frame)
if (!m_rgbaConverter->IsInitialized()) {
if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
my_slot.in_use.store(false);
m_returnCounter.fetch_add(1);
m_fifoWaitCV.notify_all(); // Notify others that we are skipping
return false;
}
}
// Convert NV12 to RGBA

View File

@@ -1,9 +1,11 @@
// CUDA kernel to write RGBA buffer to surface object
// This handles the tiled texture layout automatically
// NOTE: cudaSurfaceObject_t and CUsurfObject are the same type (unsigned long long)
// surf2Dwrite() works with both Runtime and Driver API surface objects
extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
const unsigned char* __restrict__ rgba_buffer,
cudaSurfaceObject_t rgba_surface,
unsigned long long rgba_surface, // Works with both cudaSurfaceObject_t and CUsurfObject
unsigned int width,
unsigned int height,
unsigned int pitch

View File

@@ -1,10 +1,5 @@
#pragma once
// Embedded PTX for RGBA surface write kernel
// Generated from rgba_surface_write_kernel.cu
namespace VavCore {
const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
//
// Generated by NVIDIA NVVM Compiler
@@ -15,7 +10,7 @@ const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
//
.version 9.0
.target sm_86
.target sm_75
.address_size 64
// .globl WriteSurfaceFromBuffer_Kernel
@@ -67,6 +62,5 @@ $L__BB0_2:
ret;
}
)PTX";
} // namespace VavCore
)PTX";