Summary of fixes completed:
1. ✅ Deferred D3D12SurfaceHandler creation to InitializeCUDA() when SetD3DDevice is called before Initialize 2. ✅ Fixed NV12ToRGBAConverter repeated reinitialization by adding IsInitialized() check before calling Initialize() 3. ✅ Test now successfully decodes 24 frames without resource thrashing Remaining issue (in test app, not VavCore): - RedSurfaceNVDECTest creates a new D3D12Resource for every frame instead of reusing a pool - This causes ExternalMemoryCache to create unlimited surface objects - Fix: Test app should reuse a small pool of textures (e.g., 3-5 textures for buffering)
This commit is contained in:
@@ -13,12 +13,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
|
||||
, m_surfaceWriteModule(nullptr)
|
||||
, m_surfaceWriteKernel(nullptr)
|
||||
{
|
||||
// Load surface write kernel
|
||||
// Load surface write kernel immediately since CUDA context is already active
|
||||
if (!LoadSurfaceWriteKernel()) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel during initialization");
|
||||
} else {
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully");
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Created");
|
||||
}
|
||||
|
||||
D3D12SurfaceHandler::~D3D12SurfaceHandler()
|
||||
@@ -303,7 +303,7 @@ bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
|
||||
{
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");
|
||||
|
||||
// Load PTX module from embedded string
|
||||
// Load PTX module from embedded string (context should already be active from NVDEC initialization)
|
||||
CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);
|
||||
|
||||
@@ -689,6 +689,34 @@ bool NVDECAV1Decoder::InitializeCUDA() {
|
||||
LOGF_DEBUG("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled");
|
||||
|
||||
LOGF_DEBUG("[InitializeCUDA] D3D12 device: %p, CUDA context: %p", m_d3d12Device, m_cuContext);
|
||||
|
||||
// If SetD3DDevice was called before Initialize, create handlers now
|
||||
if (!m_d3d12Handler) {
|
||||
LOGF_DEBUG("[InitializeCUDA] Creating D3D12 handlers (deferred from SetD3DDevice)");
|
||||
|
||||
// Set CUDA context before creating D3D12SurfaceHandler
|
||||
{
|
||||
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
|
||||
CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
|
||||
if (ctxResult != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[InitializeCUDA] cuCtxSetCurrent failed with code %d", ctxResult);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Create handlers
|
||||
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
|
||||
static_cast<ID3D12Device*>(m_d3d12Device),
|
||||
m_cuContext
|
||||
);
|
||||
|
||||
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
|
||||
|
||||
LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created");
|
||||
|
||||
m_cachedD3D12Device = m_d3d12Device;
|
||||
m_cachedCuContext = m_cuContext;
|
||||
}
|
||||
} else {
|
||||
LOGF_DEBUG("[InitializeCUDA] No D3D device, using CPU decoding path");
|
||||
}
|
||||
@@ -1128,29 +1156,41 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
|
||||
LOGF_DEBUG("[SetD3DDevice] D3D12 Fence already exists: %p", m_d3d12Fence);
|
||||
}
|
||||
|
||||
// Check if CUDA context has been created yet
|
||||
if (m_cuContext) {
|
||||
// CUDA context exists - create handlers now
|
||||
LOGF_DEBUG("[SetD3DDevice] CUDA context exists, creating D3D12 handlers immediately");
|
||||
|
||||
// (Re)create handlers since the device context is being set/changed.
|
||||
m_d3d12Handler.reset();
|
||||
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
|
||||
static_cast<ID3D12Device*>(m_d3d12Device),
|
||||
m_cuContext
|
||||
);
|
||||
// Set CUDA context BEFORE creating D3D12SurfaceHandler
|
||||
// This ensures that the module loaded in D3D12SurfaceHandler constructor
|
||||
// is bound to the correct context (m_cuContext)
|
||||
{
|
||||
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
|
||||
CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
|
||||
if (ctxResult != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[SetD3DDevice] cuCtxSetCurrent failed with code %d", ctxResult);
|
||||
return false;
|
||||
}
|
||||
LOGF_DEBUG("[SetD3DDevice] CUDA context set before creating D3D12SurfaceHandler");
|
||||
}
|
||||
|
||||
m_rgbaConverter.reset();
|
||||
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
|
||||
// (Re)create handlers since the device context is being set/changed
|
||||
m_d3d12Handler.reset();
|
||||
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
|
||||
static_cast<ID3D12Device*>(m_d3d12Device),
|
||||
m_cuContext
|
||||
);
|
||||
|
||||
LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
|
||||
m_rgbaConverter.reset();
|
||||
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
|
||||
|
||||
m_cachedD3D12Device = m_d3d12Device;
|
||||
m_cachedCuContext = m_cuContext;
|
||||
LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created.");
|
||||
|
||||
|
||||
// DON'T reinitialize if already initialized - this would invalidate existing decoded frames
|
||||
if (m_initialized) {
|
||||
LOGF_DEBUG("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface");
|
||||
LOGF_DEBUG("[SetD3DDevice] Note: D3D12 zero-copy will be available for DecodeToSurface calls");
|
||||
m_cachedD3D12Device = m_d3d12Device;
|
||||
m_cachedCuContext = m_cuContext;
|
||||
} else {
|
||||
LOGF_DEBUG("[SetD3DDevice] D3D12 device set, will be used when Initialize() is called");
|
||||
// CUDA context not created yet - handlers will be initialized after InitializeCUDA()
|
||||
LOGF_DEBUG("[SetD3DDevice] CUDA context not created yet, D3D12 handlers will be initialized after InitializeCUDA()");
|
||||
}
|
||||
|
||||
success = true;
|
||||
@@ -1405,15 +1445,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
// RGBA path: NV12 -> RGBA conversion
|
||||
LOGF_DEBUG("[DecodeToSurface] RGBA format detected, using NV12ToRGBAConverter");
|
||||
|
||||
// m_rgbaConverter is now created in SetD3D_Device.
|
||||
// The Initialize method should be idempotent or check internally if it's already initialized.
|
||||
if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
|
||||
LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
my_slot.in_use.store(false);
|
||||
m_returnCounter.fetch_add(1);
|
||||
m_fifoWaitCV.notify_all(); // Notify others that we are skipping
|
||||
return false;
|
||||
// Initialize converter only if not already initialized (to avoid repeated reinitialization on each frame)
|
||||
if (!m_rgbaConverter->IsInitialized()) {
|
||||
if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) {
|
||||
LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter");
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
my_slot.in_use.store(false);
|
||||
m_returnCounter.fetch_add(1);
|
||||
m_fifoWaitCV.notify_all(); // Notify others that we are skipping
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert NV12 to RGBA
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
// CUDA kernel to write RGBA buffer to surface object
|
||||
// This handles the tiled texture layout automatically
|
||||
// NOTE: cudaSurfaceObject_t and CUsurfObject are the same type (unsigned long long)
|
||||
// surf2Dwrite() works with both Runtime and Driver API surface objects
|
||||
|
||||
extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
|
||||
const unsigned char* __restrict__ rgba_buffer,
|
||||
cudaSurfaceObject_t rgba_surface,
|
||||
unsigned long long rgba_surface, // Works with both cudaSurfaceObject_t and CUsurfObject
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
unsigned int pitch
|
||||
|
||||
@@ -1,10 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
// Embedded PTX for RGBA surface write kernel
|
||||
// Generated from rgba_surface_write_kernel.cu
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
@@ -15,7 +10,7 @@ const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
|
||||
//
|
||||
|
||||
.version 9.0
|
||||
.target sm_86
|
||||
.target sm_75
|
||||
.address_size 64
|
||||
|
||||
// .globl WriteSurfaceFromBuffer_Kernel
|
||||
@@ -67,6 +62,5 @@ $L__BB0_2:
|
||||
ret;
|
||||
|
||||
}
|
||||
)PTX";
|
||||
|
||||
} // namespace VavCore
|
||||
)PTX";
|
||||
|
||||
Reference in New Issue
Block a user