diff --git a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp index c5710ee..ba7db4d 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp @@ -13,12 +13,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co , m_surfaceWriteModule(nullptr) , m_surfaceWriteKernel(nullptr) { - // Load surface write kernel + // Load surface write kernel immediately since CUDA context is already active if (!LoadSurfaceWriteKernel()) { - LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel"); + LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel during initialization"); + } else { + LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully"); } - - LOGF_DEBUG("[D3D12SurfaceHandler] Created"); } D3D12SurfaceHandler::~D3D12SurfaceHandler() @@ -303,7 +303,7 @@ bool D3D12SurfaceHandler::LoadSurfaceWriteKernel() { LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string"); - // Load PTX module from embedded string + // Load PTX module from embedded string (context should already be active from NVDEC initialization) CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX); if (result != CUDA_SUCCESS) { LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result); diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index 4181176..b610b23 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -689,6 +689,34 @@ bool NVDECAV1Decoder::InitializeCUDA() { LOGF_DEBUG("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled"); LOGF_DEBUG("[InitializeCUDA] D3D12 device: %p, CUDA context: %p", m_d3d12Device, m_cuContext); + + // If SetD3DDevice was called before Initialize, create handlers now + if (!m_d3d12Handler) { + LOGF_DEBUG("[InitializeCUDA] Creating D3D12 handlers (deferred from SetD3DDevice)"); + + // Set CUDA context before creating D3D12SurfaceHandler + { + std::lock_guard contextLock(m_cudaContextMutex); + CUresult ctxResult = cuCtxSetCurrent(m_cuContext); + if (ctxResult != CUDA_SUCCESS) { + LOGF_ERROR("[InitializeCUDA] cuCtxSetCurrent failed with code %d", ctxResult); + return false; + } + } + + // Create handlers + m_d3d12Handler = std::make_unique( + static_cast(m_d3d12Device), + m_cuContext + ); + + m_rgbaConverter = std::make_unique(); + + LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created"); + + m_cachedD3D12Device = m_d3d12Device; + m_cachedCuContext = m_cuContext; + } } else { LOGF_DEBUG("[InitializeCUDA] No D3D device, using CPU decoding path"); } @@ -1128,29 +1156,41 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) { LOGF_DEBUG("[SetD3DDevice] D3D12 Fence already exists: %p", m_d3d12Fence); } + // Check if CUDA context has been created yet + if (m_cuContext) { + // CUDA context exists - create handlers now + LOGF_DEBUG("[SetD3DDevice] CUDA context exists, creating D3D12 handlers immediately"); - // (Re)create handlers since the device context is being set/changed. - m_d3d12Handler.reset(); - m_d3d12Handler = std::make_unique( - static_cast(m_d3d12Device), - m_cuContext - ); + // Set CUDA context BEFORE creating D3D12SurfaceHandler + // This ensures that the module loaded in D3D12SurfaceHandler constructor + // is bound to the correct context (m_cuContext) + { + std::lock_guard contextLock(m_cudaContextMutex); + CUresult ctxResult = cuCtxSetCurrent(m_cuContext); + if (ctxResult != CUDA_SUCCESS) { + LOGF_ERROR("[SetD3DDevice] cuCtxSetCurrent failed with code %d", ctxResult); + return false; + } + LOGF_DEBUG("[SetD3DDevice] CUDA context set before creating D3D12SurfaceHandler"); + } - m_rgbaConverter.reset(); - m_rgbaConverter = std::make_unique(); + // (Re)create handlers since the device context is being set/changed + m_d3d12Handler.reset(); + m_d3d12Handler = std::make_unique( + static_cast(m_d3d12Device), + m_cuContext + ); - LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created."); + m_rgbaConverter.reset(); + m_rgbaConverter = std::make_unique(); - m_cachedD3D12Device = m_d3d12Device; - m_cachedCuContext = m_cuContext; + LOGF_DEBUG("[SetD3DDevice] D3D12SurfaceHandler and NV12ToRGBAConverter instances created."); - - // DON'T reinitialize if already initialized - this would invalidate existing decoded frames - if (m_initialized) { - LOGF_DEBUG("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface"); - LOGF_DEBUG("[SetD3DDevice] Note: D3D12 zero-copy will be available for DecodeToSurface calls"); + m_cachedD3D12Device = m_d3d12Device; + m_cachedCuContext = m_cuContext; } else { - LOGF_DEBUG("[SetD3DDevice] D3D12 device set, will be used when Initialize() is called"); + // CUDA context not created yet - handlers will be initialized after InitializeCUDA() + LOGF_DEBUG("[SetD3DDevice] CUDA context not created yet, D3D12 handlers will be initialized after InitializeCUDA()"); } success = true; @@ -1405,15 +1445,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ // RGBA path: NV12 -> RGBA conversion LOGF_DEBUG("[DecodeToSurface] RGBA format detected, using NV12ToRGBAConverter"); - // m_rgbaConverter is now created in SetD3D_Device. - // The Initialize method should be idempotent or check internally if it's already initialized. - if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) { - LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter"); - cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); - my_slot.in_use.store(false); - m_returnCounter.fetch_add(1); - m_fifoWaitCV.notify_all(); // Notify others that we are skipping - return false; + // Initialize converter only if not already initialized (to avoid repeated reinitialization on each frame) + if (!m_rgbaConverter->IsInitialized()) { + if (!m_rgbaConverter->Initialize(m_width, m_height, m_stream)) { + LOGF_ERROR("[DecodeToSurface] Failed to initialize NV12ToRGBAConverter"); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + my_slot.in_use.store(false); + m_returnCounter.fetch_add(1); + m_fifoWaitCV.notify_all(); // Notify others that we are skipping + return false; + } } // Convert NV12 to RGBA diff --git a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu index 374f140..274f553 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu +++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu @@ -1,9 +1,11 @@ // CUDA kernel to write RGBA buffer to surface object // This handles the tiled texture layout automatically +// NOTE: cudaSurfaceObject_t and CUsurfObject are the same type (unsigned long long) +// surf2Dwrite() works with both Runtime and Driver API surface objects extern "C" __global__ void WriteSurfaceFromBuffer_Kernel( const unsigned char* __restrict__ rgba_buffer, - cudaSurfaceObject_t rgba_surface, + unsigned long long rgba_surface, // Works with both cudaSurfaceObject_t and CUsurfObject unsigned int width, unsigned int height, unsigned int pitch diff --git a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h index dd9992a..5981a5e 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h +++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h @@ -1,10 +1,5 @@ #pragma once -// Embedded PTX for RGBA surface write kernel -// Generated from rgba_surface_write_kernel.cu - -namespace VavCore { - const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX( // // Generated by NVIDIA NVVM Compiler @@ -15,7 +10,7 @@ const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX( // .version 9.0 -.target sm_86 +.target sm_75 .address_size 64 // .globl WriteSurfaceFromBuffer_Kernel @@ -67,6 +62,5 @@ $L__BB0_2: ret; } -)PTX"; -} // namespace VavCore +)PTX";