This commit is contained in:
2025-10-03 16:21:08 +09:00
parent 92e2e6464a
commit 47a93404a0
3 changed files with 11 additions and 352 deletions

View File

@@ -32,36 +32,22 @@ bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame,
}
// Get D3D12 texture layout for both Y and UV planes
// CRITICAL: Must use the ACTUAL texture descriptor as-is (including any padding)
// DO NOT override height - GetCopyableFootprints needs the real allocation size
D3D12_RESOURCE_DESC desc = dst_texture->GetDesc();
D3D12_PLACED_SUBRESOURCE_FOOTPRINT layouts[2];
UINT num_rows[2] = {0};
UINT64 row_sizes[2] = {0};
UINT64 total_bytes = 0;
// CRITICAL: Use actual video height, not texture allocation height
// Texture was allocated with height * 1.5 for NV12 layout, but GetCopyableFootprints
// should use actual video height to calculate correct plane offsets
D3D12_RESOURCE_DESC actual_video_desc = desc;
actual_video_desc.Height = height; // Use actual video height, not allocation height
// CRITICAL: Query BOTH subresources in a SINGLE call to get correct relative offsets
// NV12 has 2 subresources (plane 0: Y, plane 1: UV)
m_device->GetCopyableFootprints(&actual_video_desc, 0, 2, 0,
// Use descriptor directly without modification (preserves allocated height/pitch)
m_device->GetCopyableFootprints(&desc, 0, 2, 0,
layouts, num_rows, row_sizes, &total_bytes);
uint32_t y_dst_pitch = layouts[0].Footprint.RowPitch;
uint32_t uv_dst_pitch = layouts[1].Footprint.RowPitch;
UINT64 uv_offset = layouts[1].Offset;
char buf[512];
sprintf_s(buf, "[D3D12SurfaceHandler] Y: width=%u, height=%u, srcPitch=%u, dstPitch=%u\n",
width, height, src_pitch, y_dst_pitch);
OutputDebugStringA(buf);
sprintf_s(buf, "[D3D12SurfaceHandler] UV: width=%u, height=%u, srcPitch=%u, dstPitch=%u, offset=%llu\n",
width, height / 2, src_pitch, uv_dst_pitch, uv_offset);
OutputDebugStringA(buf);
// Copy Y plane
if (!CopyYPlane(src_frame, src_pitch,
dst_ptr, y_dst_pitch,
@@ -70,7 +56,7 @@ bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame,
}
// Copy UV plane
CUdeviceptr src_uv = src_frame + (src_pitch * height);
CUdeviceptr src_uv = src_frame + (static_cast<UINT64>(src_pitch) * height);
CUdeviceptr dst_uv = dst_ptr + uv_offset;
if (!CopyUVPlane(src_uv, src_pitch,
@@ -79,7 +65,6 @@ bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame,
return false;
}
OutputDebugStringA("[D3D12SurfaceHandler] NV12 frame copied successfully\n");
return true;
}
@@ -99,12 +84,10 @@ bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
CUdeviceptr dst, uint32_t dst_pitch,
uint32_t width, uint32_t height)
{
// Copy Y plane: single 8-bit channel
// Width parameter is in bytes (same as pixel width for 8-bit format)
cudaError_t err = cudaMemcpy2D(
(void*)dst, dst_pitch,
(void*)src, src_pitch,
width, height,
width, height, // Use the logical width, not the pitch, for the copy width.
cudaMemcpyDeviceToDevice
);
@@ -115,8 +98,6 @@ bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
OutputDebugStringA(buf);
return false;
}
OutputDebugStringA("[D3D12SurfaceHandler] Y plane copied\n");
return true;
}
@@ -124,13 +105,11 @@ bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
CUdeviceptr dst, uint32_t dst_pitch,
uint32_t width, uint32_t height)
{
// Copy UV plane: interleaved U and V (NV12 format)
// Width in bytes = width of Y plane (because U and V are interleaved)
// Height = half of Y plane height
// For interleaved NV12, the byte width of the UV plane is the same as the Y plane.
cudaError_t err = cudaMemcpy2D(
(void*)dst, dst_pitch,
(void*)src, src_pitch,
width, height,
width, height, // Use the logical width, not the pitch, for the copy width.
cudaMemcpyDeviceToDevice
);
@@ -141,8 +120,6 @@ bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
OutputDebugStringA(buf);
return false;
}
OutputDebugStringA("[D3D12SurfaceHandler] UV plane copied\n");
return true;
}

View File

@@ -546,12 +546,7 @@ bool NVDECAV1Decoder::InitializeCUDA() {
}
// If D3D device is set, enable D3D interop on this context
if (m_d3d11Device) {
OutputDebugStringA("[InitializeCUDA] D3D11 device detected, CUDA-D3D11 interop will be enabled\n");
char debug_buf[256];
sprintf_s(debug_buf, "[InitializeCUDA] D3D11 device: %p, CUDA context: %p\n", m_d3d11Device, m_cuContext);
OutputDebugStringA(debug_buf);
} else if (m_d3d12Device) {
if (m_d3d12Device) {
OutputDebugStringA("[InitializeCUDA] D3D12 device detected, CUDA-D3D12 interop will be enabled\n");
char debug_buf[256];
sprintf_s(debug_buf, "[InitializeCUDA] D3D12 device: %p, CUDA context: %p\n", m_d3d12Device, m_cuContext);
@@ -783,12 +778,11 @@ bool NVDECAV1Decoder::SupportsSurfaceType(VavCoreSurfaceType type) const {
switch (type) {
case VAVCORE_SURFACE_CUDA_DEVICE:
return true; // NVDEC's primary strength - CUDA device pointers
case VAVCORE_SURFACE_D3D11_TEXTURE:
return true; // Via CUDA-D3D11 interop
case VAVCORE_SURFACE_D3D12_RESOURCE:
return true; // Via CUDA-D3D12 interop
case VAVCORE_SURFACE_CPU:
return true; // Always support CPU fallback
case VAVCORE_SURFACE_D3D11_TEXTURE:
case VAVCORE_SURFACE_AMF_SURFACE:
default:
return false;
@@ -803,25 +797,7 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
try {
bool success = false;
if (type == VAVCORE_SURFACE_D3D11_TEXTURE) {
char debug_buf[256];
sprintf_s(debug_buf, "[SetD3DDevice] Setting D3D11 device: %p\n", d3d_device);
OutputDebugStringA(debug_buf);
// Store D3D11 device
m_d3d11Device = d3d_device;
m_preferredSurfaceType = type;
// DON'T reinitialize if already initialized - this would invalidate existing decoded frames
if (m_initialized) {
OutputDebugStringA("[SetD3DDevice] Decoder already initialized, D3D11 device set for DecodeToSurface\n");
OutputDebugStringA("[SetD3DDevice] Note: D3D11 zero-copy will be available for DecodeToSurface calls\n");
} else {
OutputDebugStringA("[SetD3DDevice] D3D11 device set, will be used when Initialize() is called\n");
}
success = true;
} else if (type == VAVCORE_SURFACE_D3D12_RESOURCE) {
if (type == VAVCORE_SURFACE_D3D12_RESOURCE) {
char debug_buf[256];
sprintf_s(debug_buf, "[SetD3DDevice] Setting D3D12 device: %p\n", d3d_device);
OutputDebugStringA(debug_buf);
@@ -1013,153 +989,6 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
// Unmap the frame (this is just for getting device pointer, actual usage will be later)
cuvidUnmapVideoFrame(m_decoder, devicePtr);
} else if (target_type == VAVCORE_SURFACE_D3D11_TEXTURE) {
// For D3D11 texture, we need CUDA-D3D interop
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D11 texture path\n");
if (!target_surface || !m_d3d11Device) {
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] ERROR: target_surface or m_d3d11Device is null\n");
LogError("D3D11 texture or device not available");
return false;
}
char debug_buf[256];
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] target_surface=%p, m_d3d11Device=%p\n",
target_surface, m_d3d11Device);
OutputDebugStringA(debug_buf);
// Register D3D texture with CUDA
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Calling RegisterD3DResourceWithCUDA...\n");
bool registerSuccess = RegisterD3DResourceWithCUDA(target_surface);
if (!registerSuccess) {
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] ERROR: RegisterD3DResourceWithCUDA failed\n");
LogError("Failed to register D3D texture with CUDA");
return false;
}
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] RegisterD3DResourceWithCUDA succeeded\n");
// Wait for decoded frame to be available (blocks until callback completes)
int frameIdx = -1;
{
std::unique_lock<std::mutex> lock(m_frameQueueMutex);
m_frameAvailable.wait(lock, [this] {
return !m_decodedFrameIndices.empty();
});
frameIdx = m_decodedFrameIndices.front();
m_decodedFrameIndices.pop();
}
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Using decoded frame from queue: frameIdx=%d\n", frameIdx);
OutputDebugStringA(debug_buf);
// Map decoded NVDEC frame
CUVIDPROCPARAMS procParams = {};
procParams.progressive_frame = 1;
procParams.top_field_first = 0;
procParams.unpaired_field = 0;
CUdeviceptr srcDevicePtr = 0;
unsigned int srcPitch = 0;
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Calling cuvidMapVideoFrame (frameIdx=%d)\n", frameIdx);
OutputDebugStringA(debug_buf);
result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
if (result == CUDA_SUCCESS) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidMapVideoFrame succeeded: srcDevicePtr=%p, srcPitch=%u\n",
(void*)srcDevicePtr, srcPitch);
OutputDebugStringA(debug_buf);
} else {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidMapVideoFrame failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuvidMapVideoFrame");
return false;
}
// Map D3D texture to CUDA array
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Calling MapCUDADevicePtrFromD3D...\n");
CUdeviceptr dstDevicePtr;
size_t dstPitch;
bool mapSuccess = MapCUDADevicePtrFromD3D(dstDevicePtr, dstPitch);
if (!mapSuccess) {
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] ERROR: MapCUDADevicePtrFromD3D failed\n");
LogError("Failed to map CUDA device pointer from D3D");
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
return false;
}
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] MapCUDADevicePtrFromD3D succeeded\n");
// NV12 D3D11 texture: Get CUDA array (entire NV12 surface including Y and UV)
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Getting CUDA array for NV12 texture...\n");
CUarray cudaArray;
result = cuGraphicsSubResourceGetMappedArray(&cudaArray, m_cudaGraphicsResource, 0, 0);
if (result != CUDA_SUCCESS) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuGraphicsSubResourceGetMappedArray failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuGraphicsSubResourceGetMappedArray");
UnmapCUDADevicePtr();
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
return false;
}
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Got CUDA array: %p\n", cudaArray);
OutputDebugStringA(debug_buf);
// Copy Y plane from NVDEC to D3D11 NV12 texture
CUDA_MEMCPY2D copyParamsY = {};
copyParamsY.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copyParamsY.srcDevice = srcDevicePtr;
copyParamsY.srcPitch = srcPitch;
copyParamsY.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParamsY.dstArray = cudaArray;
copyParamsY.WidthInBytes = m_width;
copyParamsY.Height = m_height;
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying Y plane...\n");
result = cuMemcpy2D(&copyParamsY);
if (result != CUDA_SUCCESS) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuMemcpy2D (Y plane) failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuMemcpy2D (Y plane)");
UnmapCUDADevicePtr();
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
return false;
}
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Y plane copied successfully\n");
// Copy UV plane from NVDEC to D3D11 NV12 texture (UV starts at row m_height)
CUDA_MEMCPY2D copyParamsUV = {};
copyParamsUV.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copyParamsUV.srcDevice = srcDevicePtr + (srcPitch * m_height);
copyParamsUV.srcPitch = srcPitch;
copyParamsUV.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParamsUV.dstArray = cudaArray;
copyParamsUV.dstY = m_height; // UV plane starts at row m_height in NV12 layout
copyParamsUV.WidthInBytes = m_width; // NV12 UV is interleaved, same width as Y
copyParamsUV.Height = m_height / 2;
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying UV plane...\n");
result = cuMemcpy2D(&copyParamsUV);
if (result != CUDA_SUCCESS) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuMemcpy2D (UV plane) failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuMemcpy2D (UV plane)");
UnmapCUDADevicePtr();
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
return false;
}
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] UV plane copied successfully\n");
// Unmap and cleanup
UnmapCUDADevicePtr();
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
// Fill output frame metadata
output_frame.width = m_width;
output_frame.height = m_height;
output_frame.color_space = ColorSpace::YUV420P;
output_frame.frame_index = m_framesDecoded;
output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
} else if (target_type == VAVCORE_SURFACE_D3D12_RESOURCE) {
// D3D12 resource path using D3D12SurfaceHandler
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 resource path\n");
@@ -1275,133 +1104,6 @@ VavCoreSurfaceType NVDECAV1Decoder::GetOptimalSurfaceType() const {
// CUDA surface helper methods implementation
bool NVDECAV1Decoder::RegisterD3DResourceWithCUDA(void* d3d_texture) {
if (!d3d_texture) {
OutputDebugStringA("[RegisterD3DResourceWithCUDA] ERROR: d3d_texture is null\n");
return false;
}
char debug_buf[256];
sprintf_s(debug_buf, "[RegisterD3DResourceWithCUDA] d3d_texture=%p\n", d3d_texture);
OutputDebugStringA(debug_buf);
try {
OutputDebugStringA("[RegisterD3DResourceWithCUDA] Calling cuGraphicsD3D11RegisterResource...\n");
CUresult result = cuGraphicsD3D11RegisterResource(&m_cudaGraphicsResource,
static_cast<ID3D11Resource*>(d3d_texture),
CU_GRAPHICS_REGISTER_FLAGS_NONE);
if (result != CUDA_SUCCESS) {
sprintf_s(debug_buf, "[RegisterD3DResourceWithCUDA] cuGraphicsD3D11RegisterResource failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuGraphicsD3D11RegisterResource");
return false;
}
sprintf_s(debug_buf, "[RegisterD3DResourceWithCUDA] Success, m_cudaGraphicsResource=%p\n", m_cudaGraphicsResource);
OutputDebugStringA(debug_buf);
return true;
}
catch (const std::exception& e) {
sprintf_s(debug_buf, "[RegisterD3DResourceWithCUDA] Exception: %s\n", e.what());
OutputDebugStringA(debug_buf);
LogError("Exception registering D3D resource with CUDA");
return false;
}
catch (...) {
OutputDebugStringA("[RegisterD3DResourceWithCUDA] Unknown exception\n");
LogError("Exception registering D3D resource with CUDA");
return false;
}
}
bool NVDECAV1Decoder::MapCUDADevicePtrFromD3D(CUdeviceptr& device_ptr, size_t& pitch) {
if (!m_cudaGraphicsResource) {
OutputDebugStringA("[MapCUDADevicePtrFromD3D] ERROR: m_cudaGraphicsResource is null\n");
return false;
}
char debug_buf[256];
sprintf_s(debug_buf, "[MapCUDADevicePtrFromD3D] m_cudaGraphicsResource=%p, m_stream=%p\n",
m_cudaGraphicsResource, m_stream);
OutputDebugStringA(debug_buf);
try {
// Map graphics resource
OutputDebugStringA("[MapCUDADevicePtrFromD3D] Calling cuGraphicsMapResources...\n");
CUresult result = cuGraphicsMapResources(1, &m_cudaGraphicsResource, m_stream);
if (result != CUDA_SUCCESS) {
sprintf_s(debug_buf, "[MapCUDADevicePtrFromD3D] cuGraphicsMapResources failed with code %d\n", result);
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuGraphicsMapResources");
return false;
}
OutputDebugStringA("[MapCUDADevicePtrFromD3D] cuGraphicsMapResources succeeded\n");
// D3D11 textures are mapped as CUDA arrays, not linear memory
OutputDebugStringA("[MapCUDADevicePtrFromD3D] D3D11 textures map to CUDA arrays, using cuGraphicsSubResourceGetMappedArray\n");
// Return 0 for device_ptr to indicate array-based access
device_ptr = 0;
pitch = m_width;
sprintf_s(debug_buf, "[MapCUDADevicePtrFromD3D] Mapped as CUDA array (resource=%p)\n",
m_cudaGraphicsResource);
OutputDebugStringA(debug_buf);
return true;
}
catch (...) {
OutputDebugStringA("[MapCUDADevicePtrFromD3D] Exception caught\n");
LogError("Exception mapping CUDA device pointer from D3D");
return false;
}
}
bool NVDECAV1Decoder::UnmapCUDADevicePtr() {
if (!m_cudaGraphicsResource) {
return false;
}
try {
CUresult result = cuGraphicsUnmapResources(1, &m_cudaGraphicsResource, m_stream);
if (result != CUDA_SUCCESS) {
LogCUDAError(result, "cuGraphicsUnmapResources");
return false;
}
// Unregister resource
result = cuGraphicsUnregisterResource(m_cudaGraphicsResource);
if (result != CUDA_SUCCESS) {
LogCUDAError(result, "cuGraphicsUnregisterResource");
return false;
}
m_cudaGraphicsResource = nullptr;
return true;
}
catch (...) {
LogError("Exception unmapping CUDA device pointer");
return false;
}
}
bool NVDECAV1Decoder::SetupCUDAD3D11Interop(void* d3d_device) {
if (!m_cuContext || !d3d_device) {
OutputDebugStringA("[SetupCUDAD3D11Interop] ERROR: CUDA context or D3D11 device is null\n");
return false;
}
OutputDebugStringA("[SetupCUDAD3D11Interop] Setting up CUDA-D3D11 interop...\n");
// Store the D3D11 device for later use
m_d3d11Device = d3d_device;
char debug_buf[256];
sprintf_s(debug_buf, "[SetupCUDAD3D11Interop] D3D11 device stored: %p\n", m_d3d11Device);
OutputDebugStringA(debug_buf);
return true;
}
bool NVDECAV1Decoder::SetupCUDAD3D12Interop(void* d3d_device) {
if (!m_cuContext || !d3d_device) {
OutputDebugStringA("[SetupCUDAD3D12Interop] ERROR: CUDA context or D3D12 device is null\n");

View File

@@ -11,20 +11,6 @@ namespace VavCore {
class D3D12SurfaceHandler;
}
//// Prevent TIMECODE conflicts by defining it before Windows headers
//#define WIN32_LEAN_AND_MEAN
//#define NOMINMAX
//
//// Prevent specific Windows header conflicts
//#define TIMECODE TIMECODE_WIN32
//#include <cuda.h>
//#include <nvcuvid.h>
//#include <cuviddec.h>
//#include <cudaD3D11.h>
//#include <nvcuvid.h>
//#include <dxgi.h>
//#undef TIMECODE
// Forward declaration for CUDA external memory type
typedef struct CUexternalMemory_st* cudaExternalMemory_t;
typedef struct CUexternalSemaphore_st* cudaExternalSemaphore_t;
@@ -112,9 +98,7 @@ private:
CUfunction m_kernel = nullptr;
// D3D-CUDA interop
void* m_d3d11Device = nullptr; // ID3D11Device*
void* m_d3d12Device = nullptr; // ID3D12Device*
CUgraphicsResource m_cudaGraphicsResource = nullptr;
VavCoreSurfaceType m_preferredSurfaceType = VAVCORE_SURFACE_CPU;
// D3D12-CUDA Asynchronous Synchronization
@@ -150,11 +134,7 @@ private:
void CleanupCUDA();
// CUDA surface helper methods
bool SetupCUDAD3D11Interop(void* d3d_device);
bool SetupCUDAD3D12Interop(void* d3d_device);
bool RegisterD3DResourceWithCUDA(void* d3d_texture);
bool MapCUDADevicePtrFromD3D(CUdeviceptr& device_ptr, size_t& pitch);
bool UnmapCUDADevicePtr();
// NVDEC callbacks
static int CUDAAPI HandleVideoSequence(void* user_data, CUVIDEOFORMAT* format);