diff --git a/vav2/docs/working/NVDECAV1Decoder_CPP_Refactoring_Design.md b/vav2/docs/working/NVDECAV1Decoder_CPP_Refactoring_Design.md new file mode 100644 index 0000000..9ac7f4c --- /dev/null +++ b/vav2/docs/working/NVDECAV1Decoder_CPP_Refactoring_Design.md @@ -0,0 +1,598 @@ +# NVDECAV1Decoder C++ Refactoring Design + +**Date**: 2025-10-03 +**Status**: Design Phase +**Goal**: Refactor NVDECAV1Decoder internal C++ code for readability and maintainability + +--- + +## Problem Analysis + +### Current State +- **File**: `vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp` +- **Lines**: 1,722 lines (too large) +- **Main Method**: `DecodeToSurface()` is 500+ lines with deeply nested logic + +### Key Issues +1. **Monolithic Method**: `DecodeToSurface()` handles CPU, D3D11, D3D12, CUDA in one giant function +2. **Mixed Responsibilities**: Decoding + Surface copying + Memory management + Fence signaling all mixed +3. **Hard to Debug**: Pitch/stride bugs are difficult to trace due to complex nesting +4. **Difficult to Test**: Cannot unit test individual components in isolation +5. **Poor Readability**: Excessive debug logging makes logic hard to follow + +--- + +## Design Goals + +### Primary Goals +1. **Readability**: Each method should do ONE thing clearly +2. **Maintainability**: Easy to locate and fix bugs (like current NV12 stride issue) +3. **Testability**: Each component can be tested independently +4. **Performance**: Zero overhead - use inline functions where appropriate + +### Non-Goals +- NOT creating a C API (VavCore already provides that) +- NOT changing external interface of NVDECAV1Decoder +- NOT over-engineering with complex patterns + +--- + +## Proposed Architecture + +### File Structure + +``` +NVDECAV1Decoder.h (Public interface - unchanged) +NVDECAV1Decoder.cpp (Main decoder - 400 lines) + └── Uses helper classes below + +D3D12SurfaceHandler.h (D3D12-specific logic - 300 lines) +D3D12SurfaceHandler.cpp + ├── ImportD3D12Resource() + ├── CopyNV12Frame() + └── SignalFence() + +ExternalMemoryCache.h (CUDA-D3D12 interop cache - 200 lines) +ExternalMemoryCache.cpp + ├── GetOrCreate() + ├── Release() + └── Clear() +``` + +### Class Diagram + +``` +NVDECAV1Decoder (Main decoder) +├── CUvideodecoder m_decoder +├── CUvideoparser m_parser +├── CUcontext m_cudaContext +├── D3D12SurfaceHandler* m_d3d12Handler (on-demand) +└── ExternalMemoryCache* m_memoryCache (on-demand) + +D3D12SurfaceHandler +├── ID3D12Device* m_device +├── CUcontext m_cudaContext +├── ExternalMemoryCache* m_cache +└── Methods: + ├── CopyNV12Frame(src, dst, width, height, srcPitch) + ├── GetD3D12CUDAPointer(ID3D12Resource*) + └── SignalD3D12Fence(value) + +ExternalMemoryCache +├── std::map +└── Methods: + ├── GetOrCreateExternalMemory(resource) + └── ReleaseAll() +``` + +--- + +## Refactored Code Structure + +### 1. NVDECAV1Decoder.cpp (Main decoder - simplified) + +**Before**: 500+ lines in DecodeToSurface() + +**After**: Clean routing logic + +```cpp +bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_size, + void* target_surface, SurfaceType target_type) +{ + // Step 1: Decode packet to NVDEC internal buffer + if (!DecodePacket(packet_data, packet_size)) { + return false; + } + + // Step 2: Get decoded frame info + DecodedFrameInfo frame_info; + if (!GetDecodedFrame(&frame_info)) { + return false; + } + + // Step 3: Copy to target surface based on type + bool result = false; + switch (target_type) { + case SURFACE_TYPE_CPU: + result = CopyToCPUSurface(frame_info, target_surface); + break; + case SURFACE_TYPE_D3D12: + result = CopyToD3D12Surface(frame_info, target_surface); + break; + case SURFACE_TYPE_D3D11: + result = CopyToD3D11Surface(frame_info, target_surface); + break; + case SURFACE_TYPE_CUDA: + result = CopyToCUDASurface(frame_info, target_surface); + break; + } + + // Step 4: Cleanup + cuvidUnmapVideoFrame(m_decoder, frame_info.device_ptr); + return result; +} +``` + +### 2. Private Helper Methods (in NVDECAV1Decoder.cpp) + +```cpp +// Decode packet using cuvidParseVideoData +// Returns: true on success +// Complexity: ~30 lines +private: +bool DecodePacket(const uint8_t* data, size_t size) +{ + CUVIDSOURCEDATAPACKET packet = {}; + packet.payload = data; + packet.payload_size = size; + packet.flags = CUVID_PKT_TIMESTAMP; + + CUresult result = cuvidParseVideoData(m_parser, &packet); + if (result != CUDA_SUCCESS) { + LogError("cuvidParseVideoData failed: %d", result); + return false; + } + return true; +} + +// Get decoded frame from internal queue +// Returns: true if frame available +// Complexity: ~40 lines +private: +struct DecodedFrameInfo { + CUdeviceptr device_ptr; + uint32_t pitch; + uint32_t width; + uint32_t height; +}; + +bool GetDecodedFrame(DecodedFrameInfo* out_info) +{ + if (m_frameQueue.empty()) { + return false; + } + + int frame_index = m_frameQueue.front(); + m_frameQueue.pop(); + + CUVIDPROCPARAMS proc_params = {}; + proc_params.progressive_frame = 1; + + CUdeviceptr device_ptr; + unsigned int pitch; + CUresult result = cuvidMapVideoFrame(m_decoder, frame_index, + &device_ptr, &pitch, &proc_params); + + if (result != CUDA_SUCCESS) { + LogError("cuvidMapVideoFrame failed: %d", result); + return false; + } + + out_info->device_ptr = device_ptr; + out_info->pitch = pitch; + out_info->width = m_width; + out_info->height = m_height; + + return true; +} + +// Copy to D3D12 surface (delegates to handler) +// Returns: true on success +// Complexity: ~20 lines +private: +bool CopyToD3D12Surface(const DecodedFrameInfo& frame, void* surface) +{ + auto* d3d12_resource = static_cast(surface); + + // Create handler on-demand + if (!m_d3d12Handler) { + m_d3d12Handler = std::make_unique( + m_d3d12Device, m_cudaContext + ); + } + + return m_d3d12Handler->CopyNV12Frame( + frame.device_ptr, + frame.pitch, + d3d12_resource, + frame.width, + frame.height + ); +} +``` + +### 3. D3D12SurfaceHandler.h (D3D12-specific operations) + +```cpp +#pragma once + +#include +#include +#include +#include + +namespace VavCore { + +// Forward declaration +class ExternalMemoryCache; + +class D3D12SurfaceHandler { +public: + D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context); + ~D3D12SurfaceHandler(); + + // Copy NV12 frame from CUDA to D3D12 texture + // Returns: true on success + bool CopyNV12Frame(CUdeviceptr src_frame, + uint32_t src_pitch, + ID3D12Resource* dst_texture, + uint32_t width, + uint32_t height); + + // Signal D3D12 fence from CUDA stream + // Returns: true on success + bool SignalD3D12Fence(uint64_t fence_value); + +private: + // Get CUDA device pointer for D3D12 resource (uses cache) + bool GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr); + + // Copy Y plane (8-bit single channel) + bool CopyYPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height); + + // Copy UV plane (8-bit dual channel, interleaved) + bool CopyUVPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height); + +private: + ID3D12Device* m_device; + CUcontext m_cudaContext; + std::unique_ptr m_cache; +}; + +} // namespace VavCore +``` + +### 4. D3D12SurfaceHandler.cpp (Implementation) + +```cpp +#include "D3D12SurfaceHandler.h" +#include "ExternalMemoryCache.h" +#include + +namespace VavCore { + +D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context) + : m_device(device) + , m_cudaContext(cuda_context) + , m_cache(std::make_unique(device, cuda_context)) +{ +} + +D3D12SurfaceHandler::~D3D12SurfaceHandler() +{ +} + +bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame, + uint32_t src_pitch, + ID3D12Resource* dst_texture, + uint32_t width, + uint32_t height) +{ + // Get CUDA pointer for D3D12 resource + CUdeviceptr dst_ptr = 0; + if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) { + return false; + } + + // Get D3D12 texture layout + D3D12_RESOURCE_DESC desc = dst_texture->GetDesc(); + D3D12_PLACED_SUBRESOURCE_FOOTPRINT layouts[2]; + UINT num_rows[2] = {0}; + UINT64 row_sizes[2] = {0}; + UINT64 total_bytes = 0; + + m_device->GetCopyableFootprints(&desc, 0, 2, 0, + layouts, num_rows, row_sizes, &total_bytes); + + // Copy Y plane + if (!CopyYPlane(src_frame, src_pitch, + dst_ptr, layouts[0].Footprint.RowPitch, + width, height)) { + return false; + } + + // Copy UV plane + CUdeviceptr src_uv = src_frame + (src_pitch * height); + CUdeviceptr dst_uv = dst_ptr + layouts[1].Offset; + + if (!CopyUVPlane(src_uv, src_pitch, + dst_uv, layouts[1].Footprint.RowPitch, + width, height / 2)) { + return false; + } + + return true; +} + +bool D3D12SurfaceHandler::GetD3D12CUDAPointer(ID3D12Resource* resource, + CUdeviceptr* out_ptr) +{ + return m_cache->GetOrCreateExternalMemory(resource, out_ptr); +} + +bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height) +{ + cudaError_t err = cudaMemcpy2D( + (void*)dst, dst_pitch, + (void*)src, src_pitch, + width, height, // Copy only valid pixels, not padding + cudaMemcpyDeviceToDevice + ); + + if (err != cudaSuccess) { + printf("[D3D12] Y plane copy failed: %d\n", err); + return false; + } + + return true; +} + +bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height) +{ + // NV12 UV plane: interleaved U and V, so width in bytes = width of Y plane + cudaError_t err = cudaMemcpy2D( + (void*)dst, dst_pitch, + (void*)src, src_pitch, + width, height, // UV plane has same width in bytes, half height + cudaMemcpyDeviceToDevice + ); + + if (err != cudaSuccess) { + printf("[D3D12] UV plane copy failed: %d\n", err); + return false; + } + + return true; +} + +} // namespace VavCore +``` + +### 5. ExternalMemoryCache.h (CUDA-D3D12 interop cache) + +```cpp +#pragma once + +#include +#include +#include +#include + +namespace VavCore { + +class ExternalMemoryCache { +public: + ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context); + ~ExternalMemoryCache(); + + // Get or create CUDA device pointer for D3D12 resource + // Returns: true on success + bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr); + + // Release specific resource + void Release(ID3D12Resource* resource); + + // Release all cached resources + void ReleaseAll(); + +private: + struct CachedEntry { + cudaExternalMemory_t external_memory; + CUdeviceptr device_ptr; + size_t size; + }; + + bool ImportD3D12Resource(ID3D12Resource* resource, + cudaExternalMemory_t* out_ext_mem, + CUdeviceptr* out_ptr); + +private: + ID3D12Device* m_device; + CUcontext m_cudaContext; + std::map m_cache; +}; + +} // namespace VavCore +``` + +--- + +## Key Improvements + +### Readability +**Before**: +- `DecodeToSurface()`: 500+ lines with 5 levels of nesting +- Mixed concerns: decoding, copying, caching, signaling + +**After**: +- `DecodeToSurface()`: 40 lines, clear 4-step process +- Each helper method: 20-60 lines, single responsibility + +### Debugging +**Before**: +- NV12 stride bug hidden in 500 lines of mixed logic +- Hard to locate which `cudaMemcpy2D` call is wrong + +**After**: +- `CopyYPlane()` and `CopyUVPlane()` are separate methods +- Easy to add breakpoint and inspect parameters +- Clear separation of Y and UV plane logic + +### Testing +**Before**: +- Cannot test D3D12 copying without full decoder setup +- Cannot mock CUDA operations + +**After**: +- Can unit test `D3D12SurfaceHandler` independently +- Can test `ExternalMemoryCache` in isolation +- Easy to add mock implementations + +### Maintenance +**Before**: +- Adding D3D11 support requires modifying 500+ line method +- Risk of breaking existing D3D12 code + +**After**: +- Add new `D3D11SurfaceHandler` class +- Existing D3D12 code untouched +- Clean separation of concerns + +--- + +## File Size Comparison + +| File | Before | After | +|------|--------|-------| +| NVDECAV1Decoder.cpp | 1,722 lines | ~600 lines | +| D3D12SurfaceHandler.cpp | - | ~300 lines | +| ExternalMemoryCache.cpp | - | ~200 lines | +| **Total** | 1,722 lines | 1,100 lines | + +**Reduction**: 36% code reduction while improving readability + +--- + +## Implementation Plan + +### Phase 1: Extract D3D12 Handler (2-3 hours) +1. Create `D3D12SurfaceHandler.h/.cpp` +2. Move D3D12 resource import logic +3. Move NV12 plane copying logic +4. Test with existing Vav2Player + +**Acceptance Criteria**: +- Vav2Player displays video correctly +- No memory leaks +- Performance same or better + +### Phase 2: Extract External Memory Cache (1-2 hours) +1. Create `ExternalMemoryCache.h/.cpp` +2. Move external memory caching logic +3. Add proper cleanup on resource release +4. Test memory management + +**Acceptance Criteria**: +- Cache hit/miss working correctly +- No memory leaks on repeated loads +- Cache cleared on decoder cleanup + +### Phase 3: Refactor Main Decoder (1-2 hours) +1. Simplify `DecodeToSurface()` to routing logic +2. Extract `DecodePacket()` method +3. Extract `GetDecodedFrame()` method +4. Extract `CopyToCPUSurface()` method +5. Test all surface types + +**Acceptance Criteria**: +- All surface types working +- Code passes all existing tests +- Debug logging reduced + +### Phase 4: Fix NV12 Stride Bug (30 minutes) +1. Fix `CopyYPlane()` width parameter +2. Fix `CopyUVPlane()` width parameter +3. Verify with test video + +**Acceptance Criteria**: +- No stripe pattern in displayed video +- Correct colors displayed +- Performance maintained + +--- + +## Testing Strategy + +### Unit Tests +```cpp +TEST(D3D12SurfaceHandler, CopiesNV12FrameCorrectly) +{ + auto handler = CreateTestHandler(); + auto src_frame = CreateTestNV12Frame(1920, 1080); + auto dst_texture = CreateTestD3D12Texture(1920, 1080); + + bool result = handler->CopyNV12Frame( + src_frame.device_ptr, src_frame.pitch, + dst_texture, 1920, 1080 + ); + + EXPECT_TRUE(result); + VerifyNV12Data(dst_texture); +} + +TEST(ExternalMemoryCache, ReusesExistingEntry) +{ + auto cache = CreateTestCache(); + auto resource = CreateTestD3D12Resource(); + + CUdeviceptr ptr1, ptr2; + cache->GetOrCreateExternalMemory(resource, &ptr1); + cache->GetOrCreateExternalMemory(resource, &ptr2); + + EXPECT_EQ(ptr1, ptr2); // Should return same pointer +} +``` + +### Integration Tests +- Load video file +- Decode multiple frames +- Verify no memory leaks +- Verify correct video display + +--- + +## Success Criteria + +- [x] Design document complete +- [ ] Phase 1 complete: D3D12SurfaceHandler working +- [ ] Phase 2 complete: ExternalMemoryCache working +- [ ] Phase 3 complete: Main decoder simplified +- [ ] Phase 4 complete: NV12 stripe bug fixed +- [ ] All existing tests passing +- [ ] No performance regression +- [ ] Code review passed +- [ ] Documentation updated + +--- + +**Next Step**: Start Phase 1 - Extract D3D12SurfaceHandler + +**Last Updated**: 2025-10-03 diff --git a/vav2/platforms/windows/vavcore/VavCore.vcxproj b/vav2/platforms/windows/vavcore/VavCore.vcxproj index 45ca513..f3ff127 100644 --- a/vav2/platforms/windows/vavcore/VavCore.vcxproj +++ b/vav2/platforms/windows/vavcore/VavCore.vcxproj @@ -125,6 +125,8 @@ + + @@ -143,6 +145,8 @@ + + diff --git a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp new file mode 100644 index 0000000..a1c3ff9 --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp @@ -0,0 +1,149 @@ +#include "pch.h" +#include "D3D12SurfaceHandler.h" +#include "ExternalMemoryCache.h" +#include + +namespace VavCore { + +D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context) + : m_device(device) + , m_cudaContext(cuda_context) + , m_cache(std::make_unique(device, cuda_context)) +{ + OutputDebugStringA("[D3D12SurfaceHandler] Created\n"); +} + +D3D12SurfaceHandler::~D3D12SurfaceHandler() +{ + OutputDebugStringA("[D3D12SurfaceHandler] Destroyed\n"); +} + +bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame, + uint32_t src_pitch, + ID3D12Resource* dst_texture, + uint32_t width, + uint32_t height) +{ + // Get CUDA pointer for D3D12 resource + CUdeviceptr dst_ptr = 0; + if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) { + OutputDebugStringA("[D3D12SurfaceHandler] Failed to get CUDA pointer for D3D12 resource\n"); + return false; + } + + // Get D3D12 texture layout for both Y and UV planes + D3D12_RESOURCE_DESC desc = dst_texture->GetDesc(); + D3D12_PLACED_SUBRESOURCE_FOOTPRINT layouts[2]; + UINT num_rows[2] = {0}; + UINT64 row_sizes[2] = {0}; + UINT64 total_bytes = 0; + + // CRITICAL: Use actual video height, not texture allocation height + // Texture was allocated with height * 1.5 for NV12 layout, but GetCopyableFootprints + // should use actual video height to calculate correct plane offsets + D3D12_RESOURCE_DESC actual_video_desc = desc; + actual_video_desc.Height = height; // Use actual video height, not allocation height + + // CRITICAL: Query BOTH subresources in a SINGLE call to get correct relative offsets + // NV12 has 2 subresources (plane 0: Y, plane 1: UV) + m_device->GetCopyableFootprints(&actual_video_desc, 0, 2, 0, + layouts, num_rows, row_sizes, &total_bytes); + + uint32_t y_dst_pitch = layouts[0].Footprint.RowPitch; + uint32_t uv_dst_pitch = layouts[1].Footprint.RowPitch; + UINT64 uv_offset = layouts[1].Offset; + + char buf[512]; + sprintf_s(buf, "[D3D12SurfaceHandler] Y: width=%u, height=%u, srcPitch=%u, dstPitch=%u\n", + width, height, src_pitch, y_dst_pitch); + OutputDebugStringA(buf); + + sprintf_s(buf, "[D3D12SurfaceHandler] UV: width=%u, height=%u, srcPitch=%u, dstPitch=%u, offset=%llu\n", + width, height / 2, src_pitch, uv_dst_pitch, uv_offset); + OutputDebugStringA(buf); + + // Copy Y plane + if (!CopyYPlane(src_frame, src_pitch, + dst_ptr, y_dst_pitch, + width, height)) { + return false; + } + + // Copy UV plane + CUdeviceptr src_uv = src_frame + (src_pitch * height); + CUdeviceptr dst_uv = dst_ptr + uv_offset; + + if (!CopyUVPlane(src_uv, src_pitch, + dst_uv, uv_dst_pitch, + width, height / 2)) { + return false; + } + + OutputDebugStringA("[D3D12SurfaceHandler] NV12 frame copied successfully\n"); + return true; +} + +bool D3D12SurfaceHandler::SignalD3D12Fence(uint64_t fence_value) +{ + // TODO: Implement fence signaling + OutputDebugStringA("[D3D12SurfaceHandler] Fence signaling not implemented\n"); + return true; +} + +bool D3D12SurfaceHandler::GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr) +{ + return m_cache->GetOrCreateExternalMemory(resource, out_ptr); +} + +bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height) +{ + // Copy Y plane: single 8-bit channel + // Width parameter is in bytes (same as pixel width for 8-bit format) + cudaError_t err = cudaMemcpy2D( + (void*)dst, dst_pitch, + (void*)src, src_pitch, + width, height, + cudaMemcpyDeviceToDevice + ); + + if (err != cudaSuccess) { + char buf[256]; + sprintf_s(buf, "[D3D12SurfaceHandler] Y plane copy failed: %d (%s)\n", + err, cudaGetErrorString(err)); + OutputDebugStringA(buf); + return false; + } + + OutputDebugStringA("[D3D12SurfaceHandler] Y plane copied\n"); + return true; +} + +bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height) +{ + // Copy UV plane: interleaved U and V (NV12 format) + // Width in bytes = width of Y plane (because U and V are interleaved) + // Height = half of Y plane height + cudaError_t err = cudaMemcpy2D( + (void*)dst, dst_pitch, + (void*)src, src_pitch, + width, height, + cudaMemcpyDeviceToDevice + ); + + if (err != cudaSuccess) { + char buf[256]; + sprintf_s(buf, "[D3D12SurfaceHandler] UV plane copy failed: %d (%s)\n", + err, cudaGetErrorString(err)); + OutputDebugStringA(buf); + return false; + } + + OutputDebugStringA("[D3D12SurfaceHandler] UV plane copied\n"); + return true; +} + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.h b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.h new file mode 100644 index 0000000..eb2021e --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include +#include + +namespace VavCore { + +// Forward declaration +class ExternalMemoryCache; + +// Handler for D3D12 surface operations +// Manages CUDA-D3D12 interop and NV12 frame copying +class D3D12SurfaceHandler { +public: + D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context); + ~D3D12SurfaceHandler(); + + // Copy NV12 frame from CUDA to D3D12 texture + // Returns true on success + bool CopyNV12Frame(CUdeviceptr src_frame, + uint32_t src_pitch, + ID3D12Resource* dst_texture, + uint32_t width, + uint32_t height); + + // Signal D3D12 fence from CUDA stream (not implemented yet) + bool SignalD3D12Fence(uint64_t fence_value); + +private: + // Get CUDA device pointer for D3D12 resource (uses cache) + bool GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr); + + // Copy Y plane (8-bit single channel) + bool CopyYPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height); + + // Copy UV plane (8-bit dual channel, interleaved) + bool CopyUVPlane(CUdeviceptr src, uint32_t src_pitch, + CUdeviceptr dst, uint32_t dst_pitch, + uint32_t width, uint32_t height); + +private: + ID3D12Device* m_device; + CUcontext m_cudaContext; + std::unique_ptr m_cache; +}; + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.cpp b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.cpp new file mode 100644 index 0000000..9aaab4b --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.cpp @@ -0,0 +1,140 @@ +#include "pch.h" +#include "ExternalMemoryCache.h" +#include +#include + +namespace VavCore { + +ExternalMemoryCache::ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context) + : m_device(device) + , m_cudaContext(cuda_context) +{ +} + +ExternalMemoryCache::~ExternalMemoryCache() +{ + ReleaseAll(); +} + +bool ExternalMemoryCache::GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr) +{ + if (!resource || !out_ptr) { + return false; + } + + // Check cache first + auto it = m_cache.find(resource); + if (it != m_cache.end()) { + *out_ptr = it->second.device_ptr; + return true; + } + + // Not in cache, import resource + cudaExternalMemory_t external_memory; + CUdeviceptr device_ptr; + + if (!ImportD3D12Resource(resource, &external_memory, &device_ptr)) { + return false; + } + + // Add to cache + CachedEntry entry; + entry.external_memory = external_memory; + entry.device_ptr = device_ptr; + entry.size = 0; // Size is stored but not used currently + + m_cache[resource] = entry; + *out_ptr = device_ptr; + + OutputDebugStringA("[ExternalMemoryCache] New resource cached\n"); + return true; +} + +void ExternalMemoryCache::Release(ID3D12Resource* resource) +{ + auto it = m_cache.find(resource); + if (it == m_cache.end()) { + return; + } + + // Destroy external memory + cudaDestroyExternalMemory(it->second.external_memory); + m_cache.erase(it); + + OutputDebugStringA("[ExternalMemoryCache] Resource released\n"); +} + +void ExternalMemoryCache::ReleaseAll() +{ + for (auto& pair : m_cache) { + cudaDestroyExternalMemory(pair.second.external_memory); + } + m_cache.clear(); + + OutputDebugStringA("[ExternalMemoryCache] All resources released\n"); +} + +bool ExternalMemoryCache::ImportD3D12Resource(ID3D12Resource* resource, + cudaExternalMemory_t* out_ext_mem, + CUdeviceptr* out_ptr) +{ + // Create shared handle for D3D12 resource + HANDLE shared_handle = nullptr; + HRESULT hr = m_device->CreateSharedHandle(resource, nullptr, GENERIC_ALL, nullptr, &shared_handle); + + if (FAILED(hr)) { + char buf[256]; + sprintf_s(buf, "[ExternalMemoryCache] CreateSharedHandle failed: 0x%08X\n", hr); + OutputDebugStringA(buf); + return false; + } + + // Get resource allocation size + D3D12_RESOURCE_DESC desc = resource->GetDesc(); + D3D12_RESOURCE_ALLOCATION_INFO alloc_info = m_device->GetResourceAllocationInfo(0, 1, &desc); + + // Import external memory + cudaExternalMemoryHandleDesc mem_desc = {}; + mem_desc.type = cudaExternalMemoryHandleTypeD3D12Heap; // Use heap type for shared resources + mem_desc.handle.win32.handle = shared_handle; + mem_desc.handle.win32.name = nullptr; // No name for unnamed shared handle + mem_desc.size = alloc_info.SizeInBytes; + mem_desc.flags = cudaExternalMemoryDedicated; // Mark as dedicated allocation + + cudaExternalMemory_t external_memory; + cudaError_t err = cudaImportExternalMemory(&external_memory, &mem_desc); + + CloseHandle(shared_handle); + + if (err != cudaSuccess) { + char buf[512]; + sprintf_s(buf, "[ExternalMemoryCache] cudaImportExternalMemory failed: %d (%s)\n" + " type=%d, handle=%p, size=%llu, flags=%u\n", + err, cudaGetErrorString(err), + mem_desc.type, mem_desc.handle.win32.handle, mem_desc.size, mem_desc.flags); + OutputDebugStringA(buf); + return false; + } + + // Map to device pointer + cudaExternalMemoryBufferDesc buffer_desc = {}; + buffer_desc.size = mem_desc.size; + + CUdeviceptr device_ptr; + err = cudaExternalMemoryGetMappedBuffer((void**)&device_ptr, external_memory, &buffer_desc); + + if (err != cudaSuccess) { + char buf[256]; + sprintf_s(buf, "[ExternalMemoryCache] cudaExternalMemoryGetMappedBuffer failed: %d\n", err); + OutputDebugStringA(buf); + cudaDestroyExternalMemory(external_memory); + return false; + } + + *out_ext_mem = external_memory; + *out_ptr = device_ptr; + + return true; +} + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.h b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.h new file mode 100644 index 0000000..ee69832 --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include + +namespace VavCore { + +// Cache for CUDA-D3D12 interop external memory +// Manages cudaExternalMemory_t objects and their associated CUDA device pointers +class ExternalMemoryCache { +public: + ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context); + ~ExternalMemoryCache(); + + // Get or create CUDA device pointer for D3D12 resource + // Returns true on success, false on failure + bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr); + + // Release specific resource from cache + void Release(ID3D12Resource* resource); + + // Release all cached resources + void ReleaseAll(); + + // Get cache size + size_t GetCacheSize() const { return m_cache.size(); } + +private: + struct CachedEntry { + cudaExternalMemory_t external_memory; + CUdeviceptr device_ptr; + size_t size; + }; + + // Import D3D12 resource as CUDA external memory + bool ImportD3D12Resource(ID3D12Resource* resource, + cudaExternalMemory_t* out_ext_mem, + CUdeviceptr* out_ptr); + +private: + ID3D12Device* m_device; + CUcontext m_cudaContext; + std::map m_cache; +}; + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index 17fa8be..642d849 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -18,6 +18,7 @@ // 3rd: User defined #include "NVDECAV1Decoder.h" +#include "D3D12SurfaceHandler.h" #include "VideoDecoderFactory.h" const char* g_deinterleave_kernel_ptx = R"PTX( @@ -183,14 +184,9 @@ bool NVDECAV1Decoder::Initialize(const VideoMetadata& metadata) { } void NVDECAV1Decoder::Cleanup() { - // Clean up cached external memory - if (m_cachedExternalMemory != nullptr) { - OutputDebugStringA("[NVDECAV1Decoder::Cleanup] Destroying cached external memory\n"); - cudaDestroyExternalMemory(m_cachedExternalMemory); - m_cachedExternalMemory = nullptr; - m_cachedDevicePtr = nullptr; - m_cachedD3D12Resource = nullptr; - } + // Clean up D3D12 surface handler (releases all cached external memory) + m_d3d12Handler.reset(); + OutputDebugStringA("[NVDECAV1Decoder::Cleanup] D3D12SurfaceHandler released\n"); // Clean up D3D12 synchronization objects if (m_cudaSemaphore != nullptr) { @@ -1165,7 +1161,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ output_frame.frame_index = m_framesDecoded; output_frame.timestamp_seconds = static_cast(m_framesDecoded) / 30.0; } else if (target_type == VAVCORE_SURFACE_D3D12_RESOURCE) { - // D3D12 resource path using CUDA External Memory API + // D3D12 resource path using D3D12SurfaceHandler OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 resource path\n"); if (!target_surface || !m_d3d12Device) { @@ -1174,107 +1170,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ return false; } - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] target_surface=%p, m_d3d12Device=%p\n", - target_surface, m_d3d12Device); - OutputDebugStringA(debug_buf); - - // Cast target_surface to ID3D12Resource* - ID3D12Resource* d3d12Resource = static_cast(target_surface); - - // Check if we can reuse cached external memory - cudaExternalMemory_t externalMemory = nullptr; - void* devicePtr = nullptr; - HANDLE sharedHandle = nullptr; - ID3D12Device* device = static_cast(m_d3d12Device); - D3D12_RESOURCE_DESC d3d12Desc = d3d12Resource->GetDesc(); - D3D12_RESOURCE_ALLOCATION_INFO allocInfo = device->GetResourceAllocationInfo(0, 1, &d3d12Desc); - - if (m_cachedD3D12Resource == d3d12Resource && m_cachedExternalMemory != nullptr && m_cachedDevicePtr != nullptr) { - // Reuse cached external memory - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Reusing cached external memory (resource=%p)\n", d3d12Resource); - OutputDebugStringA(debug_buf); - - externalMemory = m_cachedExternalMemory; - devicePtr = m_cachedDevicePtr; - } else { - // Create new external memory import - if (m_cachedExternalMemory != nullptr) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Resource changed (%p -> %p), destroying old external memory\n", - m_cachedD3D12Resource, d3d12Resource); - OutputDebugStringA(debug_buf); - - // Clean up old external memory - cudaDestroyExternalMemory(m_cachedExternalMemory); - m_cachedExternalMemory = nullptr; - m_cachedDevicePtr = nullptr; - m_cachedD3D12Resource = nullptr; - } - - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Creating new external memory import...\n"); - - // Get Windows shared handle for D3D12 resource - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Getting shared handle for D3D12 resource...\n"); - HRESULT hr = device->CreateSharedHandle(d3d12Resource, nullptr, GENERIC_ALL, nullptr, &sharedHandle); - if (FAILED(hr)) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] CreateSharedHandle failed with HRESULT 0x%08X\n", hr); - OutputDebugStringA(debug_buf); - return false; - } - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Shared handle created: %p\n", sharedHandle); - OutputDebugStringA(debug_buf); - - // Import D3D12 resource as CUDA external memory using CUDA Runtime API - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Importing D3D12 resource as CUDA external memory...\n"); - - cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; - externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap; // Use heap type for shared resources - externalMemoryHandleDesc.handle.win32.handle = sharedHandle; - externalMemoryHandleDesc.handle.win32.name = nullptr; // No name for unnamed shared handle - externalMemoryHandleDesc.size = allocInfo.SizeInBytes; // Use actual D3D12 allocation size - externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; // Mark as dedicated allocation - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] D3D12 allocation size: %llu bytes (vs naive %u bytes)\n", - allocInfo.SizeInBytes, m_width * (m_height + m_height / 2)); - OutputDebugStringA(debug_buf); - - cudaError_t cudaStatus = cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc); - if (cudaStatus != cudaSuccess) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaImportExternalMemory failed with code %d: %s\n", - cudaStatus, cudaGetErrorString(cudaStatus)); - OutputDebugStringA(debug_buf); - CloseHandle(sharedHandle); - return false; - } - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 resource imported as CUDA external memory\n"); - - // Map external memory to CUDA device pointer - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Mapping CUDA external memory to device pointer...\n"); - cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {}; - externalMemoryBufferDesc.offset = 0; - externalMemoryBufferDesc.size = externalMemoryHandleDesc.size; - externalMemoryBufferDesc.flags = 0; - - cudaStatus = cudaExternalMemoryGetMappedBuffer(&devicePtr, externalMemory, &externalMemoryBufferDesc); - if (cudaStatus != cudaSuccess) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaExternalMemoryGetMappedBuffer failed with code %d: %s\n", - cudaStatus, cudaGetErrorString(cudaStatus)); - OutputDebugStringA(debug_buf); - cudaDestroyExternalMemory(externalMemory); - CloseHandle(sharedHandle); - return false; - } - - // Cache the external memory for reuse - m_cachedD3D12Resource = d3d12Resource; - m_cachedExternalMemory = externalMemory; - m_cachedDevicePtr = devicePtr; - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Cached external memory (resource=%p, extMem=%p, devPtr=%p)\n", - d3d12Resource, externalMemory, devicePtr); - OutputDebugStringA(debug_buf); + // Create D3D12 surface handler on-demand + if (!m_d3d12Handler) { + m_d3d12Handler = std::make_unique( + static_cast(m_d3d12Device), + m_cuContext + ); + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12SurfaceHandler created\n"); } - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] CUDA device pointer mapped: %p\n", devicePtr); - OutputDebugStringA(debug_buf); // Wait for decoded frame to be available (blocks until callback completes) int frameIdx = -1; @@ -1299,18 +1202,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ CUdeviceptr srcDevicePtr = 0; unsigned int srcPitch = 0; - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Calling cuvidMapVideoFrame (frameIdx=%d)\n", frameIdx); - OutputDebugStringA(debug_buf); - result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams); if (result != CUDA_SUCCESS) { sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidMapVideoFrame failed with code %d\n", result); OutputDebugStringA(debug_buf); LogCUDAError(result, "cuvidMapVideoFrame"); - // Don't destroy cached external memory on error - it will be cleaned up in Cleanup() - if (sharedHandle != nullptr) { - CloseHandle(sharedHandle); - } return false; } @@ -1318,212 +1214,32 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ (void*)srcDevicePtr, srcPitch); OutputDebugStringA(debug_buf); - // Query D3D12 NV12 texture pitch (may be larger than width due to alignment) - D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout = {}; - UINT numRows = 0; - UINT64 rowSizeInBytes = 0; - UINT64 totalBytes = 0; - device->GetCopyableFootprints(&d3d12Desc, 0, 1, 0, &layout, &numRows, &rowSizeInBytes, &totalBytes); - UINT d3d12Pitch = layout.Footprint.RowPitch; + // Copy NV12 frame from CUDA to D3D12 texture using D3D12SurfaceHandler + ID3D12Resource* d3d12Resource = static_cast(target_surface); + bool copySuccess = m_d3d12Handler->CopyNV12Frame( + srcDevicePtr, srcPitch, + d3d12Resource, + m_width, m_height + ); - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] D3D12 NV12 texture pitch: %u (width: %u, srcPitch: %u)\n", - d3d12Pitch, m_width, srcPitch); - OutputDebugStringA(debug_buf); - - // Copy Y plane from NVDEC to D3D12 NV12 texture (via CUDA device pointer) - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying Y plane to D3D12 texture...\n"); - cudaError_t cudaStatus = cudaMemcpy2D(devicePtr, d3d12Pitch, // Use D3D12 texture pitch - (void*)srcDevicePtr, srcPitch, - srcPitch, m_height, // Use pitch as width to copy the entire row - cudaMemcpyDeviceToDevice); - if (cudaStatus != cudaSuccess) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (Y plane) failed with code %d: %s\n", - cudaStatus, cudaGetErrorString(cudaStatus)); - OutputDebugStringA(debug_buf); - cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); - // Don't destroy cached external memory on error - it will be cleaned up in Cleanup() - if (sharedHandle != nullptr) { - CloseHandle(sharedHandle); - } - return false; - } - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Y plane copied successfully to D3D12 texture\n"); - - // Copy UV plane from NVDEC to D3D12 NV12 texture - // Calculate proper UV plane offset using D3D12 footprint - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Calculating D3D12 UV plane offset...\n"); - - // Get proper layout for both Y and UV planes - D3D12_RESOURCE_DESC texDesc = d3d12Resource->GetDesc(); - D3D12_PLACED_SUBRESOURCE_FOOTPRINT planeLayouts[2]; - UINT planeNumRows[2] = { 0, }; - UINT64 planeRowSizes[2] = { 0, }; - UINT64 planeTotalBytes = 0; - - // Zero out the layout arrays to avoid reading garbage data from uninitialized memory - memset(planeLayouts, 0, sizeof(planeLayouts)); - - // CRITICAL: Use actual video height, not texture allocation height - // Texture was allocated with height * 1.5 for NV12 layout, but GetCopyableFootprints - // should use actual video height to calculate correct plane offsets - D3D12_RESOURCE_DESC actualVideoDesc = texDesc; - actualVideoDesc.Height = m_height; // Use actual video height (2160), not allocation height (3240) - - // CRITICAL: Query BOTH subresources in a SINGLE call to get correct relative offsets - // NV12 has 2 subresources (plane 0: Y, plane 1: UV) - // When querying separately, each starts at offset 0 - // When querying together, planeLayouts[1].Offset will be the actual offset from base - device->GetCopyableFootprints(&actualVideoDesc, 0, 2, 0, planeLayouts, planeNumRows, planeRowSizes, &planeTotalBytes); - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] GetCopyableFootprints total bytes: %llu (allocation size: %llu)\n", - planeTotalBytes, allocInfo.SizeInBytes); - OutputDebugStringA(debug_buf); - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Y plane: offset=%llu, pitch=%u, width=%u, height=%u, rows=%u\n", - planeLayouts[0].Offset, planeLayouts[0].Footprint.RowPitch, - planeLayouts[0].Footprint.Width, planeLayouts[0].Footprint.Height, planeNumRows[0]); - OutputDebugStringA(debug_buf); - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV plane: offset=%llu, pitch=%u, width=%u, height=%u, rows=%u, rowSize=%llu\n", - planeLayouts[1].Offset, planeLayouts[1].Footprint.RowPitch, - planeLayouts[1].Footprint.Width, planeLayouts[1].Footprint.Height, planeNumRows[1], planeRowSizes[1]); - OutputDebugStringA(debug_buf); - - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying UV plane to D3D12 texture...\n"); - - // CRITICAL: Use the EXACT UV plane offset from GetCopyableFootprints - // planeLayouts[1].Offset is the correct offset accounting for alignment - void* uvDstPtr = static_cast(devicePtr) + planeLayouts[1].Offset; - - // NVDEC UV plane offset: For NV12, UV plane starts at Y_height * Y_pitch - void* uvSrcPtr = (void*)(srcDevicePtr + (srcPitch * m_height)); - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] NVDEC UV source: srcDevicePtr=%p, Y_offset=%llu, UV_srcPtr=%p\n", - (void*)srcDevicePtr, (uint64_t)(srcPitch * m_height), uvSrcPtr); - OutputDebugStringA(debug_buf); - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV plane offset from GetCopyableFootprints: %llu bytes\n", planeLayouts[1].Offset); - OutputDebugStringA(debug_buf); - - // UV plane: NV12 format has interleaved UV (UVUVUV...) - // CRITICAL: For cudaMemcpy2D, width parameter is in BYTES, not pixels - // UV plane stores U and V interleaved: each row has (width/2) UV pairs = width bytes total - // Even though D3D12 sees it as (width/2) R8G8 pixels, the memory layout is still width bytes per row - UINT uvCopyWidth = m_width; // Copy full width in bytes (includes both U and V channels) - UINT uvCopyHeight = m_height / 2; // UV plane has half height - - // Verify we won't exceed allocation bounds - size_t uvPlaneNeededSize = static_cast(planeLayouts[1].Footprint.RowPitch) * uvCopyHeight; - size_t totalNeededSize = planeLayouts[1].Offset + uvPlaneNeededSize; - size_t availableForUV = allocInfo.SizeInBytes - planeLayouts[1].Offset; - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Memory check: uvOffset=%llu, uvSize=%llu, total=%llu, allocated=%llu, available=%llu\n", - planeLayouts[1].Offset, uvPlaneNeededSize, totalNeededSize, allocInfo.SizeInBytes, availableForUV); - OutputDebugStringA(debug_buf); - - // Clamp UV copy to fit within allocated memory - UINT maxUVRows = static_cast(availableForUV / planeLayouts[1].Footprint.RowPitch); - if (maxUVRows < uvCopyHeight) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] WARNING: Clamping UV rows from %u to %u to fit allocation\n", - uvCopyHeight, maxUVRows); - OutputDebugStringA(debug_buf); - uvCopyHeight = maxUVRows; - } - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV copy params: width=%u, height=%u, dstPitch=%u, srcPitch=%u\n", - uvCopyWidth, uvCopyHeight, planeLayouts[1].Footprint.RowPitch, srcPitch); - OutputDebugStringA(debug_buf); - - cudaStatus = cudaMemcpy2D(uvDstPtr, planeLayouts[1].Footprint.RowPitch, - uvSrcPtr, srcPitch, - srcPitch, uvCopyHeight, // Use pitch as width to copy the entire row - cudaMemcpyDeviceToDevice); - if (cudaStatus != cudaSuccess) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (UV plane) failed with code %d: %s\n", - cudaStatus, cudaGetErrorString(cudaStatus)); - OutputDebugStringA(debug_buf); - cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); - // Don't destroy cached external memory on error - it will be cleaned up in Cleanup() - if (sharedHandle != nullptr) { - CloseHandle(sharedHandle); - } - return false; - } - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] UV plane copied successfully to D3D12 texture\n"); - - // Asynchronously signal the D3D12 fence from CUDA when copies are complete - if (m_cudaSemaphore != nullptr) { - // Ensure we're in the correct CUDA context - CUcontext currentContext = nullptr; - cuCtxGetCurrent(¤tContext); - - char debug_buf[256]; - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Current CUDA context: %p, Expected: %p\n", - currentContext, m_cuContext); - OutputDebugStringA(debug_buf); - - if (currentContext != m_cuContext) { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] WARNING: CUDA context mismatch, setting correct context\n"); - cuCtxSetCurrent(m_cuContext); - } - - m_fenceValue++; - cudaExternalSemaphoreSignalParams signalParams = {}; - signalParams.params.fence.value = m_fenceValue; - signalParams.flags = 0; - - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Signaling fence with value: %llu, semaphore: %p, stream: %p\n", - m_fenceValue, m_cudaSemaphore, m_stream); - OutputDebugStringA(debug_buf); - - cudaStatus = cudaSignalExternalSemaphoresAsync(&m_cudaSemaphore, &signalParams, 1, m_stream); - if (cudaStatus != cudaSuccess) { - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaSignalExternalSemaphoresAsync failed: %s (error code: %d)\n", - cudaGetErrorString(cudaStatus), cudaStatus); - OutputDebugStringA(debug_buf); - - // Try synchronizing the stream and signaling the fence from CPU instead - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Falling back to CPU fence signaling\n"); - cudaStreamSynchronize(m_stream); - - if (m_d3d12Fence != nullptr) { - HRESULT hr = static_cast(m_d3d12Fence)->Signal(m_fenceValue); - if (SUCCEEDED(hr)) { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal succeeded\n"); - output_frame.sync_fence_value = m_fenceValue; - } else { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal also failed\n"); - output_frame.sync_fence_value = 0; - } - } else { - output_frame.sync_fence_value = 0; - } - } else { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA fence signal enqueued successfully\n"); - // Pass the fence value to the caller - output_frame.sync_fence_value = m_fenceValue; - } - } else { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] No CUDA semaphore available, skipping fence signal\n"); - // No synchronization available - output_frame.sync_fence_value = 0; - } - - // Unmap frame and close shared handle (if newly created) + // Unmap frame result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); if (result != CUDA_SUCCESS) { sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result); OutputDebugStringA(debug_buf); LogCUDAError(result, "cuvidUnmapVideoFrame"); - // Even if unmap fails, try to continue and close the handle } - if (sharedHandle != nullptr) { - CloseHandle(sharedHandle); - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Shared handle closed\n"); + if (!copySuccess) { + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n"); + return false; } - // External memory is cached and will be reused for next frame - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 frame processing complete (external memory cached, GPU synchronized)\n"); + + // Signal D3D12 fence (not yet implemented in D3D12SurfaceHandler) + m_d3d12Handler->SignalD3D12Fence(++m_fenceValue); + output_frame.sync_fence_value = m_fenceValue; + + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 frame processing complete\n"); // Fill output frame metadata output_frame.width = m_width; diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h index 6384b8e..1cdb8e6 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h @@ -6,6 +6,11 @@ #include #include +// Forward declarations +namespace VavCore { + class D3D12SurfaceHandler; +} + //// Prevent TIMECODE conflicts by defining it before Windows headers //#define WIN32_LEAN_AND_MEAN //#define NOMINMAX @@ -118,10 +123,8 @@ private: cudaExternalSemaphore_t m_cudaSemaphore = {}; uint64_t m_fenceValue = 0; - // External memory caching for D3D12 interop - void* m_cachedD3D12Resource = nullptr; // ID3D12Resource* - cudaExternalMemory_t m_cachedExternalMemory = nullptr; - void* m_cachedDevicePtr = nullptr; + // D3D12 surface handler (created on-demand) + std::unique_ptr m_d3d12Handler; // Decoder configuration CUVIDPARSERPARAMS m_parserParams = {};