Refactoring NVDEC decoder
This commit is contained in:
598
vav2/docs/working/NVDECAV1Decoder_CPP_Refactoring_Design.md
Normal file
598
vav2/docs/working/NVDECAV1Decoder_CPP_Refactoring_Design.md
Normal file
@@ -0,0 +1,598 @@
|
||||
# NVDECAV1Decoder C++ Refactoring Design
|
||||
|
||||
**Date**: 2025-10-03
|
||||
**Status**: Design Phase
|
||||
**Goal**: Refactor NVDECAV1Decoder internal C++ code for readability and maintainability
|
||||
|
||||
---
|
||||
|
||||
## Problem Analysis
|
||||
|
||||
### Current State
|
||||
- **File**: `vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp`
|
||||
- **Lines**: 1,722 lines (too large)
|
||||
- **Main Method**: `DecodeToSurface()` is 500+ lines with deeply nested logic
|
||||
|
||||
### Key Issues
|
||||
1. **Monolithic Method**: `DecodeToSurface()` handles CPU, D3D11, D3D12, CUDA in one giant function
|
||||
2. **Mixed Responsibilities**: Decoding + Surface copying + Memory management + Fence signaling all mixed
|
||||
3. **Hard to Debug**: Pitch/stride bugs are difficult to trace due to complex nesting
|
||||
4. **Difficult to Test**: Cannot unit test individual components in isolation
|
||||
5. **Poor Readability**: Excessive debug logging makes logic hard to follow
|
||||
|
||||
---
|
||||
|
||||
## Design Goals
|
||||
|
||||
### Primary Goals
|
||||
1. **Readability**: Each method should do ONE thing clearly
|
||||
2. **Maintainability**: Easy to locate and fix bugs (like current NV12 stride issue)
|
||||
3. **Testability**: Each component can be tested independently
|
||||
4. **Performance**: Zero overhead - use inline functions where appropriate
|
||||
|
||||
### Non-Goals
|
||||
- NOT creating a C API (VavCore already provides that)
|
||||
- NOT changing external interface of NVDECAV1Decoder
|
||||
- NOT over-engineering with complex patterns
|
||||
|
||||
---
|
||||
|
||||
## Proposed Architecture
|
||||
|
||||
### File Structure
|
||||
|
||||
```
|
||||
NVDECAV1Decoder.h (Public interface - unchanged)
|
||||
NVDECAV1Decoder.cpp (Main decoder - 400 lines)
|
||||
└── Uses helper classes below
|
||||
|
||||
D3D12SurfaceHandler.h (D3D12-specific logic - 300 lines)
|
||||
D3D12SurfaceHandler.cpp
|
||||
├── ImportD3D12Resource()
|
||||
├── CopyNV12Frame()
|
||||
└── SignalFence()
|
||||
|
||||
ExternalMemoryCache.h (CUDA-D3D12 interop cache - 200 lines)
|
||||
ExternalMemoryCache.cpp
|
||||
├── GetOrCreate()
|
||||
├── Release()
|
||||
└── Clear()
|
||||
```
|
||||
|
||||
### Class Diagram
|
||||
|
||||
```
|
||||
NVDECAV1Decoder (Main decoder)
|
||||
├── CUvideodecoder m_decoder
|
||||
├── CUvideoparser m_parser
|
||||
├── CUcontext m_cudaContext
|
||||
├── D3D12SurfaceHandler* m_d3d12Handler (on-demand)
|
||||
└── ExternalMemoryCache* m_memoryCache (on-demand)
|
||||
|
||||
D3D12SurfaceHandler
|
||||
├── ID3D12Device* m_device
|
||||
├── CUcontext m_cudaContext
|
||||
├── ExternalMemoryCache* m_cache
|
||||
└── Methods:
|
||||
├── CopyNV12Frame(src, dst, width, height, srcPitch)
|
||||
├── GetD3D12CUDAPointer(ID3D12Resource*)
|
||||
└── SignalD3D12Fence(value)
|
||||
|
||||
ExternalMemoryCache
|
||||
├── std::map<ID3D12Resource*, CachedEntry>
|
||||
└── Methods:
|
||||
├── GetOrCreateExternalMemory(resource)
|
||||
└── ReleaseAll()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Refactored Code Structure
|
||||
|
||||
### 1. NVDECAV1Decoder.cpp (Main decoder - simplified)
|
||||
|
||||
**Before**: 500+ lines in DecodeToSurface()
|
||||
|
||||
**After**: Clean routing logic
|
||||
|
||||
```cpp
|
||||
bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_size,
|
||||
void* target_surface, SurfaceType target_type)
|
||||
{
|
||||
// Step 1: Decode packet to NVDEC internal buffer
|
||||
if (!DecodePacket(packet_data, packet_size)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 2: Get decoded frame info
|
||||
DecodedFrameInfo frame_info;
|
||||
if (!GetDecodedFrame(&frame_info)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 3: Copy to target surface based on type
|
||||
bool result = false;
|
||||
switch (target_type) {
|
||||
case SURFACE_TYPE_CPU:
|
||||
result = CopyToCPUSurface(frame_info, target_surface);
|
||||
break;
|
||||
case SURFACE_TYPE_D3D12:
|
||||
result = CopyToD3D12Surface(frame_info, target_surface);
|
||||
break;
|
||||
case SURFACE_TYPE_D3D11:
|
||||
result = CopyToD3D11Surface(frame_info, target_surface);
|
||||
break;
|
||||
case SURFACE_TYPE_CUDA:
|
||||
result = CopyToCUDASurface(frame_info, target_surface);
|
||||
break;
|
||||
}
|
||||
|
||||
// Step 4: Cleanup
|
||||
cuvidUnmapVideoFrame(m_decoder, frame_info.device_ptr);
|
||||
return result;
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Private Helper Methods (in NVDECAV1Decoder.cpp)
|
||||
|
||||
```cpp
|
||||
// Decode packet using cuvidParseVideoData
|
||||
// Returns: true on success
|
||||
// Complexity: ~30 lines
|
||||
private:
|
||||
bool DecodePacket(const uint8_t* data, size_t size)
|
||||
{
|
||||
CUVIDSOURCEDATAPACKET packet = {};
|
||||
packet.payload = data;
|
||||
packet.payload_size = size;
|
||||
packet.flags = CUVID_PKT_TIMESTAMP;
|
||||
|
||||
CUresult result = cuvidParseVideoData(m_parser, &packet);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LogError("cuvidParseVideoData failed: %d", result);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get decoded frame from internal queue
|
||||
// Returns: true if frame available
|
||||
// Complexity: ~40 lines
|
||||
private:
|
||||
struct DecodedFrameInfo {
|
||||
CUdeviceptr device_ptr;
|
||||
uint32_t pitch;
|
||||
uint32_t width;
|
||||
uint32_t height;
|
||||
};
|
||||
|
||||
bool GetDecodedFrame(DecodedFrameInfo* out_info)
|
||||
{
|
||||
if (m_frameQueue.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int frame_index = m_frameQueue.front();
|
||||
m_frameQueue.pop();
|
||||
|
||||
CUVIDPROCPARAMS proc_params = {};
|
||||
proc_params.progressive_frame = 1;
|
||||
|
||||
CUdeviceptr device_ptr;
|
||||
unsigned int pitch;
|
||||
CUresult result = cuvidMapVideoFrame(m_decoder, frame_index,
|
||||
&device_ptr, &pitch, &proc_params);
|
||||
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LogError("cuvidMapVideoFrame failed: %d", result);
|
||||
return false;
|
||||
}
|
||||
|
||||
out_info->device_ptr = device_ptr;
|
||||
out_info->pitch = pitch;
|
||||
out_info->width = m_width;
|
||||
out_info->height = m_height;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Copy to D3D12 surface (delegates to handler)
|
||||
// Returns: true on success
|
||||
// Complexity: ~20 lines
|
||||
private:
|
||||
bool CopyToD3D12Surface(const DecodedFrameInfo& frame, void* surface)
|
||||
{
|
||||
auto* d3d12_resource = static_cast<ID3D12Resource*>(surface);
|
||||
|
||||
// Create handler on-demand
|
||||
if (!m_d3d12Handler) {
|
||||
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
|
||||
m_d3d12Device, m_cudaContext
|
||||
);
|
||||
}
|
||||
|
||||
return m_d3d12Handler->CopyNV12Frame(
|
||||
frame.device_ptr,
|
||||
frame.pitch,
|
||||
d3d12_resource,
|
||||
frame.width,
|
||||
frame.height
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
### 3. D3D12SurfaceHandler.h (D3D12-specific operations)
|
||||
|
||||
```cpp
|
||||
#pragma once
|
||||
|
||||
#include <d3d12.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <memory>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
// Forward declaration
|
||||
class ExternalMemoryCache;
|
||||
|
||||
class D3D12SurfaceHandler {
|
||||
public:
|
||||
D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context);
|
||||
~D3D12SurfaceHandler();
|
||||
|
||||
// Copy NV12 frame from CUDA to D3D12 texture
|
||||
// Returns: true on success
|
||||
bool CopyNV12Frame(CUdeviceptr src_frame,
|
||||
uint32_t src_pitch,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height);
|
||||
|
||||
// Signal D3D12 fence from CUDA stream
|
||||
// Returns: true on success
|
||||
bool SignalD3D12Fence(uint64_t fence_value);
|
||||
|
||||
private:
|
||||
// Get CUDA device pointer for D3D12 resource (uses cache)
|
||||
bool GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr);
|
||||
|
||||
// Copy Y plane (8-bit single channel)
|
||||
bool CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height);
|
||||
|
||||
// Copy UV plane (8-bit dual channel, interleaved)
|
||||
bool CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
std::unique_ptr<ExternalMemoryCache> m_cache;
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
```
|
||||
|
||||
### 4. D3D12SurfaceHandler.cpp (Implementation)
|
||||
|
||||
```cpp
|
||||
#include "D3D12SurfaceHandler.h"
|
||||
#include "ExternalMemoryCache.h"
|
||||
#include <stdio.h>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context)
|
||||
: m_device(device)
|
||||
, m_cudaContext(cuda_context)
|
||||
, m_cache(std::make_unique<ExternalMemoryCache>(device, cuda_context))
|
||||
{
|
||||
}
|
||||
|
||||
D3D12SurfaceHandler::~D3D12SurfaceHandler()
|
||||
{
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame,
|
||||
uint32_t src_pitch,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height)
|
||||
{
|
||||
// Get CUDA pointer for D3D12 resource
|
||||
CUdeviceptr dst_ptr = 0;
|
||||
if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get D3D12 texture layout
|
||||
D3D12_RESOURCE_DESC desc = dst_texture->GetDesc();
|
||||
D3D12_PLACED_SUBRESOURCE_FOOTPRINT layouts[2];
|
||||
UINT num_rows[2] = {0};
|
||||
UINT64 row_sizes[2] = {0};
|
||||
UINT64 total_bytes = 0;
|
||||
|
||||
m_device->GetCopyableFootprints(&desc, 0, 2, 0,
|
||||
layouts, num_rows, row_sizes, &total_bytes);
|
||||
|
||||
// Copy Y plane
|
||||
if (!CopyYPlane(src_frame, src_pitch,
|
||||
dst_ptr, layouts[0].Footprint.RowPitch,
|
||||
width, height)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy UV plane
|
||||
CUdeviceptr src_uv = src_frame + (src_pitch * height);
|
||||
CUdeviceptr dst_uv = dst_ptr + layouts[1].Offset;
|
||||
|
||||
if (!CopyUVPlane(src_uv, src_pitch,
|
||||
dst_uv, layouts[1].Footprint.RowPitch,
|
||||
width, height / 2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::GetD3D12CUDAPointer(ID3D12Resource* resource,
|
||||
CUdeviceptr* out_ptr)
|
||||
{
|
||||
return m_cache->GetOrCreateExternalMemory(resource, out_ptr);
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
(void*)dst, dst_pitch,
|
||||
(void*)src, src_pitch,
|
||||
width, height, // Copy only valid pixels, not padding
|
||||
cudaMemcpyDeviceToDevice
|
||||
);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
printf("[D3D12] Y plane copy failed: %d\n", err);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height)
|
||||
{
|
||||
// NV12 UV plane: interleaved U and V, so width in bytes = width of Y plane
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
(void*)dst, dst_pitch,
|
||||
(void*)src, src_pitch,
|
||||
width, height, // UV plane has same width in bytes, half height
|
||||
cudaMemcpyDeviceToDevice
|
||||
);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
printf("[D3D12] UV plane copy failed: %d\n", err);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace VavCore
|
||||
```
|
||||
|
||||
### 5. ExternalMemoryCache.h (CUDA-D3D12 interop cache)
|
||||
|
||||
```cpp
|
||||
#pragma once
|
||||
|
||||
#include <d3d12.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <map>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
class ExternalMemoryCache {
|
||||
public:
|
||||
ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context);
|
||||
~ExternalMemoryCache();
|
||||
|
||||
// Get or create CUDA device pointer for D3D12 resource
|
||||
// Returns: true on success
|
||||
bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr);
|
||||
|
||||
// Release specific resource
|
||||
void Release(ID3D12Resource* resource);
|
||||
|
||||
// Release all cached resources
|
||||
void ReleaseAll();
|
||||
|
||||
private:
|
||||
struct CachedEntry {
|
||||
cudaExternalMemory_t external_memory;
|
||||
CUdeviceptr device_ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
bool ImportD3D12Resource(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
CUdeviceptr* out_ptr);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
std::map<ID3D12Resource*, CachedEntry> m_cache;
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key Improvements
|
||||
|
||||
### Readability
|
||||
**Before**:
|
||||
- `DecodeToSurface()`: 500+ lines with 5 levels of nesting
|
||||
- Mixed concerns: decoding, copying, caching, signaling
|
||||
|
||||
**After**:
|
||||
- `DecodeToSurface()`: 40 lines, clear 4-step process
|
||||
- Each helper method: 20-60 lines, single responsibility
|
||||
|
||||
### Debugging
|
||||
**Before**:
|
||||
- NV12 stride bug hidden in 500 lines of mixed logic
|
||||
- Hard to locate which `cudaMemcpy2D` call is wrong
|
||||
|
||||
**After**:
|
||||
- `CopyYPlane()` and `CopyUVPlane()` are separate methods
|
||||
- Easy to add breakpoint and inspect parameters
|
||||
- Clear separation of Y and UV plane logic
|
||||
|
||||
### Testing
|
||||
**Before**:
|
||||
- Cannot test D3D12 copying without full decoder setup
|
||||
- Cannot mock CUDA operations
|
||||
|
||||
**After**:
|
||||
- Can unit test `D3D12SurfaceHandler` independently
|
||||
- Can test `ExternalMemoryCache` in isolation
|
||||
- Easy to add mock implementations
|
||||
|
||||
### Maintenance
|
||||
**Before**:
|
||||
- Adding D3D11 support requires modifying 500+ line method
|
||||
- Risk of breaking existing D3D12 code
|
||||
|
||||
**After**:
|
||||
- Add new `D3D11SurfaceHandler` class
|
||||
- Existing D3D12 code untouched
|
||||
- Clean separation of concerns
|
||||
|
||||
---
|
||||
|
||||
## File Size Comparison
|
||||
|
||||
| File | Before | After |
|
||||
|------|--------|-------|
|
||||
| NVDECAV1Decoder.cpp | 1,722 lines | ~600 lines |
|
||||
| D3D12SurfaceHandler.cpp | - | ~300 lines |
|
||||
| ExternalMemoryCache.cpp | - | ~200 lines |
|
||||
| **Total** | 1,722 lines | 1,100 lines |
|
||||
|
||||
**Reduction**: 36% code reduction while improving readability
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Extract D3D12 Handler (2-3 hours)
|
||||
1. Create `D3D12SurfaceHandler.h/.cpp`
|
||||
2. Move D3D12 resource import logic
|
||||
3. Move NV12 plane copying logic
|
||||
4. Test with existing Vav2Player
|
||||
|
||||
**Acceptance Criteria**:
|
||||
- Vav2Player displays video correctly
|
||||
- No memory leaks
|
||||
- Performance same or better
|
||||
|
||||
### Phase 2: Extract External Memory Cache (1-2 hours)
|
||||
1. Create `ExternalMemoryCache.h/.cpp`
|
||||
2. Move external memory caching logic
|
||||
3. Add proper cleanup on resource release
|
||||
4. Test memory management
|
||||
|
||||
**Acceptance Criteria**:
|
||||
- Cache hit/miss working correctly
|
||||
- No memory leaks on repeated loads
|
||||
- Cache cleared on decoder cleanup
|
||||
|
||||
### Phase 3: Refactor Main Decoder (1-2 hours)
|
||||
1. Simplify `DecodeToSurface()` to routing logic
|
||||
2. Extract `DecodePacket()` method
|
||||
3. Extract `GetDecodedFrame()` method
|
||||
4. Extract `CopyToCPUSurface()` method
|
||||
5. Test all surface types
|
||||
|
||||
**Acceptance Criteria**:
|
||||
- All surface types working
|
||||
- Code passes all existing tests
|
||||
- Debug logging reduced
|
||||
|
||||
### Phase 4: Fix NV12 Stride Bug (30 minutes)
|
||||
1. Fix `CopyYPlane()` width parameter
|
||||
2. Fix `CopyUVPlane()` width parameter
|
||||
3. Verify with test video
|
||||
|
||||
**Acceptance Criteria**:
|
||||
- No stripe pattern in displayed video
|
||||
- Correct colors displayed
|
||||
- Performance maintained
|
||||
|
||||
---
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests
|
||||
```cpp
|
||||
TEST(D3D12SurfaceHandler, CopiesNV12FrameCorrectly)
|
||||
{
|
||||
auto handler = CreateTestHandler();
|
||||
auto src_frame = CreateTestNV12Frame(1920, 1080);
|
||||
auto dst_texture = CreateTestD3D12Texture(1920, 1080);
|
||||
|
||||
bool result = handler->CopyNV12Frame(
|
||||
src_frame.device_ptr, src_frame.pitch,
|
||||
dst_texture, 1920, 1080
|
||||
);
|
||||
|
||||
EXPECT_TRUE(result);
|
||||
VerifyNV12Data(dst_texture);
|
||||
}
|
||||
|
||||
TEST(ExternalMemoryCache, ReusesExistingEntry)
|
||||
{
|
||||
auto cache = CreateTestCache();
|
||||
auto resource = CreateTestD3D12Resource();
|
||||
|
||||
CUdeviceptr ptr1, ptr2;
|
||||
cache->GetOrCreateExternalMemory(resource, &ptr1);
|
||||
cache->GetOrCreateExternalMemory(resource, &ptr2);
|
||||
|
||||
EXPECT_EQ(ptr1, ptr2); // Should return same pointer
|
||||
}
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
- Load video file
|
||||
- Decode multiple frames
|
||||
- Verify no memory leaks
|
||||
- Verify correct video display
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
- [x] Design document complete
|
||||
- [ ] Phase 1 complete: D3D12SurfaceHandler working
|
||||
- [ ] Phase 2 complete: ExternalMemoryCache working
|
||||
- [ ] Phase 3 complete: Main decoder simplified
|
||||
- [ ] Phase 4 complete: NV12 stripe bug fixed
|
||||
- [ ] All existing tests passing
|
||||
- [ ] No performance regression
|
||||
- [ ] Code review passed
|
||||
- [ ] Documentation updated
|
||||
|
||||
---
|
||||
|
||||
**Next Step**: Start Phase 1 - Extract D3D12SurfaceHandler
|
||||
|
||||
**Last Updated**: 2025-10-03
|
||||
@@ -125,6 +125,8 @@
|
||||
<ClInclude Include="src\Decoder\AdaptiveAV1Decoder.h" />
|
||||
<ClInclude Include="src\Decoder\AdaptiveNVDECDecoder.h" />
|
||||
<ClInclude Include="src\Decoder\NVDECAV1Decoder.h" />
|
||||
<ClInclude Include="src\Decoder\D3D12SurfaceHandler.h" />
|
||||
<ClInclude Include="src\Decoder\ExternalMemoryCache.h" />
|
||||
<ClInclude Include="src\Decoder\MediaFoundationAV1Decoder.h" />
|
||||
<ClInclude Include="src\Decoder\AMFAV1Decoder.h" />
|
||||
<ClInclude Include="src\Decoder\VPLAV1Decoder.h" />
|
||||
@@ -143,6 +145,8 @@
|
||||
<ClCompile Include="src\Decoder\AdaptiveAV1Decoder.cpp" />
|
||||
<ClCompile Include="src\Decoder\AdaptiveNVDECDecoder.cpp" />
|
||||
<ClCompile Include="src\Decoder\NVDECAV1Decoder.cpp" />
|
||||
<ClCompile Include="src\Decoder\D3D12SurfaceHandler.cpp" />
|
||||
<ClCompile Include="src\Decoder\ExternalMemoryCache.cpp" />
|
||||
<ClCompile Include="src\Decoder\MediaFoundationAV1Decoder.cpp" />
|
||||
<ClCompile Include="src\Decoder\AMFAV1Decoder.cpp" />
|
||||
<ClCompile Include="src\Decoder\VPLAV1Decoder.cpp" />
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
#include "pch.h"
|
||||
#include "D3D12SurfaceHandler.h"
|
||||
#include "ExternalMemoryCache.h"
|
||||
#include <stdio.h>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context)
|
||||
: m_device(device)
|
||||
, m_cudaContext(cuda_context)
|
||||
, m_cache(std::make_unique<ExternalMemoryCache>(device, cuda_context))
|
||||
{
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] Created\n");
|
||||
}
|
||||
|
||||
D3D12SurfaceHandler::~D3D12SurfaceHandler()
|
||||
{
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] Destroyed\n");
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyNV12Frame(CUdeviceptr src_frame,
|
||||
uint32_t src_pitch,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height)
|
||||
{
|
||||
// Get CUDA pointer for D3D12 resource
|
||||
CUdeviceptr dst_ptr = 0;
|
||||
if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) {
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] Failed to get CUDA pointer for D3D12 resource\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get D3D12 texture layout for both Y and UV planes
|
||||
D3D12_RESOURCE_DESC desc = dst_texture->GetDesc();
|
||||
D3D12_PLACED_SUBRESOURCE_FOOTPRINT layouts[2];
|
||||
UINT num_rows[2] = {0};
|
||||
UINT64 row_sizes[2] = {0};
|
||||
UINT64 total_bytes = 0;
|
||||
|
||||
// CRITICAL: Use actual video height, not texture allocation height
|
||||
// Texture was allocated with height * 1.5 for NV12 layout, but GetCopyableFootprints
|
||||
// should use actual video height to calculate correct plane offsets
|
||||
D3D12_RESOURCE_DESC actual_video_desc = desc;
|
||||
actual_video_desc.Height = height; // Use actual video height, not allocation height
|
||||
|
||||
// CRITICAL: Query BOTH subresources in a SINGLE call to get correct relative offsets
|
||||
// NV12 has 2 subresources (plane 0: Y, plane 1: UV)
|
||||
m_device->GetCopyableFootprints(&actual_video_desc, 0, 2, 0,
|
||||
layouts, num_rows, row_sizes, &total_bytes);
|
||||
|
||||
uint32_t y_dst_pitch = layouts[0].Footprint.RowPitch;
|
||||
uint32_t uv_dst_pitch = layouts[1].Footprint.RowPitch;
|
||||
UINT64 uv_offset = layouts[1].Offset;
|
||||
|
||||
char buf[512];
|
||||
sprintf_s(buf, "[D3D12SurfaceHandler] Y: width=%u, height=%u, srcPitch=%u, dstPitch=%u\n",
|
||||
width, height, src_pitch, y_dst_pitch);
|
||||
OutputDebugStringA(buf);
|
||||
|
||||
sprintf_s(buf, "[D3D12SurfaceHandler] UV: width=%u, height=%u, srcPitch=%u, dstPitch=%u, offset=%llu\n",
|
||||
width, height / 2, src_pitch, uv_dst_pitch, uv_offset);
|
||||
OutputDebugStringA(buf);
|
||||
|
||||
// Copy Y plane
|
||||
if (!CopyYPlane(src_frame, src_pitch,
|
||||
dst_ptr, y_dst_pitch,
|
||||
width, height)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy UV plane
|
||||
CUdeviceptr src_uv = src_frame + (src_pitch * height);
|
||||
CUdeviceptr dst_uv = dst_ptr + uv_offset;
|
||||
|
||||
if (!CopyUVPlane(src_uv, src_pitch,
|
||||
dst_uv, uv_dst_pitch,
|
||||
width, height / 2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] NV12 frame copied successfully\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::SignalD3D12Fence(uint64_t fence_value)
|
||||
{
|
||||
// TODO: Implement fence signaling
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] Fence signaling not implemented\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr)
|
||||
{
|
||||
return m_cache->GetOrCreateExternalMemory(resource, out_ptr);
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height)
|
||||
{
|
||||
// Copy Y plane: single 8-bit channel
|
||||
// Width parameter is in bytes (same as pixel width for 8-bit format)
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
(void*)dst, dst_pitch,
|
||||
(void*)src, src_pitch,
|
||||
width, height,
|
||||
cudaMemcpyDeviceToDevice
|
||||
);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
char buf[256];
|
||||
sprintf_s(buf, "[D3D12SurfaceHandler] Y plane copy failed: %d (%s)\n",
|
||||
err, cudaGetErrorString(err));
|
||||
OutputDebugStringA(buf);
|
||||
return false;
|
||||
}
|
||||
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] Y plane copied\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height)
|
||||
{
|
||||
// Copy UV plane: interleaved U and V (NV12 format)
|
||||
// Width in bytes = width of Y plane (because U and V are interleaved)
|
||||
// Height = half of Y plane height
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
(void*)dst, dst_pitch,
|
||||
(void*)src, src_pitch,
|
||||
width, height,
|
||||
cudaMemcpyDeviceToDevice
|
||||
);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
char buf[256];
|
||||
sprintf_s(buf, "[D3D12SurfaceHandler] UV plane copy failed: %d (%s)\n",
|
||||
err, cudaGetErrorString(err));
|
||||
OutputDebugStringA(buf);
|
||||
return false;
|
||||
}
|
||||
|
||||
OutputDebugStringA("[D3D12SurfaceHandler] UV plane copied\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace VavCore
|
||||
@@ -0,0 +1,51 @@
|
||||
#pragma once
|
||||
|
||||
#include <d3d12.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <memory>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
// Forward declaration
|
||||
class ExternalMemoryCache;
|
||||
|
||||
// Handler for D3D12 surface operations
|
||||
// Manages CUDA-D3D12 interop and NV12 frame copying
|
||||
class D3D12SurfaceHandler {
|
||||
public:
|
||||
D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context);
|
||||
~D3D12SurfaceHandler();
|
||||
|
||||
// Copy NV12 frame from CUDA to D3D12 texture
|
||||
// Returns true on success
|
||||
bool CopyNV12Frame(CUdeviceptr src_frame,
|
||||
uint32_t src_pitch,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height);
|
||||
|
||||
// Signal D3D12 fence from CUDA stream (not implemented yet)
|
||||
bool SignalD3D12Fence(uint64_t fence_value);
|
||||
|
||||
private:
|
||||
// Get CUDA device pointer for D3D12 resource (uses cache)
|
||||
bool GetD3D12CUDAPointer(ID3D12Resource* resource, CUdeviceptr* out_ptr);
|
||||
|
||||
// Copy Y plane (8-bit single channel)
|
||||
bool CopyYPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height);
|
||||
|
||||
// Copy UV plane (8-bit dual channel, interleaved)
|
||||
bool CopyUVPlane(CUdeviceptr src, uint32_t src_pitch,
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
std::unique_ptr<ExternalMemoryCache> m_cache;
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
@@ -0,0 +1,140 @@
|
||||
#include "pch.h"
|
||||
#include "ExternalMemoryCache.h"
|
||||
#include <windows.h>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
ExternalMemoryCache::ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context)
|
||||
: m_device(device)
|
||||
, m_cudaContext(cuda_context)
|
||||
{
|
||||
}
|
||||
|
||||
ExternalMemoryCache::~ExternalMemoryCache()
|
||||
{
|
||||
ReleaseAll();
|
||||
}
|
||||
|
||||
bool ExternalMemoryCache::GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr)
|
||||
{
|
||||
if (!resource || !out_ptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check cache first
|
||||
auto it = m_cache.find(resource);
|
||||
if (it != m_cache.end()) {
|
||||
*out_ptr = it->second.device_ptr;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Not in cache, import resource
|
||||
cudaExternalMemory_t external_memory;
|
||||
CUdeviceptr device_ptr;
|
||||
|
||||
if (!ImportD3D12Resource(resource, &external_memory, &device_ptr)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Add to cache
|
||||
CachedEntry entry;
|
||||
entry.external_memory = external_memory;
|
||||
entry.device_ptr = device_ptr;
|
||||
entry.size = 0; // Size is stored but not used currently
|
||||
|
||||
m_cache[resource] = entry;
|
||||
*out_ptr = device_ptr;
|
||||
|
||||
OutputDebugStringA("[ExternalMemoryCache] New resource cached\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
void ExternalMemoryCache::Release(ID3D12Resource* resource)
|
||||
{
|
||||
auto it = m_cache.find(resource);
|
||||
if (it == m_cache.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Destroy external memory
|
||||
cudaDestroyExternalMemory(it->second.external_memory);
|
||||
m_cache.erase(it);
|
||||
|
||||
OutputDebugStringA("[ExternalMemoryCache] Resource released\n");
|
||||
}
|
||||
|
||||
void ExternalMemoryCache::ReleaseAll()
|
||||
{
|
||||
for (auto& pair : m_cache) {
|
||||
cudaDestroyExternalMemory(pair.second.external_memory);
|
||||
}
|
||||
m_cache.clear();
|
||||
|
||||
OutputDebugStringA("[ExternalMemoryCache] All resources released\n");
|
||||
}
|
||||
|
||||
bool ExternalMemoryCache::ImportD3D12Resource(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
CUdeviceptr* out_ptr)
|
||||
{
|
||||
// Create shared handle for D3D12 resource
|
||||
HANDLE shared_handle = nullptr;
|
||||
HRESULT hr = m_device->CreateSharedHandle(resource, nullptr, GENERIC_ALL, nullptr, &shared_handle);
|
||||
|
||||
if (FAILED(hr)) {
|
||||
char buf[256];
|
||||
sprintf_s(buf, "[ExternalMemoryCache] CreateSharedHandle failed: 0x%08X\n", hr);
|
||||
OutputDebugStringA(buf);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get resource allocation size
|
||||
D3D12_RESOURCE_DESC desc = resource->GetDesc();
|
||||
D3D12_RESOURCE_ALLOCATION_INFO alloc_info = m_device->GetResourceAllocationInfo(0, 1, &desc);
|
||||
|
||||
// Import external memory
|
||||
cudaExternalMemoryHandleDesc mem_desc = {};
|
||||
mem_desc.type = cudaExternalMemoryHandleTypeD3D12Heap; // Use heap type for shared resources
|
||||
mem_desc.handle.win32.handle = shared_handle;
|
||||
mem_desc.handle.win32.name = nullptr; // No name for unnamed shared handle
|
||||
mem_desc.size = alloc_info.SizeInBytes;
|
||||
mem_desc.flags = cudaExternalMemoryDedicated; // Mark as dedicated allocation
|
||||
|
||||
cudaExternalMemory_t external_memory;
|
||||
cudaError_t err = cudaImportExternalMemory(&external_memory, &mem_desc);
|
||||
|
||||
CloseHandle(shared_handle);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
char buf[512];
|
||||
sprintf_s(buf, "[ExternalMemoryCache] cudaImportExternalMemory failed: %d (%s)\n"
|
||||
" type=%d, handle=%p, size=%llu, flags=%u\n",
|
||||
err, cudaGetErrorString(err),
|
||||
mem_desc.type, mem_desc.handle.win32.handle, mem_desc.size, mem_desc.flags);
|
||||
OutputDebugStringA(buf);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Map to device pointer
|
||||
cudaExternalMemoryBufferDesc buffer_desc = {};
|
||||
buffer_desc.size = mem_desc.size;
|
||||
|
||||
CUdeviceptr device_ptr;
|
||||
err = cudaExternalMemoryGetMappedBuffer((void**)&device_ptr, external_memory, &buffer_desc);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
char buf[256];
|
||||
sprintf_s(buf, "[ExternalMemoryCache] cudaExternalMemoryGetMappedBuffer failed: %d\n", err);
|
||||
OutputDebugStringA(buf);
|
||||
cudaDestroyExternalMemory(external_memory);
|
||||
return false;
|
||||
}
|
||||
|
||||
*out_ext_mem = external_memory;
|
||||
*out_ptr = device_ptr;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace VavCore
|
||||
@@ -0,0 +1,48 @@
|
||||
#pragma once
|
||||
|
||||
#include <d3d12.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <map>
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
// Cache for CUDA-D3D12 interop external memory
|
||||
// Manages cudaExternalMemory_t objects and their associated CUDA device pointers
|
||||
class ExternalMemoryCache {
|
||||
public:
|
||||
ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context);
|
||||
~ExternalMemoryCache();
|
||||
|
||||
// Get or create CUDA device pointer for D3D12 resource
|
||||
// Returns true on success, false on failure
|
||||
bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr);
|
||||
|
||||
// Release specific resource from cache
|
||||
void Release(ID3D12Resource* resource);
|
||||
|
||||
// Release all cached resources
|
||||
void ReleaseAll();
|
||||
|
||||
// Get cache size
|
||||
size_t GetCacheSize() const { return m_cache.size(); }
|
||||
|
||||
private:
|
||||
struct CachedEntry {
|
||||
cudaExternalMemory_t external_memory;
|
||||
CUdeviceptr device_ptr;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// Import D3D12 resource as CUDA external memory
|
||||
bool ImportD3D12Resource(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
CUdeviceptr* out_ptr);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
std::map<ID3D12Resource*, CachedEntry> m_cache;
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
// 3rd: User defined
|
||||
#include "NVDECAV1Decoder.h"
|
||||
#include "D3D12SurfaceHandler.h"
|
||||
#include "VideoDecoderFactory.h"
|
||||
|
||||
const char* g_deinterleave_kernel_ptx = R"PTX(
|
||||
@@ -183,14 +184,9 @@ bool NVDECAV1Decoder::Initialize(const VideoMetadata& metadata) {
|
||||
}
|
||||
|
||||
void NVDECAV1Decoder::Cleanup() {
|
||||
// Clean up cached external memory
|
||||
if (m_cachedExternalMemory != nullptr) {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::Cleanup] Destroying cached external memory\n");
|
||||
cudaDestroyExternalMemory(m_cachedExternalMemory);
|
||||
m_cachedExternalMemory = nullptr;
|
||||
m_cachedDevicePtr = nullptr;
|
||||
m_cachedD3D12Resource = nullptr;
|
||||
}
|
||||
// Clean up D3D12 surface handler (releases all cached external memory)
|
||||
m_d3d12Handler.reset();
|
||||
OutputDebugStringA("[NVDECAV1Decoder::Cleanup] D3D12SurfaceHandler released\n");
|
||||
|
||||
// Clean up D3D12 synchronization objects
|
||||
if (m_cudaSemaphore != nullptr) {
|
||||
@@ -1165,7 +1161,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
output_frame.frame_index = m_framesDecoded;
|
||||
output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
|
||||
} else if (target_type == VAVCORE_SURFACE_D3D12_RESOURCE) {
|
||||
// D3D12 resource path using CUDA External Memory API
|
||||
// D3D12 resource path using D3D12SurfaceHandler
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 resource path\n");
|
||||
|
||||
if (!target_surface || !m_d3d12Device) {
|
||||
@@ -1174,107 +1170,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
return false;
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] target_surface=%p, m_d3d12Device=%p\n",
|
||||
target_surface, m_d3d12Device);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Cast target_surface to ID3D12Resource*
|
||||
ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
|
||||
|
||||
// Check if we can reuse cached external memory
|
||||
cudaExternalMemory_t externalMemory = nullptr;
|
||||
void* devicePtr = nullptr;
|
||||
HANDLE sharedHandle = nullptr;
|
||||
ID3D12Device* device = static_cast<ID3D12Device*>(m_d3d12Device);
|
||||
D3D12_RESOURCE_DESC d3d12Desc = d3d12Resource->GetDesc();
|
||||
D3D12_RESOURCE_ALLOCATION_INFO allocInfo = device->GetResourceAllocationInfo(0, 1, &d3d12Desc);
|
||||
|
||||
if (m_cachedD3D12Resource == d3d12Resource && m_cachedExternalMemory != nullptr && m_cachedDevicePtr != nullptr) {
|
||||
// Reuse cached external memory
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Reusing cached external memory (resource=%p)\n", d3d12Resource);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
externalMemory = m_cachedExternalMemory;
|
||||
devicePtr = m_cachedDevicePtr;
|
||||
} else {
|
||||
// Create new external memory import
|
||||
if (m_cachedExternalMemory != nullptr) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Resource changed (%p -> %p), destroying old external memory\n",
|
||||
m_cachedD3D12Resource, d3d12Resource);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Clean up old external memory
|
||||
cudaDestroyExternalMemory(m_cachedExternalMemory);
|
||||
m_cachedExternalMemory = nullptr;
|
||||
m_cachedDevicePtr = nullptr;
|
||||
m_cachedD3D12Resource = nullptr;
|
||||
}
|
||||
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Creating new external memory import...\n");
|
||||
|
||||
// Get Windows shared handle for D3D12 resource
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Getting shared handle for D3D12 resource...\n");
|
||||
HRESULT hr = device->CreateSharedHandle(d3d12Resource, nullptr, GENERIC_ALL, nullptr, &sharedHandle);
|
||||
if (FAILED(hr)) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] CreateSharedHandle failed with HRESULT 0x%08X\n", hr);
|
||||
OutputDebugStringA(debug_buf);
|
||||
return false;
|
||||
}
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Shared handle created: %p\n", sharedHandle);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Import D3D12 resource as CUDA external memory using CUDA Runtime API
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Importing D3D12 resource as CUDA external memory...\n");
|
||||
|
||||
cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {};
|
||||
externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeD3D12Heap; // Use heap type for shared resources
|
||||
externalMemoryHandleDesc.handle.win32.handle = sharedHandle;
|
||||
externalMemoryHandleDesc.handle.win32.name = nullptr; // No name for unnamed shared handle
|
||||
externalMemoryHandleDesc.size = allocInfo.SizeInBytes; // Use actual D3D12 allocation size
|
||||
externalMemoryHandleDesc.flags = cudaExternalMemoryDedicated; // Mark as dedicated allocation
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] D3D12 allocation size: %llu bytes (vs naive %u bytes)\n",
|
||||
allocInfo.SizeInBytes, m_width * (m_height + m_height / 2));
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
cudaError_t cudaStatus = cudaImportExternalMemory(&externalMemory, &externalMemoryHandleDesc);
|
||||
if (cudaStatus != cudaSuccess) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaImportExternalMemory failed with code %d: %s\n",
|
||||
cudaStatus, cudaGetErrorString(cudaStatus));
|
||||
OutputDebugStringA(debug_buf);
|
||||
CloseHandle(sharedHandle);
|
||||
return false;
|
||||
}
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 resource imported as CUDA external memory\n");
|
||||
|
||||
// Map external memory to CUDA device pointer
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Mapping CUDA external memory to device pointer...\n");
|
||||
cudaExternalMemoryBufferDesc externalMemoryBufferDesc = {};
|
||||
externalMemoryBufferDesc.offset = 0;
|
||||
externalMemoryBufferDesc.size = externalMemoryHandleDesc.size;
|
||||
externalMemoryBufferDesc.flags = 0;
|
||||
|
||||
cudaStatus = cudaExternalMemoryGetMappedBuffer(&devicePtr, externalMemory, &externalMemoryBufferDesc);
|
||||
if (cudaStatus != cudaSuccess) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaExternalMemoryGetMappedBuffer failed with code %d: %s\n",
|
||||
cudaStatus, cudaGetErrorString(cudaStatus));
|
||||
OutputDebugStringA(debug_buf);
|
||||
cudaDestroyExternalMemory(externalMemory);
|
||||
CloseHandle(sharedHandle);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Cache the external memory for reuse
|
||||
m_cachedD3D12Resource = d3d12Resource;
|
||||
m_cachedExternalMemory = externalMemory;
|
||||
m_cachedDevicePtr = devicePtr;
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Cached external memory (resource=%p, extMem=%p, devPtr=%p)\n",
|
||||
d3d12Resource, externalMemory, devicePtr);
|
||||
OutputDebugStringA(debug_buf);
|
||||
// Create D3D12 surface handler on-demand
|
||||
if (!m_d3d12Handler) {
|
||||
m_d3d12Handler = std::make_unique<D3D12SurfaceHandler>(
|
||||
static_cast<ID3D12Device*>(m_d3d12Device),
|
||||
m_cuContext
|
||||
);
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12SurfaceHandler created\n");
|
||||
}
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] CUDA device pointer mapped: %p\n", devicePtr);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Wait for decoded frame to be available (blocks until callback completes)
|
||||
int frameIdx = -1;
|
||||
@@ -1299,18 +1202,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
CUdeviceptr srcDevicePtr = 0;
|
||||
unsigned int srcPitch = 0;
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Calling cuvidMapVideoFrame (frameIdx=%d)\n", frameIdx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidMapVideoFrame failed with code %d\n", result);
|
||||
OutputDebugStringA(debug_buf);
|
||||
LogCUDAError(result, "cuvidMapVideoFrame");
|
||||
// Don't destroy cached external memory on error - it will be cleaned up in Cleanup()
|
||||
if (sharedHandle != nullptr) {
|
||||
CloseHandle(sharedHandle);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1318,212 +1214,32 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
(void*)srcDevicePtr, srcPitch);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Query D3D12 NV12 texture pitch (may be larger than width due to alignment)
|
||||
D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout = {};
|
||||
UINT numRows = 0;
|
||||
UINT64 rowSizeInBytes = 0;
|
||||
UINT64 totalBytes = 0;
|
||||
device->GetCopyableFootprints(&d3d12Desc, 0, 1, 0, &layout, &numRows, &rowSizeInBytes, &totalBytes);
|
||||
UINT d3d12Pitch = layout.Footprint.RowPitch;
|
||||
// Copy NV12 frame from CUDA to D3D12 texture using D3D12SurfaceHandler
|
||||
ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
|
||||
bool copySuccess = m_d3d12Handler->CopyNV12Frame(
|
||||
srcDevicePtr, srcPitch,
|
||||
d3d12Resource,
|
||||
m_width, m_height
|
||||
);
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] D3D12 NV12 texture pitch: %u (width: %u, srcPitch: %u)\n",
|
||||
d3d12Pitch, m_width, srcPitch);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Copy Y plane from NVDEC to D3D12 NV12 texture (via CUDA device pointer)
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying Y plane to D3D12 texture...\n");
|
||||
cudaError_t cudaStatus = cudaMemcpy2D(devicePtr, d3d12Pitch, // Use D3D12 texture pitch
|
||||
(void*)srcDevicePtr, srcPitch,
|
||||
srcPitch, m_height, // Use pitch as width to copy the entire row
|
||||
cudaMemcpyDeviceToDevice);
|
||||
if (cudaStatus != cudaSuccess) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (Y plane) failed with code %d: %s\n",
|
||||
cudaStatus, cudaGetErrorString(cudaStatus));
|
||||
OutputDebugStringA(debug_buf);
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
// Don't destroy cached external memory on error - it will be cleaned up in Cleanup()
|
||||
if (sharedHandle != nullptr) {
|
||||
CloseHandle(sharedHandle);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Y plane copied successfully to D3D12 texture\n");
|
||||
|
||||
// Copy UV plane from NVDEC to D3D12 NV12 texture
|
||||
// Calculate proper UV plane offset using D3D12 footprint
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Calculating D3D12 UV plane offset...\n");
|
||||
|
||||
// Get proper layout for both Y and UV planes
|
||||
D3D12_RESOURCE_DESC texDesc = d3d12Resource->GetDesc();
|
||||
D3D12_PLACED_SUBRESOURCE_FOOTPRINT planeLayouts[2];
|
||||
UINT planeNumRows[2] = { 0, };
|
||||
UINT64 planeRowSizes[2] = { 0, };
|
||||
UINT64 planeTotalBytes = 0;
|
||||
|
||||
// Zero out the layout arrays to avoid reading garbage data from uninitialized memory
|
||||
memset(planeLayouts, 0, sizeof(planeLayouts));
|
||||
|
||||
// CRITICAL: Use actual video height, not texture allocation height
|
||||
// Texture was allocated with height * 1.5 for NV12 layout, but GetCopyableFootprints
|
||||
// should use actual video height to calculate correct plane offsets
|
||||
D3D12_RESOURCE_DESC actualVideoDesc = texDesc;
|
||||
actualVideoDesc.Height = m_height; // Use actual video height (2160), not allocation height (3240)
|
||||
|
||||
// CRITICAL: Query BOTH subresources in a SINGLE call to get correct relative offsets
|
||||
// NV12 has 2 subresources (plane 0: Y, plane 1: UV)
|
||||
// When querying separately, each starts at offset 0
|
||||
// When querying together, planeLayouts[1].Offset will be the actual offset from base
|
||||
device->GetCopyableFootprints(&actualVideoDesc, 0, 2, 0, planeLayouts, planeNumRows, planeRowSizes, &planeTotalBytes);
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] GetCopyableFootprints total bytes: %llu (allocation size: %llu)\n",
|
||||
planeTotalBytes, allocInfo.SizeInBytes);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Y plane: offset=%llu, pitch=%u, width=%u, height=%u, rows=%u\n",
|
||||
planeLayouts[0].Offset, planeLayouts[0].Footprint.RowPitch,
|
||||
planeLayouts[0].Footprint.Width, planeLayouts[0].Footprint.Height, planeNumRows[0]);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV plane: offset=%llu, pitch=%u, width=%u, height=%u, rows=%u, rowSize=%llu\n",
|
||||
planeLayouts[1].Offset, planeLayouts[1].Footprint.RowPitch,
|
||||
planeLayouts[1].Footprint.Width, planeLayouts[1].Footprint.Height, planeNumRows[1], planeRowSizes[1]);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying UV plane to D3D12 texture...\n");
|
||||
|
||||
// CRITICAL: Use the EXACT UV plane offset from GetCopyableFootprints
|
||||
// planeLayouts[1].Offset is the correct offset accounting for alignment
|
||||
void* uvDstPtr = static_cast<uint8_t*>(devicePtr) + planeLayouts[1].Offset;
|
||||
|
||||
// NVDEC UV plane offset: For NV12, UV plane starts at Y_height * Y_pitch
|
||||
void* uvSrcPtr = (void*)(srcDevicePtr + (srcPitch * m_height));
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] NVDEC UV source: srcDevicePtr=%p, Y_offset=%llu, UV_srcPtr=%p\n",
|
||||
(void*)srcDevicePtr, (uint64_t)(srcPitch * m_height), uvSrcPtr);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV plane offset from GetCopyableFootprints: %llu bytes\n", planeLayouts[1].Offset);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// UV plane: NV12 format has interleaved UV (UVUVUV...)
|
||||
// CRITICAL: For cudaMemcpy2D, width parameter is in BYTES, not pixels
|
||||
// UV plane stores U and V interleaved: each row has (width/2) UV pairs = width bytes total
|
||||
// Even though D3D12 sees it as (width/2) R8G8 pixels, the memory layout is still width bytes per row
|
||||
UINT uvCopyWidth = m_width; // Copy full width in bytes (includes both U and V channels)
|
||||
UINT uvCopyHeight = m_height / 2; // UV plane has half height
|
||||
|
||||
// Verify we won't exceed allocation bounds
|
||||
size_t uvPlaneNeededSize = static_cast<size_t>(planeLayouts[1].Footprint.RowPitch) * uvCopyHeight;
|
||||
size_t totalNeededSize = planeLayouts[1].Offset + uvPlaneNeededSize;
|
||||
size_t availableForUV = allocInfo.SizeInBytes - planeLayouts[1].Offset;
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Memory check: uvOffset=%llu, uvSize=%llu, total=%llu, allocated=%llu, available=%llu\n",
|
||||
planeLayouts[1].Offset, uvPlaneNeededSize, totalNeededSize, allocInfo.SizeInBytes, availableForUV);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Clamp UV copy to fit within allocated memory
|
||||
UINT maxUVRows = static_cast<UINT>(availableForUV / planeLayouts[1].Footprint.RowPitch);
|
||||
if (maxUVRows < uvCopyHeight) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] WARNING: Clamping UV rows from %u to %u to fit allocation\n",
|
||||
uvCopyHeight, maxUVRows);
|
||||
OutputDebugStringA(debug_buf);
|
||||
uvCopyHeight = maxUVRows;
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV copy params: width=%u, height=%u, dstPitch=%u, srcPitch=%u\n",
|
||||
uvCopyWidth, uvCopyHeight, planeLayouts[1].Footprint.RowPitch, srcPitch);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
cudaStatus = cudaMemcpy2D(uvDstPtr, planeLayouts[1].Footprint.RowPitch,
|
||||
uvSrcPtr, srcPitch,
|
||||
srcPitch, uvCopyHeight, // Use pitch as width to copy the entire row
|
||||
cudaMemcpyDeviceToDevice);
|
||||
if (cudaStatus != cudaSuccess) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (UV plane) failed with code %d: %s\n",
|
||||
cudaStatus, cudaGetErrorString(cudaStatus));
|
||||
OutputDebugStringA(debug_buf);
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
// Don't destroy cached external memory on error - it will be cleaned up in Cleanup()
|
||||
if (sharedHandle != nullptr) {
|
||||
CloseHandle(sharedHandle);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] UV plane copied successfully to D3D12 texture\n");
|
||||
|
||||
// Asynchronously signal the D3D12 fence from CUDA when copies are complete
|
||||
if (m_cudaSemaphore != nullptr) {
|
||||
// Ensure we're in the correct CUDA context
|
||||
CUcontext currentContext = nullptr;
|
||||
cuCtxGetCurrent(¤tContext);
|
||||
|
||||
char debug_buf[256];
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Current CUDA context: %p, Expected: %p\n",
|
||||
currentContext, m_cuContext);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
if (currentContext != m_cuContext) {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] WARNING: CUDA context mismatch, setting correct context\n");
|
||||
cuCtxSetCurrent(m_cuContext);
|
||||
}
|
||||
|
||||
m_fenceValue++;
|
||||
cudaExternalSemaphoreSignalParams signalParams = {};
|
||||
signalParams.params.fence.value = m_fenceValue;
|
||||
signalParams.flags = 0;
|
||||
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Signaling fence with value: %llu, semaphore: %p, stream: %p\n",
|
||||
m_fenceValue, m_cudaSemaphore, m_stream);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
cudaStatus = cudaSignalExternalSemaphoresAsync(&m_cudaSemaphore, &signalParams, 1, m_stream);
|
||||
if (cudaStatus != cudaSuccess) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaSignalExternalSemaphoresAsync failed: %s (error code: %d)\n",
|
||||
cudaGetErrorString(cudaStatus), cudaStatus);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Try synchronizing the stream and signaling the fence from CPU instead
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Falling back to CPU fence signaling\n");
|
||||
cudaStreamSynchronize(m_stream);
|
||||
|
||||
if (m_d3d12Fence != nullptr) {
|
||||
HRESULT hr = static_cast<ID3D12Fence*>(m_d3d12Fence)->Signal(m_fenceValue);
|
||||
if (SUCCEEDED(hr)) {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal succeeded\n");
|
||||
output_frame.sync_fence_value = m_fenceValue;
|
||||
} else {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal also failed\n");
|
||||
output_frame.sync_fence_value = 0;
|
||||
}
|
||||
} else {
|
||||
output_frame.sync_fence_value = 0;
|
||||
}
|
||||
} else {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA fence signal enqueued successfully\n");
|
||||
// Pass the fence value to the caller
|
||||
output_frame.sync_fence_value = m_fenceValue;
|
||||
}
|
||||
} else {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] No CUDA semaphore available, skipping fence signal\n");
|
||||
// No synchronization available
|
||||
output_frame.sync_fence_value = 0;
|
||||
}
|
||||
|
||||
// Unmap frame and close shared handle (if newly created)
|
||||
// Unmap frame
|
||||
result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result);
|
||||
OutputDebugStringA(debug_buf);
|
||||
LogCUDAError(result, "cuvidUnmapVideoFrame");
|
||||
// Even if unmap fails, try to continue and close the handle
|
||||
}
|
||||
|
||||
if (sharedHandle != nullptr) {
|
||||
CloseHandle(sharedHandle);
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Shared handle closed\n");
|
||||
if (!copySuccess) {
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n");
|
||||
return false;
|
||||
}
|
||||
// External memory is cached and will be reused for next frame
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 frame processing complete (external memory cached, GPU synchronized)\n");
|
||||
|
||||
// Signal D3D12 fence (not yet implemented in D3D12SurfaceHandler)
|
||||
m_d3d12Handler->SignalD3D12Fence(++m_fenceValue);
|
||||
output_frame.sync_fence_value = m_fenceValue;
|
||||
|
||||
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] D3D12 frame processing complete\n");
|
||||
|
||||
// Fill output frame metadata
|
||||
output_frame.width = m_width;
|
||||
|
||||
@@ -6,6 +6,11 @@
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
|
||||
// Forward declarations
|
||||
namespace VavCore {
|
||||
class D3D12SurfaceHandler;
|
||||
}
|
||||
|
||||
//// Prevent TIMECODE conflicts by defining it before Windows headers
|
||||
//#define WIN32_LEAN_AND_MEAN
|
||||
//#define NOMINMAX
|
||||
@@ -118,10 +123,8 @@ private:
|
||||
cudaExternalSemaphore_t m_cudaSemaphore = {};
|
||||
uint64_t m_fenceValue = 0;
|
||||
|
||||
// External memory caching for D3D12 interop
|
||||
void* m_cachedD3D12Resource = nullptr; // ID3D12Resource*
|
||||
cudaExternalMemory_t m_cachedExternalMemory = nullptr;
|
||||
void* m_cachedDevicePtr = nullptr;
|
||||
// D3D12 surface handler (created on-demand)
|
||||
std::unique_ptr<D3D12SurfaceHandler> m_d3d12Handler;
|
||||
|
||||
// Decoder configuration
|
||||
CUVIDPARSERPARAMS m_parserParams = {};
|
||||
|
||||
Reference in New Issue
Block a user