2025-10-11 02:08:57 +09:00
parent 9a7330d5fb
commit ff6b753dfe
10 changed files with 1886 additions and 357 deletions

View File

@@ -0,0 +1,416 @@
# NVDEC Decoder State Machine Refactoring Design
## Problem Statement
The current `NVDECAV1Decoder::DecodeToSurface()` has excessive complexity:
- **13+ state variables** tracked across multiple atomic flags and mutexes
- **9+ conditional branches** with nested conditions
- **~150 lines** in a single function
- **High branch complexity**: 9 independent conditions yield up to 2^9 = 512 possible execution paths
This makes the code:
- Hard to maintain and debug
- Difficult to test comprehensively
- Prone to race conditions and edge cases
- Challenging to extend with new features
## Solution: State Machine Pattern
### Core Design Principle
**Consolidate all decoder state into a single enum** with clear transitions, replacing scattered atomic flags and conditional checks.
### State Machine States
```cpp
enum class DecoderState {
    UNINITIALIZED,   // Before Initialize() is called
    READY,           // Initialized and ready for decoding
    BUFFERING,       // Initial buffering (0-15 frames)
    DECODING,        // Normal frame-by-frame decoding
    FLUSHING,        // End-of-file reached, draining DPB
    FLUSH_COMPLETE,  // All frames drained
    ERROR            // Unrecoverable error state
};
```
### State Transitions
```
UNINITIALIZED  → READY          (Initialize() called successfully)
READY          → BUFFERING      (First DecodeToSurface() call)
BUFFERING      → DECODING       (Display queue has frames)
DECODING       → FLUSHING       (End-of-file reached, NULL packet)
FLUSHING       → FLUSH_COMPLETE (Display queue empty)
FLUSH_COMPLETE → READY          (Reset() called)
*              → ERROR          (Any state can transition to ERROR on failure)
ERROR          → READY          (Reset() called)
```
### State Machine Class
```cpp
class DecoderStateMachine {
public:
    DecoderStateMachine() : m_state(DecoderState::UNINITIALIZED) {}

    // State queries
    DecoderState GetState() const { return m_state.load(); }
    bool IsState(DecoderState state) const { return m_state.load() == state; }
    const char* GetStateString() const { return StateToString(m_state.load()); }

    bool CanDecode() const {
        auto state = m_state.load();
        return state == DecoderState::READY ||
               state == DecoderState::BUFFERING ||
               state == DecoderState::DECODING ||
               state == DecoderState::FLUSHING;
    }

    // State transitions
    bool TransitionTo(DecoderState new_state) {
        // Validate and swap atomically (CAS loop) so a concurrent transition
        // cannot slip in between the validity check and the store.
        DecoderState expected = m_state.load();
        while (IsValidTransition(expected, new_state)) {
            if (m_state.compare_exchange_weak(expected, new_state)) {
                LOGF_DEBUG("[DecoderStateMachine] State transition: %s → %s",
                           StateToString(expected), StateToString(new_state));
                return true;
            }
            // compare_exchange_weak updated `expected`; re-validate and retry
        }
        LOGF_ERROR("[DecoderStateMachine] Invalid transition: %s → %s",
                   StateToString(expected), StateToString(new_state));
        return false;
    }

    // Specific transition helpers
    void OnInitializeSuccess() { TransitionTo(DecoderState::READY); }

    void OnFirstPacket() {
        if (IsState(DecoderState::READY)) {
            TransitionTo(DecoderState::BUFFERING);
        }
    }

    void OnBufferingComplete(size_t queue_size) {
        if (IsState(DecoderState::BUFFERING) && queue_size > 0) {
            TransitionTo(DecoderState::DECODING);
        }
    }

    void OnEndOfFile() {
        if (IsState(DecoderState::DECODING) || IsState(DecoderState::BUFFERING)) {
            TransitionTo(DecoderState::FLUSHING);
        }
    }

    void OnFlushComplete() {
        if (IsState(DecoderState::FLUSHING)) {
            TransitionTo(DecoderState::FLUSH_COMPLETE);
        }
    }

    void OnError() { TransitionTo(DecoderState::ERROR); }
    void OnReset() { TransitionTo(DecoderState::READY); }

private:
    std::atomic<DecoderState> m_state;

    bool IsValidTransition(DecoderState from, DecoderState to) const {
        // Define valid state transitions
        switch (from) {
        case DecoderState::UNINITIALIZED:
            return to == DecoderState::READY || to == DecoderState::ERROR;
        case DecoderState::READY:
            return to == DecoderState::BUFFERING || to == DecoderState::ERROR;
        case DecoderState::BUFFERING:
            return to == DecoderState::DECODING || to == DecoderState::FLUSHING ||
                   to == DecoderState::ERROR || to == DecoderState::READY;
        case DecoderState::DECODING:
            return to == DecoderState::FLUSHING || to == DecoderState::ERROR ||
                   to == DecoderState::READY;
        case DecoderState::FLUSHING:
            return to == DecoderState::FLUSH_COMPLETE || to == DecoderState::ERROR ||
                   to == DecoderState::READY;
        case DecoderState::FLUSH_COMPLETE:
            return to == DecoderState::READY || to == DecoderState::ERROR;
        case DecoderState::ERROR:
            return to == DecoderState::READY;
        default:
            return false;
        }
    }

    const char* StateToString(DecoderState state) const {
        switch (state) {
        case DecoderState::UNINITIALIZED:  return "UNINITIALIZED";
        case DecoderState::READY:          return "READY";
        case DecoderState::BUFFERING:      return "BUFFERING";
        case DecoderState::DECODING:       return "DECODING";
        case DecoderState::FLUSHING:       return "FLUSHING";
        case DecoderState::FLUSH_COMPLETE: return "FLUSH_COMPLETE";
        case DecoderState::ERROR:          return "ERROR";
        default:                           return "UNKNOWN";
        }
    }
};
```
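A quick usage sketch (not part of the decoder code) showing how the helper methods drive one full session through the legal transitions listed above:

```cpp
// Sketch: one decode session driven through the DecoderStateMachine above.
DecoderStateMachine sm;
sm.OnInitializeSuccess();                  // UNINITIALIZED → READY
sm.OnFirstPacket();                        // READY → BUFFERING
sm.OnBufferingComplete(/*queue_size=*/1);  // BUFFERING → DECODING
sm.OnEndOfFile();                          // DECODING → FLUSHING
sm.OnFlushComplete();                      // FLUSHING → FLUSH_COMPLETE
sm.OnReset();                              // FLUSH_COMPLETE → READY
```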
## Refactored DecodeToSurface()
### Before (Complex Branching):
```cpp
bool DecodeToSurface(...) {
    // Step 1: Check if initialized
    if (!m_initialized) { ... }

    // Handle NULL packet_data as flush mode
    if (!packet_data || packet_size == 0) {
        m_endOfFileReached = true;
    }

    // Step 2: Submit packet
    if (m_endOfFileReached) {
        // Flush mode logic
    } else {
        // Normal mode logic
    }

    // Step 3: Check initial buffering
    if (m_displayQueue.empty() && !m_initialBufferingComplete) {
        // Buffering logic
    }
    if (!m_displayQueue.empty() && !m_initialBufferingComplete) {
        m_initialBufferingComplete = true;
    }

    // Step 4: Pop from display queue
    if (m_displayQueue.empty()) {
        if (m_endOfFileReached) {
            // Flush complete logic
        } else {
            // Error - queue empty unexpectedly
        }
    }
    // ... (continues for 150 more lines)
}
```
### After (State Machine):
```cpp
bool DecodeToSurface(const uint8_t* packet_data, size_t packet_size,
                     VavCoreSurfaceType target_type,
                     void* target_surface,
                     VideoFrame& output_frame) {
    // State validation
    if (!m_stateMachine.CanDecode()) {
        LOGF_ERROR("[DecodeToSurface] Invalid state: %s",
                   m_stateMachine.GetStateString());
        return false;
    }

    // Handle end-of-file
    if (!packet_data || packet_size == 0) {
        return HandleFlushMode(output_frame);
    }

    // Delegate to state-specific handler
    switch (m_stateMachine.GetState()) {
    case DecoderState::READY:
    case DecoderState::BUFFERING:
        return HandleBufferingMode(packet_data, packet_size, target_type,
                                   target_surface, output_frame);
    case DecoderState::DECODING:
        return HandleDecodingMode(packet_data, packet_size, target_type,
                                  target_surface, output_frame);
    default:
        LOGF_ERROR("[DecodeToSurface] Unexpected state in DecodeToSurface");
        return false;
    }
}
```
### Helper Methods (State-Specific Logic):
```cpp
bool HandleBufferingMode(const uint8_t* packet_data, size_t packet_size,
                         VavCoreSurfaceType target_type,
                         void* target_surface,
                         VideoFrame& output_frame) {
    // Transition to buffering on first packet
    if (m_stateMachine.IsState(DecoderState::READY)) {
        m_stateMachine.OnFirstPacket();
    }

    // Submit packet to NVDEC
    if (!SubmitPacketToParser(packet_data, packet_size)) {
        return false;
    }

    // Check if buffering is complete
    {
        std::lock_guard<std::mutex> lock(m_displayMutex);
        if (m_displayQueue.empty()) {
            // Still buffering
            return false; // VAVCORE_PACKET_ACCEPTED
        }
        // Buffering complete - fall through to decode the first frame
        m_stateMachine.OnBufferingComplete(m_displayQueue.size());
    }

    return RetrieveAndRenderFrame(target_type, target_surface, output_frame);
}

bool HandleDecodingMode(const uint8_t* packet_data, size_t packet_size,
                        VavCoreSurfaceType target_type,
                        void* target_surface,
                        VideoFrame& output_frame) {
    // Submit packet to NVDEC
    if (!SubmitPacketToParser(packet_data, packet_size)) {
        return false;
    }

    // Retrieve and render frame
    return RetrieveAndRenderFrame(target_type, target_surface, output_frame);
}

bool HandleFlushMode(VideoFrame& output_frame) {
    // Transition to flushing if not already
    if (!m_stateMachine.IsState(DecoderState::FLUSHING)) {
        m_stateMachine.OnEndOfFile();
    }

    // Submit end-of-stream packet
    if (!SubmitFlushPacket()) {
        return false;
    }

    // Check if flush is complete
    {
        std::lock_guard<std::mutex> lock(m_displayMutex);
        if (m_displayQueue.empty()) {
            m_stateMachine.OnFlushComplete();
            return false; // VAVCORE_END_OF_STREAM
        }
    }

    // Still have frames to drain
    return RetrieveAndRenderFrame(...);  // target parameters elided in this sketch
}
```
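The `return false` comments above rely on the C API wrapper to translate the bool into a result code. A minimal sketch of that mapping, assuming the `VAVCORE_*` codes from the diffs below and a hypothetical `MapDecodeResult` helper (the wrapper's real structure is not part of this commit):

```cpp
// Hypothetical helper illustrating the bool → result-code mapping the
// comments above assume; end_of_file would come from the file reader's
// IsEndOfFile() check mentioned in the decoder diff.
VavCoreResult MapDecodeResult(bool frame_rendered, bool end_of_file) {
    if (frame_rendered) return VAVCORE_SUCCESS;        // frame copied to surface
    if (end_of_file)    return VAVCORE_END_OF_STREAM;  // flush drained the DPB
    return VAVCORE_PACKET_ACCEPTED;                    // packet buffered, no frame yet
}
```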
## Removed/Consolidated State Variables
### Before:
```cpp
// 13+ state variables
std::atomic<bool> m_initialBufferingComplete{false};
std::atomic<bool> m_endOfFileReached{false};
std::atomic<bool> m_converterNeedsReinit{false};
std::atomic<uint64_t> m_submissionCounter{0};
std::atomic<uint64_t> m_returnCounter{0};
std::atomic<bool> m_pollingRunning{false};
std::mutex m_frameQueueMutex;
std::mutex m_cudaContextMutex;
std::mutex m_submissionMutex;
std::mutex m_displayMutex;
std::queue<int> m_displayQueue;
FrameSlot m_frameSlots[16]; // Each has 5 atomic flags
```
### After:
```cpp
// Single state machine + minimal supporting variables
DecoderStateMachine m_stateMachine;
// Still needed (but usage clarified by state machine):
std::mutex m_displayMutex;
std::queue<int> m_displayQueue;
FrameSlot m_frameSlots[16]; // Frame-specific state (not global decoder state)
std::atomic<uint64_t> m_submissionCounter{0}; // Submission ordering
std::mutex m_submissionMutex;
```
**Eliminated:**
- `m_initialBufferingComplete` → Replaced by `DecoderState::BUFFERING` vs `DECODING`
- `m_endOfFileReached` → Replaced by `DecoderState::FLUSHING`
- `m_converterNeedsReinit` → Moved to NV12ToRGBAConverter internal state
## Benefits
### 1. Complexity Reduction
- **13+ state variables → 1 state machine** with 7 well-defined states
- **9+ conditional branches → State-driven dispatch** (1 switch statement)
- **~150 lines → ~40 lines** per state handler (modular functions)
### 2. Improved Maintainability
- **Clear state transitions** with validation (no illegal states)
- **State-specific logic** isolated in dedicated functions
- **Easy debugging** with state transition logging
### 3. Better Testability
- **Test individual states** independently
- **Verify state transitions** explicitly
- **Mock state machine** for unit tests
### 4. Enhanced Readability
- **Self-documenting code** (state names describe decoder status)
- **Linear flow** instead of nested conditions
- **Clear intent** from state-specific handler names
## Implementation Plan
### Phase 1: Create State Machine Class (CURRENT)
- [x] Design state machine enum and transitions
- [ ] Implement DecoderStateMachine class
- [ ] Add state transition logging
### Phase 2: Extract Helper Methods
- [ ] Create `SubmitPacketToParser()`
- [ ] Create `RetrieveAndRenderFrame()`
- [ ] Create `SubmitFlushPacket()` (sketched below)
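Of these, `SubmitFlushPacket()` is the most self-contained. A minimal sketch, assuming the `m_parser` handle and the `CUVID_PKT_ENDOFSTREAM` flag used in the NVDECAV1Decoder diff below (error handling abbreviated):

```cpp
bool NVDECAV1Decoder::SubmitFlushPacket() {
    // Drain the CUDA DPB by sending an empty end-of-stream packet,
    // mirroring the flush branch in the current DecodeToSurface().
    CUVIDSOURCEDATAPACKET packet = {};
    packet.payload = nullptr;
    packet.payload_size = 0;
    packet.flags = CUVID_PKT_ENDOFSTREAM;

    CUresult result = cuvidParseVideoData(m_parser, &packet);
    if (result != CUDA_SUCCESS) {
        LOGF_ERROR("[SubmitFlushPacket] cuvidParseVideoData failed: %d",
                   static_cast<int>(result));
        return false;
    }
    return true;
}
```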
### Phase 3: Refactor DecodeToSurface()
- [ ] Replace state flags with state machine
- [ ] Implement `HandleBufferingMode()`
- [ ] Implement `HandleDecodingMode()`
- [ ] Implement `HandleFlushMode()`
### Phase 4: Update Other Methods
- [ ] Update `Initialize()` → call `m_stateMachine.OnInitializeSuccess()`
- [ ] Update `Reset()` → call `m_stateMachine.OnReset()`
- [ ] Update `Cleanup()` → call `m_stateMachine.TransitionTo(UNINITIALIZED)` (requires allowing `* → UNINITIALIZED` in `IsValidTransition()`)
### Phase 5: Remove Obsolete State Variables
- [ ] Remove `m_initialBufferingComplete`
- [ ] Remove `m_endOfFileReached`
- [ ] Verify no regressions with existing tests
## Testing Strategy
### Unit Tests
- State transition validation (legal/illegal transitions; see the sketch after this list)
- State-specific handler behavior
- Error state recovery
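A minimal sketch of the transition-validation tests, assuming GoogleTest (the test framework is not named in this commit) and the `DecoderStateMachine` class above:

```cpp
#include <gtest/gtest.h>

TEST(DecoderStateMachineTest, RejectsIllegalTransitions) {
    DecoderStateMachine sm;  // starts in UNINITIALIZED
    EXPECT_FALSE(sm.TransitionTo(DecoderState::DECODING));        // not reachable yet
    EXPECT_TRUE(sm.TransitionTo(DecoderState::READY));            // Initialize() path
    EXPECT_FALSE(sm.TransitionTo(DecoderState::FLUSH_COMPLETE));  // READY can't skip ahead
}

TEST(DecoderStateMachineTest, RecoversFromErrorViaReset) {
    DecoderStateMachine sm;
    sm.OnError();   // any state → ERROR
    EXPECT_TRUE(sm.IsState(DecoderState::ERROR));
    sm.OnReset();   // ERROR → READY
    EXPECT_TRUE(sm.IsState(DecoderState::READY));
}
```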
### Integration Tests
- Full decode pipeline with state transitions
- Edge cases (empty files, flush mode, errors)
- Multi-threaded decoding with state machine
### Regression Tests
- Existing RedSurfaceNVDECTest
- Vav2PlayerHeadless tests
- Vav2Player GUI tests
---
**Status**: Design complete, implementation in progress
**Last Updated**: 2025-10-11

File diff suppressed because it is too large

View File

@@ -359,13 +359,19 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<PostBuildEvent>
<Command>echo Copying VavCore Debug DLL...
copy "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll" "$(LayoutDir)\VavCore-debug.dll"
echo DLL copy completed.</Command>
<Message>Copying VavCore-debug.dll to output directory</Message>
<Command>echo Copying VavCore Debug DLL to AppX directory...
echo Source: "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll"
echo Target: "$(LayoutDir)\VavCore-debug.dll"
copy /Y "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll" "$(LayoutDir)\VavCore-debug.dll"
if errorlevel 1 (
echo ERROR: Failed to copy VavCore-debug.dll
exit /b 1
)
echo DLL copy completed successfully.</Command>
<Message>Copying VavCore-debug.dll to AppX directory</Message>
</PostBuildEvent>
<PreBuildEvent>
<Command>del "$(LayoutDir)\VavCore-debug.dll"</Command>
<Command>if exist "$(LayoutDir)\VavCore-debug.dll" del "$(LayoutDir)\VavCore-debug.dll"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">

View File

@@ -102,40 +102,65 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
// Expected: VAVCORE_PACKET_ACCEPTED for first 16 frames
// No rendering during buffering phase
} else {
// Phase 2: Normal decoding with D3D12 surface (17th frame onwards)
ID3D12Resource* rgbaTexture = m_renderer->GetNextRGBATextureForCUDAInterop();
if (!rgbaTexture) {
LOGF_ERROR("[FrameProcessor] Failed to get next RGBA texture");
}
// Phase 2: Triple buffer filling (frames 16-18)
// Fill textures 0, 1, 2 before starting normal operation
else if (m_framesDecoded < 19) {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (!backend) {
LOGF_ERROR("[FrameProcessor] Failed to get RGBASurfaceBackend");
m_frameProcessing.store(false);
if (onComplete) onComplete(false);
return false;
}
ID3D12Resource* decodeTexture = backend->GetNextDecodeTexture();
int decodeIndex = backend->GetDecodeTextureIndex();
LOGF_INFO("[FrameProcessor] Triple buffer filling: frame %llu -> texture[%d]",
m_framesDecoded.load(), decodeIndex);
result = vavcore_decode_to_surface(
player,
VAVCORE_SURFACE_D3D12_RESOURCE,
rgbaTexture,
decodeTexture,
&vavFrame
);
// After successful decode, copy to staging texture for safe rendering
// After successful decode, advance decode index only (render index stays at 0)
if (result == VAVCORE_SUCCESS) {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (backend) {
HRESULT hr = backend->CopyToStagingTexture(rgbaTexture);
if (FAILED(hr)) {
LOGF_ERROR("[FrameProcessor] Failed to copy to staging texture: 0x%08X", hr);
} else {
// Wait for GPU copy to complete before proceeding
hr = backend->WaitForCopyCompletion();
if (FAILED(hr)) {
LOGF_ERROR("[FrameProcessor] Failed to wait for copy completion: 0x%08X", hr);
} else {
LOGF_INFO("[FrameProcessor] GPU copy completed, staging texture ready");
}
}
}
backend->AdvanceDecodeOnly();
LOGF_INFO("[FrameProcessor] Triple buffer filled: texture[%d] ready", decodeIndex);
}
}
// Phase 3: Normal operation (frame 19+)
// Render from current texture, decode into next texture
else {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (!backend) {
LOGF_ERROR("[FrameProcessor] Failed to get RGBASurfaceBackend");
m_frameProcessing.store(false);
if (onComplete) onComplete(false);
return false;
}
ID3D12Resource* decodeTexture = backend->GetNextDecodeTexture();
int decodeIndex = backend->GetDecodeTextureIndex();
int renderIndex = backend->GetRenderTextureIndex();
LOGF_DEBUG("[FrameProcessor] Normal operation: render[%d], decode[%d]",
renderIndex, decodeIndex);
result = vavcore_decode_to_surface(
player,
VAVCORE_SURFACE_D3D12_RESOURCE,
decodeTexture,
&vavFrame
);
// After successful decode, advance frame indices
if (result == VAVCORE_SUCCESS) {
backend->AdvanceFrame();
LOGF_DEBUG("[FrameProcessor] Frame advanced: render[%d]->render[%d]",
renderIndex, backend->GetRenderTextureIndex());
}
}
}
@@ -157,6 +182,10 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
// No frame is ready yet - VavCore will return it in a future call
LOGF_DEBUG("[FrameProcessor] PACKET ACCEPTED - Frame buffered in VavCore CUDA DPB (16-frame buffering)");
// CRITICAL: Increment m_framesDecoded for buffered packets
// This counter determines when we switch from NULL surface (buffering) to valid surface (rendering)
m_framesDecoded++;
// No action needed - just wait for next timing tick
// VavCore will return the buffered frame when ready
m_frameProcessing.store(false);

View File

@@ -268,13 +268,6 @@ ID3D12Resource* D3D12VideoRenderer::GetRGBATextureForCUDAInterop() const {
return nullptr;
}
ID3D12Resource* D3D12VideoRenderer::GetNextRGBATextureForCUDAInterop() {
if (m_rgbaSurfaceBackend) {
return m_rgbaSurfaceBackend->GetNextVideoTexture();
}
return nullptr;
}
uint8_t* D3D12VideoRenderer::GetYMappedBuffer(uint32_t bufferIndex) const {
if (m_yuv420pUploadBackend) {
return m_yuv420pUploadBackend->GetYMappedBuffer(bufferIndex);
@@ -496,6 +489,13 @@ IVideoBackend* D3D12VideoRenderer::SelectBackend(const VavCoreVideoFrame& frame)
}
HRESULT D3D12VideoRenderer::EnsureVideoTexture(const VavCoreVideoFrame& frame) {
// Skip if frame has invalid dimensions (can happen during CUDA DPB buffering)
if (frame.width == 0 || frame.height == 0) {
LOGF_DEBUG("[D3D12VideoRenderer] Skipping texture creation for invalid frame dimensions: %dx%d",
frame.width, frame.height);
return S_OK; // Not an error, just skip texture creation
}
// Check if we need to create/recreate video texture
if (m_videoWidth != (uint32_t)frame.width || m_videoHeight != (uint32_t)frame.height) {
IVideoBackend* backend = SelectBackend(frame);

View File

@@ -59,10 +59,9 @@ public:
// Backend-specific texture access for CUDA interop
ID3D12Resource* GetRGBATextureForCUDAInterop() const;
ID3D12Resource* GetNextRGBATextureForCUDAInterop(); // Rotates to next buffer for triple buffering
ID3D12Resource* GetNV12TextureForCUDAInterop() const { return nullptr; } // Future: NV12DirectBackend
// Get RGBASurfaceBackend for staging texture operations
// Get RGBASurfaceBackend for triple buffer management
RGBASurfaceBackend* GetRGBASurfaceBackend() const { return m_rgbaSurfaceBackend.get(); }
// Legacy YUV420P upload buffer access (for backward compatibility)

View File

@@ -54,21 +54,8 @@ void RGBASurfaceBackend::Shutdown() {
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
m_currentTextureIndex = 0;
// Release staging texture and copy command objects
m_copyCommandList.Reset();
m_copyCommandAllocator.Reset();
m_stagingTexture.Reset();
// Close fence event handle
if (m_copyFenceEvent != nullptr) {
CloseHandle(m_copyFenceEvent);
m_copyFenceEvent = nullptr;
}
// Release fence
m_copyFence.Reset();
m_renderTextureIndex = 0;
m_decodeTextureIndex = 0;
// Clear references (not owned)
m_device = nullptr;
@@ -78,15 +65,13 @@ void RGBASurfaceBackend::Shutdown() {
}
HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height) {
LOGF_INFO("[RGBASurfaceBackend] CreateVideoTexture called: %ux%u", width, height);
m_videoWidth = width;
m_videoHeight = height;
HRESULT hr = S_OK;
// Create RGBA texture descriptor for CUDA Surface Object write
// Format: DXGI_FORMAT_R8G8B8A8_UNORM (4 bytes per pixel)
// Flags: D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS (enables CUDA Surface Object creation)
// Layout: D3D12_TEXTURE_LAYOUT_UNKNOWN (tiled, CUDA Surface Objects handle this automatically)
D3D12_RESOURCE_DESC rgbaTextureDesc = {};
rgbaTextureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
rgbaTextureDesc.Width = width;
@@ -96,8 +81,8 @@ HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height)
rgbaTextureDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
rgbaTextureDesc.SampleDesc.Count = 1;
rgbaTextureDesc.SampleDesc.Quality = 0;
rgbaTextureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; // Tiled layout
rgbaTextureDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; // Enable CUDA write
rgbaTextureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
rgbaTextureDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
D3D12_HEAP_PROPERTIES defaultHeapProps = {};
defaultHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
@@ -108,110 +93,48 @@ HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height)
for (int i = 0; i < BUFFER_COUNT; i++) {
hr = m_device->CreateCommittedResource(
&defaultHeapProps,
D3D12_HEAP_FLAG_SHARED, // Required for CUDA interop
D3D12_HEAP_FLAG_SHARED,
&rgbaTextureDesc,
D3D12_RESOURCE_STATE_COMMON, // CUDA will transition as needed
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(&m_rgbaTextures[i])
);
if (FAILED(hr)) {
// Cleanup already created textures
LOGF_ERROR("[RGBASurfaceBackend] Failed to create RGBA texture[%d]: 0x%08X", i, hr);
for (int j = 0; j < i; j++) {
m_rgbaTextures[j].Reset();
}
return hr;
}
LOGF_INFO("[RGBASurfaceBackend] Created RGBA texture[%d]: %p", i, m_rgbaTextures[i].Get());
}
m_currentTextureIndex = 0;
// Triple buffer filling logic:
// - Frames 16-18 fill textures 0, 1, 2 (decode only, no rendering yet)
// - Frame 19+ normal operation (decode into different texture than render)
//
// Initial state for filling phase:
// - decodeIndex = 0 (will fill texture[0], then [1], then [2])
// - renderIndex = 2 (will render from texture[2] after filling completes)
//
// After filling completes (frame 18):
// - decodeIndex = 0 (wraps back after filling [2])
// - renderIndex = 2 (will render from texture[2] at frame 19)
// - Frame 19: render from [2], decode into [0] (no conflict!)
m_renderTextureIndex = 2;
m_decodeTextureIndex = 0;
// Create staging texture (same format, but no UAV flag - only for rendering)
D3D12_RESOURCE_DESC stagingTextureDesc = rgbaTextureDesc;
stagingTextureDesc.Flags = D3D12_RESOURCE_FLAG_NONE; // No CUDA access needed
LOGF_INFO("[RGBASurfaceBackend] All %d RGBA textures created successfully", BUFFER_COUNT);
hr = m_device->CreateCommittedResource(
&defaultHeapProps,
D3D12_HEAP_FLAG_NONE,
&stagingTextureDesc,
D3D12_RESOURCE_STATE_COPY_DEST, // Initial state for receiving copies
nullptr,
IID_PPV_ARGS(&m_stagingTexture)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create staging texture: 0x%08X", hr);
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
// Create fence for GPU copy synchronization
hr = m_device->CreateFence(
0,
D3D12_FENCE_FLAG_NONE,
IID_PPV_ARGS(&m_copyFence)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy fence: 0x%08X", hr);
m_stagingTexture.Reset();
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
// Create fence event for CPU wait
m_copyFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
if (m_copyFenceEvent == nullptr) {
hr = HRESULT_FROM_WIN32(GetLastError());
LOGF_ERROR("[RGBASurfaceBackend] Failed to create fence event: 0x%08X", hr);
m_copyFence.Reset();
m_stagingTexture.Reset();
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
LOGF_INFO("[RGBASurfaceBackend] Copy fence and event created successfully");
// Create command allocator and list for texture copy operations
hr = m_device->CreateCommandAllocator(
D3D12_COMMAND_LIST_TYPE_DIRECT,
IID_PPV_ARGS(&m_copyCommandAllocator)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy command allocator: 0x%08X", hr);
return hr;
}
hr = m_device->CreateCommandList(
0,
D3D12_COMMAND_LIST_TYPE_DIRECT,
m_copyCommandAllocator.Get(),
nullptr,
IID_PPV_ARGS(&m_copyCommandList)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy command list: 0x%08X", hr);
return hr;
}
// Close the command list (will be reset when needed)
m_copyCommandList->Close();
LOGF_INFO("[RGBASurfaceBackend] Created staging texture for safe rendering");
// Create SRV for RGBA texture
// Create SRV for rendering
hr = CreateSrvHeap();
if (FAILED(hr)) {
return hr;
}
// Update constant buffer with new aspect ratio
// Update constant buffer
hr = UpdateConstantBuffer();
if (FAILED(hr)) {
return hr;
@@ -226,8 +149,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
ID3D12GraphicsCommandList* commandList,
D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle)
{
// RGBASurfaceBackend doesn't need RTV (uses CopyResource)
(void)rtvHandle;
(void)rtvHandle; // RGBASurfaceBackend doesn't use external RTV
if (!m_initialized) {
return E_NOT_VALID_STATE;
}
@@ -236,17 +158,18 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
return E_INVALIDARG;
}
// Use staging texture for rendering (safe from decoder overwrites)
ID3D12Resource* renderTexture = m_stagingTexture.Get();
// Use current render texture (already decoded, safe to read)
ID3D12Resource* renderTexture = m_rgbaTextures[m_renderTextureIndex].Get();
if (!renderTexture) {
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: staging texture is NULL!");
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: render texture[%d] is NULL!", m_renderTextureIndex);
return E_INVALIDARG;
}
LOGF_DEBUG("[RGBASurfaceBackend] RenderToBackBuffer: using staging texture, ptr=%p", renderTexture);
LOGF_DEBUG("[RGBASurfaceBackend] RenderToBackBuffer: using texture[%d], ptr=%p",
m_renderTextureIndex, renderTexture);
// Staging texture is already in PIXEL_SHADER_RESOURCE state (set by CopyToStagingTexture)
// No barrier needed here
// Render texture is in COMMON state (CUDA managed)
// No barrier needed for reading in pixel shader
// Transition back buffer to render target
D3D12_RESOURCE_BARRIER barrierToRT = {};
@@ -258,13 +181,12 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
barrierToRT.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
commandList->ResourceBarrier(1, &barrierToRT);
// Create RTV for back buffer (not needed anymore - use rtvHandle from parameter)
// Create RTV for back buffer
D3D12_CPU_DESCRIPTOR_HANDLE backBufferRtvHandle;
D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
rtvDesc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
rtvDesc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
// Create temporary RTV heap for back buffer
ComPtr<ID3D12DescriptorHeap> rtvHeap;
D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {};
rtvHeapDesc.NumDescriptors = 1;
@@ -287,7 +209,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
ID3D12DescriptorHeap* heaps[] = { m_srvHeap.Get() };
commandList->SetDescriptorHeaps(1, heaps);
// Use descriptor for staging texture (index 0, the only descriptor)
// Use SRV for current render texture
CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetGPUDescriptorHandleForHeapStart());
commandList->SetGraphicsRootDescriptorTable(0, srvHandle);
commandList->SetGraphicsRootConstantBufferView(1, m_constantBuffer->GetGPUVirtualAddress());
@@ -314,7 +236,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
// Draw fullscreen quad
commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
commandList->DrawInstanced(6, 1, 0, 0); // Fullscreen quad (2 triangles)
commandList->DrawInstanced(6, 1, 0, 0);
// Transition back buffer to present
D3D12_RESOURCE_BARRIER barrierToPresent = {};
@@ -326,9 +248,6 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
barrierToPresent.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
commandList->ResourceBarrier(1, &barrierToPresent);
// Staging texture remains in PIXEL_SHADER_RESOURCE state (no transition needed)
// It will be transitioned back to COPY_DEST when CopyToStagingTexture is called next
return S_OK;
}
@@ -536,9 +455,9 @@ HRESULT RGBASurfaceBackend::CreatePipelineState() {
}
HRESULT RGBASurfaceBackend::CreateSrvHeap() {
// Create descriptor heap with 1 descriptor for staging texture
// Create descriptor heap with 1 descriptor for current render texture
D3D12_DESCRIPTOR_HEAP_DESC srvHeapDesc = {};
srvHeapDesc.NumDescriptors = 1; // Only need SRV for staging texture
srvHeapDesc.NumDescriptors = 1;
srvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
srvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
@@ -547,22 +466,13 @@ HRESULT RGBASurfaceBackend::CreateSrvHeap() {
return hr;
}
// Create SRV for staging texture (the only texture used for rendering)
D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
srvDesc.Texture2D.MipLevels = 1;
// Create initial SRV for texture[0] (m_renderTextureIndex = 0)
hr = UpdateSRVForCurrentRenderTexture();
if (FAILED(hr)) {
return hr;
}
CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetCPUDescriptorHandleForHeapStart());
m_device->CreateShaderResourceView(
m_stagingTexture.Get(),
&srvDesc,
srvHandle
);
LOGF_INFO("[RGBASurfaceBackend] Created SRV for staging texture");
LOGF_INFO("[RGBASurfaceBackend] Created SRV heap for render texture");
return S_OK;
}
@@ -605,138 +515,80 @@ HRESULT RGBASurfaceBackend::UpdateConstantBuffer() {
return S_OK;
}
ID3D12Resource* RGBASurfaceBackend::GetNextVideoTexture() {
// Rotate to next buffer index
int prevIndex = m_currentTextureIndex;
m_currentTextureIndex = (m_currentTextureIndex + 1) % BUFFER_COUNT;
LOGF_INFO("[RGBASurfaceBackend] GetNextVideoTexture: %d -> %d, texture=%p",
prevIndex, m_currentTextureIndex, m_rgbaTextures[m_currentTextureIndex].Get());
return m_rgbaTextures[m_currentTextureIndex].Get();
// Triple buffering management functions
ID3D12Resource* RGBASurfaceBackend::GetCurrentRenderTexture() const {
return m_rgbaTextures[m_renderTextureIndex].Get();
}
HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture) {
if (!m_initialized || !m_stagingTexture || !sourceTexture) {
ID3D12Resource* RGBASurfaceBackend::GetNextDecodeTexture() const {
return m_rgbaTextures[m_decodeTextureIndex].Get();
}
void RGBASurfaceBackend::AdvanceDecodeOnly() {
int prevDecode = m_decodeTextureIndex;
// Filling phase: Only advance decode index, render index stays at 0
// This is used during frames 16-18 to fill all three textures
m_decodeTextureIndex = (m_decodeTextureIndex + 1) % BUFFER_COUNT;
LOGF_INFO("[RGBASurfaceBackend] AdvanceDecodeOnly: decode %d->%d (render stays at %d)",
prevDecode, m_decodeTextureIndex, m_renderTextureIndex);
}
void RGBASurfaceBackend::AdvanceFrame() {
int prevRender = m_renderTextureIndex;
int prevDecode = m_decodeTextureIndex;
// Triple buffering advance logic:
// After Frame N decodes into decodeTexture, we want:
// - Render from the PREVIOUS frame's texture (N-1), not the current one!
// - Decode into the oldest texture (N-2)
//
// Current state: render=R, decode=D
// After decode completes: the old render texture becomes new decode target
// The old decode texture will be rendered NEXT frame (not this frame!)
//
// Example with 3 textures:
// State: render=2, decode=0
// - Frame 19 decodes into texture[0] (now contains frame 19)
// - Advance: render=2 (still showing frame 18!), decode=1
// - Frame 20 decodes into texture[1] (now contains frame 20)
// - Advance: render=0 (now showing frame 19), decode=2
//
// This ensures we NEVER render from a texture that was just written!
m_decodeTextureIndex = m_renderTextureIndex; // Old render texture becomes next decode target
m_renderTextureIndex = prevDecode; // Old decode texture becomes new render texture
LOGF_INFO("[RGBASurfaceBackend] AdvanceFrame: render %d->%d, decode %d->%d",
prevRender, m_renderTextureIndex, prevDecode, m_decodeTextureIndex);
// Update SRV to point to new render texture
HRESULT hr = UpdateSRVForCurrentRenderTexture();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to update SRV: 0x%08X", hr);
}
}
HRESULT RGBASurfaceBackend::UpdateSRVForCurrentRenderTexture() {
if (!m_srvHeap || !m_rgbaTextures[m_renderTextureIndex]) {
return E_NOT_VALID_STATE;
}
// Reset command allocator and list
HRESULT hr = m_copyCommandAllocator->Reset();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to reset copy command allocator: 0x%08X", hr);
return hr;
}
D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
srvDesc.Texture2D.MipLevels = 1;
hr = m_copyCommandList->Reset(m_copyCommandAllocator.Get(), nullptr);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to reset copy command list: 0x%08X", hr);
return hr;
}
CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetCPUDescriptorHandleForHeapStart());
// Transition source texture to COPY_SOURCE
D3D12_RESOURCE_BARRIER sourceBarrier = {};
sourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
sourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
sourceBarrier.Transition.pResource = sourceTexture;
sourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; // CUDA uses COMMON
sourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
sourceBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
// Transition staging texture to COPY_DEST (only if not first copy)
// First copy: staging texture already in COPY_DEST state (created with that state)
// Subsequent copies: staging texture in PIXEL_SHADER_RESOURCE state (from previous render)
if (m_firstCopy) {
// First copy: only transition source
m_copyCommandList->ResourceBarrier(1, &sourceBarrier);
m_firstCopy = false;
LOGF_DEBUG("[RGBASurfaceBackend] First copy: staging texture already in COPY_DEST state");
} else {
// Subsequent copies: transition both staging and source
D3D12_RESOURCE_BARRIER stagingToCopyDest = {};
stagingToCopyDest.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
stagingToCopyDest.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
stagingToCopyDest.Transition.pResource = m_stagingTexture.Get();
stagingToCopyDest.Transition.StateBefore = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
stagingToCopyDest.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
stagingToCopyDest.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
D3D12_RESOURCE_BARRIER barriers[] = { stagingToCopyDest, sourceBarrier };
m_copyCommandList->ResourceBarrier(2, barriers);
}
// Copy texture
m_copyCommandList->CopyResource(m_stagingTexture.Get(), sourceTexture);
// Transition source back to COMMON (for CUDA)
sourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
sourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON;
m_copyCommandList->ResourceBarrier(1, &sourceBarrier);
// Transition staging texture to PIXEL_SHADER_RESOURCE for rendering
D3D12_RESOURCE_BARRIER stagingBarrier = {};
stagingBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
stagingBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
stagingBarrier.Transition.pResource = m_stagingTexture.Get();
stagingBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
stagingBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
stagingBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
m_copyCommandList->ResourceBarrier(1, &stagingBarrier);
// Close command list
hr = m_copyCommandList->Close();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to close copy command list: 0x%08X", hr);
return hr;
}
// Execute command list
ID3D12CommandList* commandLists[] = { m_copyCommandList.Get() };
m_commandQueue->ExecuteCommandLists(1, commandLists);
// Signal fence after copy submission
m_copyFenceValue++;
hr = m_commandQueue->Signal(m_copyFence.Get(), m_copyFenceValue);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to signal copy fence: 0x%08X", hr);
return hr;
}
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy submitted (fence value: %llu)", m_copyFenceValue);
return S_OK;
}
HRESULT RGBASurfaceBackend::WaitForCopyCompletion() {
if (!m_copyFence || m_copyFenceEvent == nullptr) {
LOGF_ERROR("[RGBASurfaceBackend] Copy fence or event not initialized");
return E_NOT_VALID_STATE;
}
// Check if copy already completed
if (m_copyFence->GetCompletedValue() >= m_copyFenceValue) {
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy already complete (fence value: %llu)", m_copyFenceValue);
return S_OK; // Already complete
}
// Wait for GPU copy to complete
HRESULT hr = m_copyFence->SetEventOnCompletion(
m_copyFenceValue,
m_copyFenceEvent
m_device->CreateShaderResourceView(
m_rgbaTextures[m_renderTextureIndex].Get(),
&srvDesc,
srvHandle
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] SetEventOnCompletion failed: 0x%08X", hr);
return hr;
}
DWORD waitResult = WaitForSingleObject(m_copyFenceEvent, 5000); // 5 second timeout
if (waitResult != WAIT_OBJECT_0) {
LOGF_ERROR("[RGBASurfaceBackend] Wait failed or timed out: %lu", waitResult);
return E_FAIL;
}
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy completed (fence value: %llu)", m_copyFenceValue);
LOGF_DEBUG("[RGBASurfaceBackend] Updated SRV for render texture[%d]", m_renderTextureIndex);
return S_OK;
}

View File

@@ -39,24 +39,25 @@ public:
HRESULT CreateVideoTexture(uint32_t width, uint32_t height) override;
ID3D12Resource* GetVideoTexture() const override {
return m_rgbaTextures[m_currentTextureIndex].Get();
return m_rgbaTextures[m_renderTextureIndex].Get();
}
// Get next available texture for decoding (rotates buffer index)
ID3D12Resource* GetNextVideoTexture();
// Triple buffering management
// Get current texture being rendered to screen
ID3D12Resource* GetCurrentRenderTexture() const;
// Get current rendering texture index
int GetCurrentTextureIndex() const { return m_currentTextureIndex; }
// Get next texture for decoding (not currently being rendered)
ID3D12Resource* GetNextDecodeTexture() const;
// Staging texture management for safe rendering
// Copy decoder texture to stable staging texture
HRESULT CopyToStagingTexture(ID3D12Resource* sourceTexture);
// Advance frame: switch render/decode indices after decoding completes (normal operation)
void AdvanceFrame();
// Wait for GPU copy to complete
HRESULT WaitForCopyCompletion();
// Advance decode only: move to next decode texture without changing render index (filling phase)
void AdvanceDecodeOnly();
// Get stable staging texture for rendering (never overwritten by decoder)
ID3D12Resource* GetStagingTexture() const { return m_stagingTexture.Get(); }
// Get current indices for debugging
int GetRenderTextureIndex() const { return m_renderTextureIndex; }
int GetDecodeTextureIndex() const { return m_decodeTextureIndex; }
HRESULT RenderToBackBuffer(
const VavCoreVideoFrame& frame,
@@ -73,27 +74,19 @@ private:
ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_commandQueue = nullptr;
// RGBA video textures (triple buffering) - Decoder writes here
// RGBA video textures (triple buffering)
// Format: DXGI_FORMAT_R8G8B8A8_UNORM
// Flags: D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS (for CUDA Surface Object)
// Layout: D3D12_TEXTURE_LAYOUT_UNKNOWN (tiled, handled by CUDA Surface Objects)
//
// Triple buffering roles:
// - m_renderTextureIndex: Currently rendering to screen (safe to read)
// - m_decodeTextureIndex: Next target for decoding (safe to write)
// - Third texture: Idle, previously decoded (ready to become render texture)
static const int BUFFER_COUNT = 3;
ComPtr<ID3D12Resource> m_rgbaTextures[BUFFER_COUNT];
int m_currentTextureIndex = 0;
// Staging texture - Stable copy for rendering (decoder never touches this)
// This texture is copied from m_rgbaTextures at 30fps
// Renderer always reads from this texture (safe from race conditions)
ComPtr<ID3D12Resource> m_stagingTexture;
// Command allocator and list for async texture copy
ComPtr<ID3D12CommandAllocator> m_copyCommandAllocator;
ComPtr<ID3D12GraphicsCommandList> m_copyCommandList;
// GPU synchronization for copy operations
ComPtr<ID3D12Fence> m_copyFence;
UINT64 m_copyFenceValue = 0;
HANDLE m_copyFenceEvent = nullptr;
int m_renderTextureIndex = 0; // Texture currently being rendered
int m_decodeTextureIndex = 0; // Texture for next decode operation
// Graphics pipeline for simple RGBA texture sampling
ComPtr<ID3D12RootSignature> m_rootSignature;
@@ -122,7 +115,6 @@ private:
uint32_t m_height = 0; // Container height
uint32_t m_videoWidth = 0;
uint32_t m_videoHeight = 0;
bool m_firstCopy = true; // Track first copy to handle initial state
// Helper methods
HRESULT CreateGraphicsResources();
@@ -131,6 +123,7 @@ private:
HRESULT CreatePipelineState();
HRESULT CreateSrvHeap();
HRESULT UpdateConstantBuffer();
HRESULT UpdateSRVForCurrentRenderTexture();
};
} // namespace Vav2Player

View File

@@ -1374,12 +1374,19 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
slot.pts = timestamp;
slot.ready_for_display.store(true);
// Enqueue picture_index for display queue (for B-frame reordering)
// Enqueue DisplayQueueEntry with PTS for B-frame reordering
{
std::lock_guard<std::mutex> lock(decoder->m_displayMutex);
decoder->m_displayQueue.push(pic_idx);
LOGF_DEBUG("[HandlePictureDisplay] Pushed picture_index=%d (pts=%lld) to display queue (size: %zu)",
pic_idx, timestamp, decoder->m_displayQueue.size());
DisplayQueueEntry entry;
entry.frame_slot_index = pic_idx;
entry.pts = timestamp;
entry.submission_id = slot.submission_id;
decoder->m_displayQueue.push(entry);
LOGF_DEBUG("[HandlePictureDisplay] Pushed DisplayQueueEntry: slot=%d, pts=%lld, submission_id=%llu (queue size: %zu)",
pic_idx, timestamp, slot.submission_id, decoder->m_displayQueue.size());
}
return 1;
@@ -1572,11 +1579,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
VideoFrame& output_frame) {
LOGF_DEBUG("[DecodeToSurface] Called with target_type=%d", static_cast<int>(target_type));
if (!m_initialized || !packet_data) {
LOGF_ERROR("[DecodeToSurface] Not initialized or null packet_data");
if (!m_initialized) {
LOGF_ERROR("[DecodeToSurface] Not initialized");
return false;
}
// Handle NULL packet_data as flush mode (end of file reached)
if (!packet_data || packet_size == 0) {
LOGF_DEBUG("[DecodeToSurface] NULL packet - flush mode (end of file)");
m_state = DecoderState::FLUSHING;
}
// Set CUDA context for current thread
{
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
@@ -1617,13 +1630,25 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
// ===== Step 2: Submit packet to NVDEC parser =====
// This triggers HandlePictureDecode (if new frame) and HandlePictureDisplay (always)
CUVIDSOURCEDATAPACKET packet = {};
packet.payload = packet_data;
packet.payload_size = static_cast<unsigned long>(packet_size);
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = 0; // Not used - NVDEC parser overwrites this value
LOGF_INFO("[DecodeToSurface] Calling cuvidParseVideoData (submission_id=%llu)...",
my_submission_id);
// Handle flush mode (NULL packet)
if (m_state == DecoderState::FLUSHING) {
// Flush mode: send end-of-stream packet to drain CUDA DPB
packet.flags = CUVID_PKT_ENDOFSTREAM;
packet.payload = nullptr;
packet.payload_size = 0;
LOGF_INFO("[DecodeToSurface] Flush mode: sending ENDOFSTREAM packet (submission_id=%llu)",
my_submission_id);
} else {
// Normal mode: send actual packet data
packet.payload = packet_data;
packet.payload_size = static_cast<unsigned long>(packet_size);
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = 0; // Not used - NVDEC parser overwrites this value
LOGF_INFO("[DecodeToSurface] Normal mode: calling cuvidParseVideoData (submission_id=%llu)...",
my_submission_id);
}
CUresult result = cuvidParseVideoData(m_parser, &packet);
// cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return
@@ -1647,33 +1672,63 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
{
std::lock_guard<std::mutex> lock(m_displayMutex);
// During initial buffering, accept packets until display queue has frames
if (m_displayQueue.empty() && !m_initialBufferingComplete) {
LOGF_DEBUG("[DecodeToSurface] PACKET ACCEPTED - Initial buffering (queue size: 0)");
return VAVCORE_PACKET_ACCEPTED;
// Transition from READY to BUFFERING on first packet
if (m_state == DecoderState::READY && m_displayQueue.empty()) {
m_state = DecoderState::BUFFERING;
LOGF_DEBUG("[DecodeToSurface] State transition: READY → BUFFERING");
}
// Once we have frames in queue, mark buffering complete
if (!m_displayQueue.empty() && !m_initialBufferingComplete) {
m_initialBufferingComplete = true;
LOGF_INFO("[DecodeToSurface] Initial buffering complete, queue size: %zu", m_displayQueue.size());
// During initial buffering, accept packets until display queue has frames
if (m_displayQueue.empty() && m_state == DecoderState::BUFFERING) {
LOGF_DEBUG("[DecodeToSurface] PACKET ACCEPTED - Initial buffering (queue size: 0)");
// Return false to indicate no frame yet (still buffering)
// The C API wrapper will convert this to VAVCORE_PACKET_ACCEPTED
return false;
}
// Once we have frames in queue, transition to DECODING
if (!m_displayQueue.empty() && m_state == DecoderState::BUFFERING) {
m_state = DecoderState::DECODING;
LOGF_INFO("[DecodeToSurface] State transition: BUFFERING → DECODING (queue size: %zu)", m_displayQueue.size());
}
}
// ===== Step 4: Pop from display queue to get picture_index =====
// ===== Step 4: Pop from display queue to get picture_index (PTS-ordered) =====
DisplayQueueEntry entry;
int pic_idx = -1;
{
std::lock_guard<std::mutex> lock(m_displayMutex);
if (m_displayQueue.empty()) {
LOGF_ERROR("[DecodeToSurface] Display queue EMPTY after buffering complete (SHOULD NOT HAPPEN!)");
return false;
// Check if we're in flush mode
if (m_state == DecoderState::FLUSHING) {
// Flush mode: no more frames in CUDA DPB
// Return false to indicate no frame, caller will check end-of-stream
LOGF_INFO("[DecodeToSurface] Flush complete: all frames drained from CUDA DPB");
// Release pending submission before returning
{
std::lock_guard<std::mutex> lock2(m_submissionMutex);
m_pendingSubmissions[pending_idx].in_use.store(false);
}
// Return false - the C API wrapper will convert this to VAVCORE_END_OF_STREAM
// when combined with file reader's IsEndOfFile() check
return false;
} else {
// Normal mode: queue empty unexpectedly
LOGF_ERROR("[DecodeToSurface] Display queue EMPTY after buffering complete (SHOULD NOT HAPPEN!)");
return false;
}
}
pic_idx = m_displayQueue.front();
// Pop from priority queue (PTS-ordered)
entry = m_displayQueue.top();
m_displayQueue.pop();
LOGF_INFO("[DecodeToSurface] Popped picture_index=%d from display queue (queue size now: %zu)",
pic_idx, m_displayQueue.size());
pic_idx = entry.frame_slot_index;
LOGF_INFO("[DecodeToSurface] Popped DisplayQueueEntry: slot=%d, pts=%lld, submission_id=%llu (queue size now: %zu)",
pic_idx, entry.pts, entry.submission_id, m_displayQueue.size());
}
if (pic_idx < 0 || pic_idx >= RING_BUFFER_SIZE) {
@@ -1697,21 +1752,37 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
LOGF_DEBUG("[DecodeToSurface] Frame slot %d ready for display", pic_idx);
// ===== Step 6: Copy from CUDA DPB to target surface =====
if (!CopyFromCUDADPB(pic_idx, target_type, target_surface, output_frame)) {
// ===== Step 6: Update target_surface for this frame =====
// CRITICAL: SwapChain provides different target_surface each frame!
// Always update slot.target_surface to current one.
LOGF_DEBUG("[DecodeToSurface] Updating target_surface: %p -> %p (pic_idx=%d)",
slot.target_surface, target_surface, pic_idx);
if (target_surface == nullptr) {
LOGF_ERROR("[DecodeToSurface] ERROR: target_surface is NULL for pic_idx=%d", pic_idx);
return false;
}
// Always update to current target_surface (SwapChain back buffer changes each frame)
slot.target_surface = target_surface;
slot.surface_type = target_type;
// ===== Step 7: Copy from CUDA DPB to target surface =====
// Now use slot.target_surface which is guaranteed to be valid (either from decode or late binding)
if (!CopyFromCUDADPB(pic_idx, slot.surface_type, slot.target_surface, output_frame)) {
LOGF_ERROR("[DecodeToSurface] CopyFromCUDADPB failed for picture_index=%d", pic_idx);
return false;
}
LOGF_INFO("[DecodeToSurface] SUCCESS - Frame rendered from CUDA DPB (pic_idx=%d)", pic_idx);
// ===== Step 7: Mark slot as reusable =====
// ===== Step 8: Mark slot as reusable =====
slot.ready_for_display.store(false);
slot.in_use.store(false);
LOGF_DEBUG("[DecodeToSurface] Released frame slot %d", pic_idx);
// ===== Step 8: Release pending submission =====
// ===== Step 9: Release pending submission =====
{
std::lock_guard<std::mutex> lock(m_submissionMutex);
m_pendingSubmissions[pending_idx].in_use.store(false);

View File

@@ -102,6 +102,14 @@ protected:
void LogCUDAError(CUresult result, const std::string& operation) const;
private:
// Decoder state enum (simple inline approach)
enum class DecoderState {
READY, // Initialized and ready for first packet
BUFFERING, // Initial buffering (0-15 frames)
DECODING, // Normal frame-by-frame decoding
FLUSHING // End-of-file reached, draining DPB
};
// CUDA and NVDEC objects
CUcontext m_cuContext = nullptr;
CUdevice m_cudaDevice = 0;
@@ -224,7 +232,23 @@ private:
void PollingThreadFunc(); // Polling thread function
// Display-only packet handling (B-frame reordering)
std::queue<int> m_displayQueue; // Queue of picture_index from HandlePictureDisplay
// DisplayQueueEntry: Frame information for PTS-based reordering
struct DisplayQueueEntry {
int frame_slot_index; // FrameSlot index in m_frameSlots[]
int64_t pts; // Presentation timestamp
uint64_t submission_id; // Original submission order
};
// PTSComparator: PTS ascending order (Min-heap for earliest PTS first)
struct PTSComparator {
bool operator()(const DisplayQueueEntry& a, const DisplayQueueEntry& b) const {
return a.pts > b.pts; // Min-heap: smallest PTS has highest priority
}
};
std::priority_queue<DisplayQueueEntry,
std::vector<DisplayQueueEntry>,
PTSComparator> m_displayQueue; // PTS-based priority queue
std::mutex m_displayMutex;
// Helper methods
@@ -245,8 +269,8 @@ private:
bool CopyFromCUDADPB(int pic_idx, VavCoreSurfaceType target_type,
void* target_surface, VideoFrame& output_frame);
// Initial buffering state
std::atomic<bool> m_initialBufferingComplete{false};
// Decoder state (replaces m_initialBufferingComplete and m_endOfFileReached)
DecoderState m_state = DecoderState::READY;
// NV12ToRGBAConverter reinitialization flag (set by HandleVideoSequence)
std::atomic<bool> m_converterNeedsReinit{false};
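
As a sanity check on the comparator above, a self-contained sketch (structs duplicated here purely for illustration) showing that entries pushed out of order pop in ascending PTS order:

```cpp
#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

struct DisplayQueueEntry { int frame_slot_index; int64_t pts; uint64_t submission_id; };
struct PTSComparator {
    bool operator()(const DisplayQueueEntry& a, const DisplayQueueEntry& b) const {
        return a.pts > b.pts;  // min-heap: smallest PTS has highest priority
    }
};

int main() {
    std::priority_queue<DisplayQueueEntry, std::vector<DisplayQueueEntry>,
                        PTSComparator> q;
    for (int64_t pts : {3000LL, 1000LL, 2000LL}) {
        q.push({/*frame_slot_index=*/0, pts, /*submission_id=*/0});
    }
    while (!q.empty()) {
        std::printf("pts=%lld\n", static_cast<long long>(q.top().pts));  // 1000, 2000, 3000
        q.pop();
    }
    return 0;
}
```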