2025-10-11 02:08:57 +09:00
parent 9a7330d5fb
commit ff6b753dfe
10 changed files with 1886 additions and 357 deletions

View File

@@ -0,0 +1,416 @@
# NVDEC Decoder State Machine Refactoring Design
## Problem Statement
The current `NVDECAV1Decoder::DecodeToSurface()` has excessive complexity:
- **13+ state variables** tracked across multiple atomic flags and mutexes
- **9+ conditional branches** with nested conditions
- **~150 lines** in a single function
- **High branch complexity**: 9 independent conditions yield up to 2^9 = 512 possible execution paths
This makes the code:
- Hard to maintain and debug
- Difficult to test comprehensively
- Prone to race conditions and edge cases
- Challenging to extend with new features
## Solution: State Machine Pattern
### Core Design Principle
**Consolidate all decoder state into a single enum** with clear transitions, replacing scattered atomic flags and conditional checks.
### State Machine States
```cpp
enum class DecoderState {
    UNINITIALIZED,   // Before Initialize() is called
    READY,           // Initialized and ready for decoding
    BUFFERING,       // Initial buffering (0-15 frames)
    DECODING,        // Normal frame-by-frame decoding
    FLUSHING,        // End-of-file reached, draining DPB
    FLUSH_COMPLETE,  // All frames drained
    ERROR            // Unrecoverable error state
};
```
### State Transitions
```
UNINITIALIZED  → READY          (Initialize() called successfully)
READY          → BUFFERING      (First DecodeToSurface() call)
BUFFERING      → DECODING       (Display queue has frames)
DECODING       → FLUSHING       (End-of-file reached, NULL packet)
FLUSHING       → FLUSH_COMPLETE (Display queue empty)
FLUSH_COMPLETE → READY          (Reset() called)
*              → ERROR          (Any state can transition to ERROR on failure)
ERROR          → READY          (Reset() called)
```
### State Machine Class
```cpp
class DecoderStateMachine {
public:
    DecoderStateMachine() : m_state(DecoderState::UNINITIALIZED) {}

    // State queries
    DecoderState GetState() const { return m_state.load(); }
    bool IsState(DecoderState state) const { return m_state.load() == state; }
    const char* GetStateString() const { return StateToString(m_state.load()); }

    bool CanDecode() const {
        auto state = m_state.load();
        return state == DecoderState::READY ||
               state == DecoderState::BUFFERING ||
               state == DecoderState::DECODING ||
               state == DecoderState::FLUSHING;
    }

    // State transitions
    bool TransitionTo(DecoderState new_state) {
        // Validate and swap atomically (CAS loop) so a concurrent transition
        // cannot slip in between the validity check and the store.
        DecoderState expected = m_state.load();
        while (IsValidTransition(expected, new_state)) {
            if (m_state.compare_exchange_weak(expected, new_state)) {
                LOGF_DEBUG("[DecoderStateMachine] State transition: %s → %s",
                           StateToString(expected), StateToString(new_state));
                return true;
            }
            // compare_exchange_weak updated `expected`; re-validate and retry
        }
        LOGF_ERROR("[DecoderStateMachine] Invalid transition: %s → %s",
                   StateToString(expected), StateToString(new_state));
        return false;
    }

    // Specific transition helpers
    void OnInitializeSuccess() { TransitionTo(DecoderState::READY); }

    void OnFirstPacket() {
        if (IsState(DecoderState::READY)) {
            TransitionTo(DecoderState::BUFFERING);
        }
    }

    void OnBufferingComplete(size_t queue_size) {
        if (IsState(DecoderState::BUFFERING) && queue_size > 0) {
            TransitionTo(DecoderState::DECODING);
        }
    }

    void OnEndOfFile() {
        if (IsState(DecoderState::DECODING) || IsState(DecoderState::BUFFERING)) {
            TransitionTo(DecoderState::FLUSHING);
        }
    }

    void OnFlushComplete() {
        if (IsState(DecoderState::FLUSHING)) {
            TransitionTo(DecoderState::FLUSH_COMPLETE);
        }
    }

    void OnError() { TransitionTo(DecoderState::ERROR); }
    void OnReset() { TransitionTo(DecoderState::READY); }

private:
    std::atomic<DecoderState> m_state;

    bool IsValidTransition(DecoderState from, DecoderState to) const {
        // Define valid state transitions
        switch (from) {
        case DecoderState::UNINITIALIZED:
            return to == DecoderState::READY || to == DecoderState::ERROR;
        case DecoderState::READY:
            return to == DecoderState::BUFFERING || to == DecoderState::ERROR;
        case DecoderState::BUFFERING:
            return to == DecoderState::DECODING || to == DecoderState::FLUSHING ||
                   to == DecoderState::ERROR || to == DecoderState::READY;
        case DecoderState::DECODING:
            return to == DecoderState::FLUSHING || to == DecoderState::ERROR ||
                   to == DecoderState::READY;
        case DecoderState::FLUSHING:
            return to == DecoderState::FLUSH_COMPLETE || to == DecoderState::ERROR ||
                   to == DecoderState::READY;
        case DecoderState::FLUSH_COMPLETE:
            return to == DecoderState::READY || to == DecoderState::ERROR;
        case DecoderState::ERROR:
            return to == DecoderState::READY;
        default:
            return false;
        }
    }

    const char* StateToString(DecoderState state) const {
        switch (state) {
        case DecoderState::UNINITIALIZED:  return "UNINITIALIZED";
        case DecoderState::READY:          return "READY";
        case DecoderState::BUFFERING:      return "BUFFERING";
        case DecoderState::DECODING:       return "DECODING";
        case DecoderState::FLUSHING:       return "FLUSHING";
        case DecoderState::FLUSH_COMPLETE: return "FLUSH_COMPLETE";
        case DecoderState::ERROR:          return "ERROR";
        default:                           return "UNKNOWN";
        }
    }
};
```
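A quick usage sketch (not part of the decoder code) showing how the helper methods drive one full session through the legal transitions listed above:

```cpp
// Sketch: one decode session driven through the DecoderStateMachine above.
DecoderStateMachine sm;
sm.OnInitializeSuccess();                  // UNINITIALIZED → READY
sm.OnFirstPacket();                        // READY → BUFFERING
sm.OnBufferingComplete(/*queue_size=*/1);  // BUFFERING → DECODING
sm.OnEndOfFile();                          // DECODING → FLUSHING
sm.OnFlushComplete();                      // FLUSHING → FLUSH_COMPLETE
sm.OnReset();                              // FLUSH_COMPLETE → READY
```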
## Refactored DecodeToSurface()
### Before (Complex Branching):
```cpp
bool DecodeToSurface(...) {
    // Step 1: Check if initialized
    if (!m_initialized) { ... }

    // Handle NULL packet_data as flush mode
    if (!packet_data || packet_size == 0) {
        m_endOfFileReached = true;
    }

    // Step 2: Submit packet
    if (m_endOfFileReached) {
        // Flush mode logic
    } else {
        // Normal mode logic
    }

    // Step 3: Check initial buffering
    if (m_displayQueue.empty() && !m_initialBufferingComplete) {
        // Buffering logic
    }
    if (!m_displayQueue.empty() && !m_initialBufferingComplete) {
        m_initialBufferingComplete = true;
    }

    // Step 4: Pop from display queue
    if (m_displayQueue.empty()) {
        if (m_endOfFileReached) {
            // Flush complete logic
        } else {
            // Error - queue empty unexpectedly
        }
    }
    // ... (continues for 150 more lines)
}
```
### After (State Machine):
```cpp
bool DecodeToSurface(const uint8_t* packet_data, size_t packet_size,
                     VavCoreSurfaceType target_type,
                     void* target_surface,
                     VideoFrame& output_frame) {
    // State validation
    if (!m_stateMachine.CanDecode()) {
        LOGF_ERROR("[DecodeToSurface] Invalid state: %s",
                   m_stateMachine.GetStateString());
        return false;
    }

    // Handle end-of-file
    if (!packet_data || packet_size == 0) {
        return HandleFlushMode(output_frame);
    }

    // Delegate to state-specific handler
    switch (m_stateMachine.GetState()) {
    case DecoderState::READY:
    case DecoderState::BUFFERING:
        return HandleBufferingMode(packet_data, packet_size, target_type,
                                   target_surface, output_frame);
    case DecoderState::DECODING:
        return HandleDecodingMode(packet_data, packet_size, target_type,
                                  target_surface, output_frame);
    default:
        LOGF_ERROR("[DecodeToSurface] Unexpected state in DecodeToSurface");
        return false;
    }
}
```
### Helper Methods (State-Specific Logic):
```cpp
bool HandleBufferingMode(const uint8_t* packet_data, size_t packet_size,
                         VavCoreSurfaceType target_type,
                         void* target_surface,
                         VideoFrame& output_frame) {
    // Transition to buffering on first packet
    if (m_stateMachine.IsState(DecoderState::READY)) {
        m_stateMachine.OnFirstPacket();
    }

    // Submit packet to NVDEC
    if (!SubmitPacketToParser(packet_data, packet_size)) {
        return false;
    }

    // Check if buffering is complete
    {
        std::lock_guard<std::mutex> lock(m_displayMutex);
        if (m_displayQueue.empty()) {
            // Still buffering
            return false; // VAVCORE_PACKET_ACCEPTED
        }
        // Buffering complete - fall through to decode the first frame
        m_stateMachine.OnBufferingComplete(m_displayQueue.size());
    }

    return RetrieveAndRenderFrame(target_type, target_surface, output_frame);
}

bool HandleDecodingMode(const uint8_t* packet_data, size_t packet_size,
                        VavCoreSurfaceType target_type,
                        void* target_surface,
                        VideoFrame& output_frame) {
    // Submit packet to NVDEC
    if (!SubmitPacketToParser(packet_data, packet_size)) {
        return false;
    }

    // Retrieve and render frame
    return RetrieveAndRenderFrame(target_type, target_surface, output_frame);
}

bool HandleFlushMode(VideoFrame& output_frame) {
    // Transition to flushing if not already
    if (!m_stateMachine.IsState(DecoderState::FLUSHING)) {
        m_stateMachine.OnEndOfFile();
    }

    // Submit end-of-stream packet
    if (!SubmitFlushPacket()) {
        return false;
    }

    // Check if flush is complete
    {
        std::lock_guard<std::mutex> lock(m_displayMutex);
        if (m_displayQueue.empty()) {
            m_stateMachine.OnFlushComplete();
            return false; // VAVCORE_END_OF_STREAM
        }
    }

    // Still have frames to drain
    return RetrieveAndRenderFrame(...);  // target parameters elided in this sketch
}
```
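The `return false` comments above rely on the C API wrapper to translate the bool into a result code. A minimal sketch of that mapping, assuming the `VAVCORE_*` codes from the diffs below and a hypothetical `MapDecodeResult` helper (the wrapper's real structure is not part of this commit):

```cpp
// Hypothetical helper illustrating the bool → result-code mapping the
// comments above assume; end_of_file would come from the file reader's
// IsEndOfFile() check mentioned in the decoder diff.
VavCoreResult MapDecodeResult(bool frame_rendered, bool end_of_file) {
    if (frame_rendered) return VAVCORE_SUCCESS;        // frame copied to surface
    if (end_of_file)    return VAVCORE_END_OF_STREAM;  // flush drained the DPB
    return VAVCORE_PACKET_ACCEPTED;                    // packet buffered, no frame yet
}
```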
## Removed/Consolidated State Variables
### Before:
```cpp
// 13+ state variables
std::atomic<bool> m_initialBufferingComplete{false};
std::atomic<bool> m_endOfFileReached{false};
std::atomic<bool> m_converterNeedsReinit{false};
std::atomic<uint64_t> m_submissionCounter{0};
std::atomic<uint64_t> m_returnCounter{0};
std::atomic<bool> m_pollingRunning{false};
std::mutex m_frameQueueMutex;
std::mutex m_cudaContextMutex;
std::mutex m_submissionMutex;
std::mutex m_displayMutex;
std::queue<int> m_displayQueue;
FrameSlot m_frameSlots[16]; // Each has 5 atomic flags
```
### After:
```cpp
// Single state machine + minimal supporting variables
DecoderStateMachine m_stateMachine;
// Still needed (but usage clarified by state machine):
std::mutex m_displayMutex;
std::queue<int> m_displayQueue;
FrameSlot m_frameSlots[16]; // Frame-specific state (not global decoder state)
std::atomic<uint64_t> m_submissionCounter{0}; // Submission ordering
std::mutex m_submissionMutex;
```
**Eliminated:**
- `m_initialBufferingComplete` → Replaced by `DecoderState::BUFFERING` vs `DECODING`
- `m_endOfFileReached` → Replaced by `DecoderState::FLUSHING`
- `m_converterNeedsReinit` → Moved to NV12ToRGBAConverter internal state
## Benefits
### 1. Complexity Reduction
- **13+ state variables → 1 state machine** with 7 well-defined states
- **9+ conditional branches → State-driven dispatch** (1 switch statement)
- **~150 lines → ~40 lines** per state handler (modular functions)
### 2. Improved Maintainability
- **Clear state transitions** with validation (no illegal states)
- **State-specific logic** isolated in dedicated functions
- **Easy debugging** with state transition logging
### 3. Better Testability
- **Test individual states** independently
- **Verify state transitions** explicitly
- **Mock state machine** for unit tests
### 4. Enhanced Readability
- **Self-documenting code** (state names describe decoder status)
- **Linear flow** instead of nested conditions
- **Clear intent** from state-specific handler names
## Implementation Plan
### Phase 1: Create State Machine Class (CURRENT)
- [x] Design state machine enum and transitions
- [ ] Implement DecoderStateMachine class
- [ ] Add state transition logging
### Phase 2: Extract Helper Methods
- [ ] Create `SubmitPacketToParser()`
- [ ] Create `RetrieveAndRenderFrame()`
- [ ] Create `SubmitFlushPacket()` (sketched below)
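Of these, `SubmitFlushPacket()` is the most self-contained. A minimal sketch, assuming the `m_parser` handle and the `CUVID_PKT_ENDOFSTREAM` flag used in the NVDECAV1Decoder diff below (error handling abbreviated):

```cpp
bool NVDECAV1Decoder::SubmitFlushPacket() {
    // Drain the CUDA DPB by sending an empty end-of-stream packet,
    // mirroring the flush branch in the current DecodeToSurface().
    CUVIDSOURCEDATAPACKET packet = {};
    packet.payload = nullptr;
    packet.payload_size = 0;
    packet.flags = CUVID_PKT_ENDOFSTREAM;

    CUresult result = cuvidParseVideoData(m_parser, &packet);
    if (result != CUDA_SUCCESS) {
        LOGF_ERROR("[SubmitFlushPacket] cuvidParseVideoData failed: %d",
                   static_cast<int>(result));
        return false;
    }
    return true;
}
```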
### Phase 3: Refactor DecodeToSurface()
- [ ] Replace state flags with state machine
- [ ] Implement `HandleBufferingMode()`
- [ ] Implement `HandleDecodingMode()`
- [ ] Implement `HandleFlushMode()`
### Phase 4: Update Other Methods
- [ ] Update `Initialize()` → call `m_stateMachine.OnInitializeSuccess()`
- [ ] Update `Reset()` → call `m_stateMachine.OnReset()`
- [ ] Update `Cleanup()` → call `m_stateMachine.TransitionTo(UNINITIALIZED)` (requires allowing `* → UNINITIALIZED` in `IsValidTransition()`)
### Phase 5: Remove Obsolete State Variables
- [ ] Remove `m_initialBufferingComplete`
- [ ] Remove `m_endOfFileReached`
- [ ] Verify no regressions with existing tests
## Testing Strategy
### Unit Tests
- State transition validation (legal/illegal transitions; see the sketch after this list)
- State-specific handler behavior
- Error state recovery
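A minimal sketch of the transition-validation tests, assuming GoogleTest (the test framework is not named in this commit) and the `DecoderStateMachine` class above:

```cpp
#include <gtest/gtest.h>

TEST(DecoderStateMachineTest, RejectsIllegalTransitions) {
    DecoderStateMachine sm;  // starts in UNINITIALIZED
    EXPECT_FALSE(sm.TransitionTo(DecoderState::DECODING));        // not reachable yet
    EXPECT_TRUE(sm.TransitionTo(DecoderState::READY));            // Initialize() path
    EXPECT_FALSE(sm.TransitionTo(DecoderState::FLUSH_COMPLETE));  // READY can't skip ahead
}

TEST(DecoderStateMachineTest, RecoversFromErrorViaReset) {
    DecoderStateMachine sm;
    sm.OnError();   // any state → ERROR
    EXPECT_TRUE(sm.IsState(DecoderState::ERROR));
    sm.OnReset();   // ERROR → READY
    EXPECT_TRUE(sm.IsState(DecoderState::READY));
}
```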
### Integration Tests
- Full decode pipeline with state transitions
- Edge cases (empty files, flush mode, errors)
- Multi-threaded decoding with state machine
### Regression Tests
- Existing RedSurfaceNVDECTest
- Vav2PlayerHeadless tests
- Vav2Player GUI tests
---
**Status**: Design complete, implementation in progress
**Last Updated**: 2025-10-11

File diff suppressed because it is too large

View File

@@ -359,13 +359,19 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<PostBuildEvent>
<Command>echo Copying VavCore Debug DLL...
copy "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll" "$(LayoutDir)\VavCore-debug.dll"
echo DLL copy completed.</Command>
<Message>Copying VavCore-debug.dll to output directory</Message>
<Command>echo Copying VavCore Debug DLL to AppX directory...
echo Source: "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll"
echo Target: "$(LayoutDir)\VavCore-debug.dll"
copy /Y "$(ProjectDir)..\..\..\vavcore\lib\VavCore-debug.dll" "$(LayoutDir)\VavCore-debug.dll"
if errorlevel 1 (
echo ERROR: Failed to copy VavCore-debug.dll
exit /b 1
)
echo DLL copy completed successfully.</Command>
<Message>Copying VavCore-debug.dll to AppX directory</Message>
</PostBuildEvent>
<PreBuildEvent>
<Command>del "$(LayoutDir)\VavCore-debug.dll"</Command>
<Command>if exist "$(LayoutDir)\VavCore-debug.dll" del "$(LayoutDir)\VavCore-debug.dll"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">

View File

@@ -102,40 +102,65 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
// Expected: VAVCORE_PACKET_ACCEPTED for first 16 frames
// No rendering during buffering phase
} else {
// Phase 2: Normal decoding with D3D12 surface (17th frame onwards)
ID3D12Resource* rgbaTexture = m_renderer->GetNextRGBATextureForCUDAInterop();
if (!rgbaTexture) {
LOGF_ERROR("[FrameProcessor] Failed to get next RGBA texture");
}
// Phase 2: Triple buffer filling (frames 16-18)
// Fill textures 0, 1, 2 before starting normal operation
else if (m_framesDecoded < 19) {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (!backend) {
LOGF_ERROR("[FrameProcessor] Failed to get RGBASurfaceBackend");
m_frameProcessing.store(false);
if (onComplete) onComplete(false);
return false;
}
ID3D12Resource* decodeTexture = backend->GetNextDecodeTexture();
int decodeIndex = backend->GetDecodeTextureIndex();
LOGF_INFO("[FrameProcessor] Triple buffer filling: frame %llu -> texture[%d]",
m_framesDecoded.load(), decodeIndex);
result = vavcore_decode_to_surface(
player,
VAVCORE_SURFACE_D3D12_RESOURCE,
rgbaTexture,
decodeTexture,
&vavFrame
);
// After successful decode, copy to staging texture for safe rendering
// After successful decode, advance decode index only (render index stays at 0)
if (result == VAVCORE_SUCCESS) {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (backend) {
HRESULT hr = backend->CopyToStagingTexture(rgbaTexture);
if (FAILED(hr)) {
LOGF_ERROR("[FrameProcessor] Failed to copy to staging texture: 0x%08X", hr);
} else {
// Wait for GPU copy to complete before proceeding
hr = backend->WaitForCopyCompletion();
if (FAILED(hr)) {
LOGF_ERROR("[FrameProcessor] Failed to wait for copy completion: 0x%08X", hr);
} else {
LOGF_INFO("[FrameProcessor] GPU copy completed, staging texture ready");
}
}
}
backend->AdvanceDecodeOnly();
LOGF_INFO("[FrameProcessor] Triple buffer filled: texture[%d] ready", decodeIndex);
}
}
// Phase 3: Normal operation (frame 19+)
// Render from current texture, decode into next texture
else {
auto backend = m_renderer->GetRGBASurfaceBackend();
if (!backend) {
LOGF_ERROR("[FrameProcessor] Failed to get RGBASurfaceBackend");
m_frameProcessing.store(false);
if (onComplete) onComplete(false);
return false;
}
ID3D12Resource* decodeTexture = backend->GetNextDecodeTexture();
int decodeIndex = backend->GetDecodeTextureIndex();
int renderIndex = backend->GetRenderTextureIndex();
LOGF_DEBUG("[FrameProcessor] Normal operation: render[%d], decode[%d]",
renderIndex, decodeIndex);
result = vavcore_decode_to_surface(
player,
VAVCORE_SURFACE_D3D12_RESOURCE,
decodeTexture,
&vavFrame
);
// After successful decode, advance frame indices
if (result == VAVCORE_SUCCESS) {
backend->AdvanceFrame();
LOGF_DEBUG("[FrameProcessor] Frame advanced: render[%d]->render[%d]",
renderIndex, backend->GetRenderTextureIndex());
}
}
}
@@ -157,6 +182,10 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
// No frame is ready yet - VavCore will return it in a future call
LOGF_DEBUG("[FrameProcessor] PACKET ACCEPTED - Frame buffered in VavCore CUDA DPB (16-frame buffering)");
// CRITICAL: Increment m_framesDecoded for buffered packets
// This counter determines when we switch from NULL surface (buffering) to valid surface (rendering)
m_framesDecoded++;
// No action needed - just wait for next timing tick
// VavCore will return the buffered frame when ready
m_frameProcessing.store(false);

View File

@@ -268,13 +268,6 @@ ID3D12Resource* D3D12VideoRenderer::GetRGBATextureForCUDAInterop() const {
return nullptr;
}
ID3D12Resource* D3D12VideoRenderer::GetNextRGBATextureForCUDAInterop() {
if (m_rgbaSurfaceBackend) {
return m_rgbaSurfaceBackend->GetNextVideoTexture();
}
return nullptr;
}
uint8_t* D3D12VideoRenderer::GetYMappedBuffer(uint32_t bufferIndex) const {
if (m_yuv420pUploadBackend) {
return m_yuv420pUploadBackend->GetYMappedBuffer(bufferIndex);
@@ -496,6 +489,13 @@ IVideoBackend* D3D12VideoRenderer::SelectBackend(const VavCoreVideoFrame& frame)
}
HRESULT D3D12VideoRenderer::EnsureVideoTexture(const VavCoreVideoFrame& frame) {
// Skip if frame has invalid dimensions (can happen during CUDA DPB buffering)
if (frame.width == 0 || frame.height == 0) {
LOGF_DEBUG("[D3D12VideoRenderer] Skipping texture creation for invalid frame dimensions: %dx%d",
frame.width, frame.height);
return S_OK; // Not an error, just skip texture creation
}
// Check if we need to create/recreate video texture
if (m_videoWidth != (uint32_t)frame.width || m_videoHeight != (uint32_t)frame.height) {
IVideoBackend* backend = SelectBackend(frame);

View File

@@ -59,10 +59,9 @@ public:
// Backend-specific texture access for CUDA interop
ID3D12Resource* GetRGBATextureForCUDAInterop() const;
ID3D12Resource* GetNextRGBATextureForCUDAInterop(); // Rotates to next buffer for triple buffering
ID3D12Resource* GetNV12TextureForCUDAInterop() const { return nullptr; } // Future: NV12DirectBackend
// Get RGBASurfaceBackend for staging texture operations
// Get RGBASurfaceBackend for triple buffer management
RGBASurfaceBackend* GetRGBASurfaceBackend() const { return m_rgbaSurfaceBackend.get(); }
// Legacy YUV420P upload buffer access (for backward compatibility)

View File

@@ -54,21 +54,8 @@ void RGBASurfaceBackend::Shutdown() {
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
m_currentTextureIndex = 0;
// Release staging texture and copy command objects
m_copyCommandList.Reset();
m_copyCommandAllocator.Reset();
m_stagingTexture.Reset();
// Close fence event handle
if (m_copyFenceEvent != nullptr) {
CloseHandle(m_copyFenceEvent);
m_copyFenceEvent = nullptr;
}
// Release fence
m_copyFence.Reset();
m_renderTextureIndex = 0;
m_decodeTextureIndex = 0;
// Clear references (not owned)
m_device = nullptr;
@@ -78,15 +65,13 @@ void RGBASurfaceBackend::Shutdown() {
}
HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height) {
LOGF_INFO("[RGBASurfaceBackend] CreateVideoTexture called: %ux%u", width, height);
m_videoWidth = width;
m_videoHeight = height;
HRESULT hr = S_OK;
// Create RGBA texture descriptor for CUDA Surface Object write
// Format: DXGI_FORMAT_R8G8B8A8_UNORM (4 bytes per pixel)
// Flags: D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS (enables CUDA Surface Object creation)
// Layout: D3D12_TEXTURE_LAYOUT_UNKNOWN (tiled, CUDA Surface Objects handle this automatically)
D3D12_RESOURCE_DESC rgbaTextureDesc = {};
rgbaTextureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
rgbaTextureDesc.Width = width;
@@ -96,8 +81,8 @@ HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height)
rgbaTextureDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
rgbaTextureDesc.SampleDesc.Count = 1;
rgbaTextureDesc.SampleDesc.Quality = 0;
rgbaTextureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; // Tiled layout
rgbaTextureDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; // Enable CUDA write
rgbaTextureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
rgbaTextureDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
D3D12_HEAP_PROPERTIES defaultHeapProps = {};
defaultHeapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
@@ -108,110 +93,48 @@ HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height)
for (int i = 0; i < BUFFER_COUNT; i++) {
hr = m_device->CreateCommittedResource(
&defaultHeapProps,
D3D12_HEAP_FLAG_SHARED, // Required for CUDA interop
D3D12_HEAP_FLAG_SHARED,
&rgbaTextureDesc,
D3D12_RESOURCE_STATE_COMMON, // CUDA will transition as needed
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(&m_rgbaTextures[i])
);
if (FAILED(hr)) {
// Cleanup already created textures
LOGF_ERROR("[RGBASurfaceBackend] Failed to create RGBA texture[%d]: 0x%08X", i, hr);
for (int j = 0; j < i; j++) {
m_rgbaTextures[j].Reset();
}
return hr;
}
LOGF_INFO("[RGBASurfaceBackend] Created RGBA texture[%d]: %p", i, m_rgbaTextures[i].Get());
}
m_currentTextureIndex = 0;
// Triple buffer filling logic:
// - Frames 16-18 fill textures 0, 1, 2 (decode only, no rendering yet)
// - Frame 19+ normal operation (decode into different texture than render)
//
// Initial state for filling phase:
// - decodeIndex = 0 (will fill texture[0], then [1], then [2])
// - renderIndex = 2 (will render from texture[2] after filling completes)
//
// After filling completes (frame 18):
// - decodeIndex = 0 (wraps back after filling [2])
// - renderIndex = 2 (will render from texture[2] at frame 19)
// - Frame 19: render from [2], decode into [0] (no conflict!)
m_renderTextureIndex = 2;
m_decodeTextureIndex = 0;
// Create staging texture (same format, but no UAV flag - only for rendering)
D3D12_RESOURCE_DESC stagingTextureDesc = rgbaTextureDesc;
stagingTextureDesc.Flags = D3D12_RESOURCE_FLAG_NONE; // No CUDA access needed
LOGF_INFO("[RGBASurfaceBackend] All %d RGBA textures created successfully", BUFFER_COUNT);
hr = m_device->CreateCommittedResource(
&defaultHeapProps,
D3D12_HEAP_FLAG_NONE,
&stagingTextureDesc,
D3D12_RESOURCE_STATE_COPY_DEST, // Initial state for receiving copies
nullptr,
IID_PPV_ARGS(&m_stagingTexture)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create staging texture: 0x%08X", hr);
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
// Create fence for GPU copy synchronization
hr = m_device->CreateFence(
0,
D3D12_FENCE_FLAG_NONE,
IID_PPV_ARGS(&m_copyFence)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy fence: 0x%08X", hr);
m_stagingTexture.Reset();
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
// Create fence event for CPU wait
m_copyFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
if (m_copyFenceEvent == nullptr) {
hr = HRESULT_FROM_WIN32(GetLastError());
LOGF_ERROR("[RGBASurfaceBackend] Failed to create fence event: 0x%08X", hr);
m_copyFence.Reset();
m_stagingTexture.Reset();
for (int i = 0; i < BUFFER_COUNT; i++) {
m_rgbaTextures[i].Reset();
}
return hr;
}
LOGF_INFO("[RGBASurfaceBackend] Copy fence and event created successfully");
// Create command allocator and list for texture copy operations
hr = m_device->CreateCommandAllocator(
D3D12_COMMAND_LIST_TYPE_DIRECT,
IID_PPV_ARGS(&m_copyCommandAllocator)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy command allocator: 0x%08X", hr);
return hr;
}
hr = m_device->CreateCommandList(
0,
D3D12_COMMAND_LIST_TYPE_DIRECT,
m_copyCommandAllocator.Get(),
nullptr,
IID_PPV_ARGS(&m_copyCommandList)
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy command list: 0x%08X", hr);
return hr;
}
// Close the command list (will be reset when needed)
m_copyCommandList->Close();
LOGF_INFO("[RGBASurfaceBackend] Created staging texture for safe rendering");
// Create SRV for RGBA texture
// Create SRV for rendering
hr = CreateSrvHeap();
if (FAILED(hr)) {
return hr;
}
// Update constant buffer with new aspect ratio
// Update constant buffer
hr = UpdateConstantBuffer();
if (FAILED(hr)) {
return hr;
@@ -226,8 +149,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
ID3D12GraphicsCommandList* commandList,
D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle)
{
// RGBASurfaceBackend doesn't need RTV (uses CopyResource)
(void)rtvHandle;
(void)rtvHandle; // RGBASurfaceBackend doesn't use external RTV
if (!m_initialized) {
return E_NOT_VALID_STATE;
}
@@ -236,17 +158,18 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
return E_INVALIDARG;
}
// Use staging texture for rendering (safe from decoder overwrites)
ID3D12Resource* renderTexture = m_stagingTexture.Get();
// Use current render texture (already decoded, safe to read)
ID3D12Resource* renderTexture = m_rgbaTextures[m_renderTextureIndex].Get();
if (!renderTexture) {
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: staging texture is NULL!");
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: render texture[%d] is NULL!", m_renderTextureIndex);
return E_INVALIDARG;
}
LOGF_DEBUG("[RGBASurfaceBackend] RenderToBackBuffer: using staging texture, ptr=%p", renderTexture);
LOGF_DEBUG("[RGBASurfaceBackend] RenderToBackBuffer: using texture[%d], ptr=%p",
m_renderTextureIndex, renderTexture);
// Staging texture is already in PIXEL_SHADER_RESOURCE state (set by CopyToStagingTexture)
// No barrier needed here
// Render texture is in COMMON state (CUDA managed)
// No barrier needed for reading in pixel shader
// Transition back buffer to render target
D3D12_RESOURCE_BARRIER barrierToRT = {};
@@ -258,13 +181,12 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
barrierToRT.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
commandList->ResourceBarrier(1, &barrierToRT);
// Create RTV for back buffer (not needed anymore - use rtvHandle from parameter)
// Create RTV for back buffer
D3D12_CPU_DESCRIPTOR_HANDLE backBufferRtvHandle;
D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
rtvDesc.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
rtvDesc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
// Create temporary RTV heap for back buffer
ComPtr<ID3D12DescriptorHeap> rtvHeap;
D3D12_DESCRIPTOR_HEAP_DESC rtvHeapDesc = {};
rtvHeapDesc.NumDescriptors = 1;
@@ -287,7 +209,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
ID3D12DescriptorHeap* heaps[] = { m_srvHeap.Get() };
commandList->SetDescriptorHeaps(1, heaps);
// Use descriptor for staging texture (index 0, the only descriptor)
// Use SRV for current render texture
CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetGPUDescriptorHandleForHeapStart());
commandList->SetGraphicsRootDescriptorTable(0, srvHandle);
commandList->SetGraphicsRootConstantBufferView(1, m_constantBuffer->GetGPUVirtualAddress());
@@ -314,7 +236,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
// Draw fullscreen quad
commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
commandList->DrawInstanced(6, 1, 0, 0); // Fullscreen quad (2 triangles)
commandList->DrawInstanced(6, 1, 0, 0);
// Transition back buffer to present
D3D12_RESOURCE_BARRIER barrierToPresent = {};
@@ -326,9 +248,6 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
barrierToPresent.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
commandList->ResourceBarrier(1, &barrierToPresent);
// Staging texture remains in PIXEL_SHADER_RESOURCE state (no transition needed)
// It will be transitioned back to COPY_DEST when CopyToStagingTexture is called next
return S_OK;
}
@@ -536,9 +455,9 @@ HRESULT RGBASurfaceBackend::CreatePipelineState() {
}
HRESULT RGBASurfaceBackend::CreateSrvHeap() {
// Create descriptor heap with 1 descriptor for staging texture
// Create descriptor heap with 1 descriptor for current render texture
D3D12_DESCRIPTOR_HEAP_DESC srvHeapDesc = {};
srvHeapDesc.NumDescriptors = 1; // Only need SRV for staging texture
srvHeapDesc.NumDescriptors = 1;
srvHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
srvHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
@@ -547,22 +466,13 @@ HRESULT RGBASurfaceBackend::CreateSrvHeap() {
return hr;
}
// Create SRV for staging texture (the only texture used for rendering)
D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
srvDesc.Texture2D.MipLevels = 1;
// Create initial SRV for texture[0] (m_renderTextureIndex = 0)
hr = UpdateSRVForCurrentRenderTexture();
if (FAILED(hr)) {
return hr;
}
CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetCPUDescriptorHandleForHeapStart());
m_device->CreateShaderResourceView(
m_stagingTexture.Get(),
&srvDesc,
srvHandle
);
LOGF_INFO("[RGBASurfaceBackend] Created SRV for staging texture");
LOGF_INFO("[RGBASurfaceBackend] Created SRV heap for render texture");
return S_OK;
}
@@ -605,138 +515,80 @@ HRESULT RGBASurfaceBackend::UpdateConstantBuffer() {
return S_OK;
}
ID3D12Resource* RGBASurfaceBackend::GetNextVideoTexture() {
// Rotate to next buffer index
int prevIndex = m_currentTextureIndex;
m_currentTextureIndex = (m_currentTextureIndex + 1) % BUFFER_COUNT;
LOGF_INFO("[RGBASurfaceBackend] GetNextVideoTexture: %d -> %d, texture=%p",
prevIndex, m_currentTextureIndex, m_rgbaTextures[m_currentTextureIndex].Get());
return m_rgbaTextures[m_currentTextureIndex].Get();
// Triple buffering management functions
ID3D12Resource* RGBASurfaceBackend::GetCurrentRenderTexture() const {
return m_rgbaTextures[m_renderTextureIndex].Get();
}
HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture) {
if (!m_initialized || !m_stagingTexture || !sourceTexture) {
ID3D12Resource* RGBASurfaceBackend::GetNextDecodeTexture() const {
return m_rgbaTextures[m_decodeTextureIndex].Get();
}
void RGBASurfaceBackend::AdvanceDecodeOnly() {
int prevDecode = m_decodeTextureIndex;
// Filling phase: Only advance decode index, render index stays at 0
// This is used during frames 16-18 to fill all three textures
m_decodeTextureIndex = (m_decodeTextureIndex + 1) % BUFFER_COUNT;
LOGF_INFO("[RGBASurfaceBackend] AdvanceDecodeOnly: decode %d->%d (render stays at %d)",
prevDecode, m_decodeTextureIndex, m_renderTextureIndex);
}
void RGBASurfaceBackend::AdvanceFrame() {
int prevRender = m_renderTextureIndex;
int prevDecode = m_decodeTextureIndex;
// Triple buffering advance logic:
// After Frame N decodes into decodeTexture, we want:
// - Render from the PREVIOUS frame's texture (N-1), not the current one!
// - Decode into the oldest texture (N-2)
//
// Current state: render=R, decode=D
// After decode completes: the old render texture becomes new decode target
// The old decode texture will be rendered NEXT frame (not this frame!)
//
// Example with 3 textures:
// State: render=2, decode=0
// - Frame 19 decodes into texture[0] (now contains frame 19)
// - Advance: render=2 (still showing frame 18!), decode=1
// - Frame 20 decodes into texture[1] (now contains frame 20)
// - Advance: render=0 (now showing frame 19), decode=2
//
// This ensures we NEVER render from a texture that was just written!
m_decodeTextureIndex = m_renderTextureIndex; // Old render texture becomes next decode target
m_renderTextureIndex = prevDecode; // Old decode texture becomes new render texture
LOGF_INFO("[RGBASurfaceBackend] AdvanceFrame: render %d->%d, decode %d->%d",
prevRender, m_renderTextureIndex, prevDecode, m_decodeTextureIndex);
// Update SRV to point to new render texture
HRESULT hr = UpdateSRVForCurrentRenderTexture();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to update SRV: 0x%08X", hr);
}
}
HRESULT RGBASurfaceBackend::UpdateSRVForCurrentRenderTexture() {
if (!m_srvHeap || !m_rgbaTextures[m_renderTextureIndex]) {
return E_NOT_VALID_STATE;
}
// Reset command allocator and list
HRESULT hr = m_copyCommandAllocator->Reset();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to reset copy command allocator: 0x%08X", hr);
return hr;
}
D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
srvDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
srvDesc.Texture2D.MipLevels = 1;
hr = m_copyCommandList->Reset(m_copyCommandAllocator.Get(), nullptr);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to reset copy command list: 0x%08X", hr);
return hr;
}
CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetCPUDescriptorHandleForHeapStart());
// Transition source texture to COPY_SOURCE
D3D12_RESOURCE_BARRIER sourceBarrier = {};
sourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
sourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
sourceBarrier.Transition.pResource = sourceTexture;
sourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; // CUDA uses COMMON
sourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
sourceBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
// Transition staging texture to COPY_DEST (only if not first copy)
// First copy: staging texture already in COPY_DEST state (created with that state)
// Subsequent copies: staging texture in PIXEL_SHADER_RESOURCE state (from previous render)
if (m_firstCopy) {
// First copy: only transition source
m_copyCommandList->ResourceBarrier(1, &sourceBarrier);
m_firstCopy = false;
LOGF_DEBUG("[RGBASurfaceBackend] First copy: staging texture already in COPY_DEST state");
} else {
// Subsequent copies: transition both staging and source
D3D12_RESOURCE_BARRIER stagingToCopyDest = {};
stagingToCopyDest.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
stagingToCopyDest.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
stagingToCopyDest.Transition.pResource = m_stagingTexture.Get();
stagingToCopyDest.Transition.StateBefore = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
stagingToCopyDest.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
stagingToCopyDest.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
D3D12_RESOURCE_BARRIER barriers[] = { stagingToCopyDest, sourceBarrier };
m_copyCommandList->ResourceBarrier(2, barriers);
}
// Copy texture
m_copyCommandList->CopyResource(m_stagingTexture.Get(), sourceTexture);
// Transition source back to COMMON (for CUDA)
sourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
sourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON;
m_copyCommandList->ResourceBarrier(1, &sourceBarrier);
// Transition staging texture to PIXEL_SHADER_RESOURCE for rendering
D3D12_RESOURCE_BARRIER stagingBarrier = {};
stagingBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
stagingBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
stagingBarrier.Transition.pResource = m_stagingTexture.Get();
stagingBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
stagingBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
stagingBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
m_copyCommandList->ResourceBarrier(1, &stagingBarrier);
// Close command list
hr = m_copyCommandList->Close();
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to close copy command list: 0x%08X", hr);
return hr;
}
// Execute command list
ID3D12CommandList* commandLists[] = { m_copyCommandList.Get() };
m_commandQueue->ExecuteCommandLists(1, commandLists);
// Signal fence after copy submission
m_copyFenceValue++;
hr = m_commandQueue->Signal(m_copyFence.Get(), m_copyFenceValue);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] Failed to signal copy fence: 0x%08X", hr);
return hr;
}
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy submitted (fence value: %llu)", m_copyFenceValue);
return S_OK;
}
HRESULT RGBASurfaceBackend::WaitForCopyCompletion() {
if (!m_copyFence || m_copyFenceEvent == nullptr) {
LOGF_ERROR("[RGBASurfaceBackend] Copy fence or event not initialized");
return E_NOT_VALID_STATE;
}
// Check if copy already completed
if (m_copyFence->GetCompletedValue() >= m_copyFenceValue) {
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy already complete (fence value: %llu)", m_copyFenceValue);
return S_OK; // Already complete
}
// Wait for GPU copy to complete
HRESULT hr = m_copyFence->SetEventOnCompletion(
m_copyFenceValue,
m_copyFenceEvent
m_device->CreateShaderResourceView(
m_rgbaTextures[m_renderTextureIndex].Get(),
&srvDesc,
srvHandle
);
if (FAILED(hr)) {
LOGF_ERROR("[RGBASurfaceBackend] SetEventOnCompletion failed: 0x%08X", hr);
return hr;
}
DWORD waitResult = WaitForSingleObject(m_copyFenceEvent, 5000); // 5 second timeout
if (waitResult != WAIT_OBJECT_0) {
LOGF_ERROR("[RGBASurfaceBackend] Wait failed or timed out: %lu", waitResult);
return E_FAIL;
}
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy completed (fence value: %llu)", m_copyFenceValue);
LOGF_DEBUG("[RGBASurfaceBackend] Updated SRV for render texture[%d]", m_renderTextureIndex);
return S_OK;
}

View File

@@ -39,24 +39,25 @@ public:
HRESULT CreateVideoTexture(uint32_t width, uint32_t height) override;
ID3D12Resource* GetVideoTexture() const override {
return m_rgbaTextures[m_currentTextureIndex].Get();
return m_rgbaTextures[m_renderTextureIndex].Get();
}
// Get next available texture for decoding (rotates buffer index)
ID3D12Resource* GetNextVideoTexture();
// Triple buffering management
// Get current texture being rendered to screen
ID3D12Resource* GetCurrentRenderTexture() const;
// Get current rendering texture index
int GetCurrentTextureIndex() const { return m_currentTextureIndex; }
// Get next texture for decoding (not currently being rendered)
ID3D12Resource* GetNextDecodeTexture() const;
// Staging texture management for safe rendering
// Copy decoder texture to stable staging texture
HRESULT CopyToStagingTexture(ID3D12Resource* sourceTexture);
// Advance frame: switch render/decode indices after decoding completes (normal operation)
void AdvanceFrame();
// Wait for GPU copy to complete
HRESULT WaitForCopyCompletion();
// Advance decode only: move to next decode texture without changing render index (filling phase)
void AdvanceDecodeOnly();
// Get stable staging texture for rendering (never overwritten by decoder)
ID3D12Resource* GetStagingTexture() const { return m_stagingTexture.Get(); }
// Get current indices for debugging
int GetRenderTextureIndex() const { return m_renderTextureIndex; }
int GetDecodeTextureIndex() const { return m_decodeTextureIndex; }
HRESULT RenderToBackBuffer(
const VavCoreVideoFrame& frame,
@@ -73,27 +74,19 @@ private:
ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_commandQueue = nullptr;
// RGBA video textures (triple buffering) - Decoder writes here
// RGBA video textures (triple buffering)
// Format: DXGI_FORMAT_R8G8B8A8_UNORM
// Flags: D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS (for CUDA Surface Object)
// Layout: D3D12_TEXTURE_LAYOUT_UNKNOWN (tiled, handled by CUDA Surface Objects)
//
// Triple buffering roles:
// - m_renderTextureIndex: Currently rendering to screen (safe to read)
// - m_decodeTextureIndex: Next target for decoding (safe to write)
// - Third texture: Idle, previously decoded (ready to become render texture)
static const int BUFFER_COUNT = 3;
ComPtr<ID3D12Resource> m_rgbaTextures[BUFFER_COUNT];
int m_currentTextureIndex = 0;
// Staging texture - Stable copy for rendering (decoder never touches this)
// This texture is copied from m_rgbaTextures at 30fps
// Renderer always reads from this texture (safe from race conditions)
ComPtr<ID3D12Resource> m_stagingTexture;
// Command allocator and list for async texture copy
ComPtr<ID3D12CommandAllocator> m_copyCommandAllocator;
ComPtr<ID3D12GraphicsCommandList> m_copyCommandList;
// GPU synchronization for copy operations
ComPtr<ID3D12Fence> m_copyFence;
UINT64 m_copyFenceValue = 0;
HANDLE m_copyFenceEvent = nullptr;
int m_renderTextureIndex = 0; // Texture currently being rendered
int m_decodeTextureIndex = 0; // Texture for next decode operation
// Graphics pipeline for simple RGBA texture sampling
ComPtr<ID3D12RootSignature> m_rootSignature;
@@ -122,7 +115,6 @@ private:
uint32_t m_height = 0; // Container height
uint32_t m_videoWidth = 0;
uint32_t m_videoHeight = 0;
bool m_firstCopy = true; // Track first copy to handle initial state
// Helper methods
HRESULT CreateGraphicsResources();
@@ -131,6 +123,7 @@ private:
HRESULT CreatePipelineState();
HRESULT CreateSrvHeap();
HRESULT UpdateConstantBuffer();
HRESULT UpdateSRVForCurrentRenderTexture();
};
} // namespace Vav2Player

View File

@@ -1374,12 +1374,19 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
slot.pts = timestamp;
slot.ready_for_display.store(true);
// Enqueue picture_index for display queue (for B-frame reordering)
// Enqueue DisplayQueueEntry with PTS for B-frame reordering
{
std::lock_guard<std::mutex> lock(decoder->m_displayMutex);
decoder->m_displayQueue.push(pic_idx);
LOGF_DEBUG("[HandlePictureDisplay] Pushed picture_index=%d (pts=%lld) to display queue (size: %zu)",
pic_idx, timestamp, decoder->m_displayQueue.size());
DisplayQueueEntry entry;
entry.frame_slot_index = pic_idx;
entry.pts = timestamp;
entry.submission_id = slot.submission_id;
decoder->m_displayQueue.push(entry);
LOGF_DEBUG("[HandlePictureDisplay] Pushed DisplayQueueEntry: slot=%d, pts=%lld, submission_id=%llu (queue size: %zu)",
pic_idx, timestamp, slot.submission_id, decoder->m_displayQueue.size());
}
return 1;
@@ -1572,11 +1579,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
VideoFrame& output_frame) {
LOGF_DEBUG("[DecodeToSurface] Called with target_type=%d", static_cast<int>(target_type));
if (!m_initialized || !packet_data) {
LOGF_ERROR("[DecodeToSurface] Not initialized or null packet_data");
if (!m_initialized) {
LOGF_ERROR("[DecodeToSurface] Not initialized");
return false;
}
// Handle NULL packet_data as flush mode (end of file reached)
if (!packet_data || packet_size == 0) {
LOGF_DEBUG("[DecodeToSurface] NULL packet - flush mode (end of file)");
m_state = DecoderState::FLUSHING;
}
// Set CUDA context for current thread
{
std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
@@ -1617,13 +1630,25 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
// ===== Step 2: Submit packet to NVDEC parser =====
// This triggers HandlePictureDecode (if new frame) and HandlePictureDisplay (always)
CUVIDSOURCEDATAPACKET packet = {};
packet.payload = packet_data;
packet.payload_size = static_cast<unsigned long>(packet_size);
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = 0; // Not used - NVDEC parser overwrites this value
LOGF_INFO("[DecodeToSurface] Calling cuvidParseVideoData (submission_id=%llu)...",
my_submission_id);
// Handle flush mode (NULL packet)
if (m_state == DecoderState::FLUSHING) {
// Flush mode: send end-of-stream packet to drain CUDA DPB
packet.flags = CUVID_PKT_ENDOFSTREAM;
packet.payload = nullptr;
packet.payload_size = 0;
LOGF_INFO("[DecodeToSurface] Flush mode: sending ENDOFSTREAM packet (submission_id=%llu)",
my_submission_id);
} else {
// Normal mode: send actual packet data
packet.payload = packet_data;
packet.payload_size = static_cast<unsigned long>(packet_size);
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = 0; // Not used - NVDEC parser overwrites this value
LOGF_INFO("[DecodeToSurface] Normal mode: calling cuvidParseVideoData (submission_id=%llu)...",
my_submission_id);
}
CUresult result = cuvidParseVideoData(m_parser, &packet);
// cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return
@@ -1647,33 +1672,63 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
{
std::lock_guard<std::mutex> lock(m_displayMutex);
// During initial buffering, accept packets until display queue has frames
if (m_displayQueue.empty() && !m_initialBufferingComplete) {
LOGF_DEBUG("[DecodeToSurface] PACKET ACCEPTED - Initial buffering (queue size: 0)");
return VAVCORE_PACKET_ACCEPTED;
// Transition from READY to BUFFERING on first packet
if (m_state == DecoderState::READY && m_displayQueue.empty()) {
m_state = DecoderState::BUFFERING;
LOGF_DEBUG("[DecodeToSurface] State transition: READY → BUFFERING");
}
// Once we have frames in queue, mark buffering complete
if (!m_displayQueue.empty() && !m_initialBufferingComplete) {
m_initialBufferingComplete = true;
LOGF_INFO("[DecodeToSurface] Initial buffering complete, queue size: %zu", m_displayQueue.size());
// During initial buffering, accept packets until display queue has frames
if (m_displayQueue.empty() && m_state == DecoderState::BUFFERING) {
LOGF_DEBUG("[DecodeToSurface] PACKET ACCEPTED - Initial buffering (queue size: 0)");
// Return false to indicate no frame yet (still buffering)
// The C API wrapper will convert this to VAVCORE_PACKET_ACCEPTED
return false;
}
// Once we have frames in queue, transition to DECODING
if (!m_displayQueue.empty() && m_state == DecoderState::BUFFERING) {
m_state = DecoderState::DECODING;
LOGF_INFO("[DecodeToSurface] State transition: BUFFERING → DECODING (queue size: %zu)", m_displayQueue.size());
}
}
// ===== Step 4: Pop from display queue to get picture_index =====
// ===== Step 4: Pop from display queue to get picture_index (PTS-ordered) =====
DisplayQueueEntry entry;
int pic_idx = -1;
{
std::lock_guard<std::mutex> lock(m_displayMutex);
if (m_displayQueue.empty()) {
LOGF_ERROR("[DecodeToSurface] Display queue EMPTY after buffering complete (SHOULD NOT HAPPEN!)");
return false;
// Check if we're in flush mode
if (m_state == DecoderState::FLUSHING) {
// Flush mode: no more frames in CUDA DPB
// Return false to indicate no frame, caller will check end-of-stream
LOGF_INFO("[DecodeToSurface] Flush complete: all frames drained from CUDA DPB");
// Release pending submission before returning
{
std::lock_guard<std::mutex> lock2(m_submissionMutex);
m_pendingSubmissions[pending_idx].in_use.store(false);
}
// Return false - the C API wrapper will convert this to VAVCORE_END_OF_STREAM
// when combined with file reader's IsEndOfFile() check
return false;
} else {
// Normal mode: queue empty unexpectedly
LOGF_ERROR("[DecodeToSurface] Display queue EMPTY after buffering complete (SHOULD NOT HAPPEN!)");
return false;
}
}
pic_idx = m_displayQueue.front();
// Pop from priority queue (PTS-ordered)
entry = m_displayQueue.top();
m_displayQueue.pop();
LOGF_INFO("[DecodeToSurface] Popped picture_index=%d from display queue (queue size now: %zu)",
pic_idx, m_displayQueue.size());
pic_idx = entry.frame_slot_index;
LOGF_INFO("[DecodeToSurface] Popped DisplayQueueEntry: slot=%d, pts=%lld, submission_id=%llu (queue size now: %zu)",
pic_idx, entry.pts, entry.submission_id, m_displayQueue.size());
}
if (pic_idx < 0 || pic_idx >= RING_BUFFER_SIZE) {
@@ -1697,21 +1752,37 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
LOGF_DEBUG("[DecodeToSurface] Frame slot %d ready for display", pic_idx);
// ===== Step 6: Copy from CUDA DPB to target surface =====
if (!CopyFromCUDADPB(pic_idx, target_type, target_surface, output_frame)) {
// ===== Step 6: Update target_surface for this frame =====
// CRITICAL: SwapChain provides different target_surface each frame!
// Always update slot.target_surface to current one.
LOGF_DEBUG("[DecodeToSurface] Updating target_surface: %p -> %p (pic_idx=%d)",
slot.target_surface, target_surface, pic_idx);
if (target_surface == nullptr) {
LOGF_ERROR("[DecodeToSurface] ERROR: target_surface is NULL for pic_idx=%d", pic_idx);
return false;
}
// Always update to current target_surface (SwapChain back buffer changes each frame)
slot.target_surface = target_surface;
slot.surface_type = target_type;
// ===== Step 7: Copy from CUDA DPB to target surface =====
// Now use slot.target_surface which is guaranteed to be valid (either from decode or late binding)
if (!CopyFromCUDADPB(pic_idx, slot.surface_type, slot.target_surface, output_frame)) {
LOGF_ERROR("[DecodeToSurface] CopyFromCUDADPB failed for picture_index=%d", pic_idx);
return false;
}
LOGF_INFO("[DecodeToSurface] SUCCESS - Frame rendered from CUDA DPB (pic_idx=%d)", pic_idx);
// ===== Step 7: Mark slot as reusable =====
// ===== Step 8: Mark slot as reusable =====
slot.ready_for_display.store(false);
slot.in_use.store(false);
LOGF_DEBUG("[DecodeToSurface] Released frame slot %d", pic_idx);
// ===== Step 8: Release pending submission =====
// ===== Step 9: Release pending submission =====
{
std::lock_guard<std::mutex> lock(m_submissionMutex);
m_pendingSubmissions[pending_idx].in_use.store(false);

View File

@@ -102,6 +102,14 @@ protected:
void LogCUDAError(CUresult result, const std::string& operation) const;
private:
// Decoder state enum (simple inline approach)
enum class DecoderState {
READY, // Initialized and ready for first packet
BUFFERING, // Initial buffering (0-15 frames)
DECODING, // Normal frame-by-frame decoding
FLUSHING // End-of-file reached, draining DPB
};
// CUDA and NVDEC objects
CUcontext m_cuContext = nullptr;
CUdevice m_cudaDevice = 0;
@@ -224,7 +232,23 @@ private:
void PollingThreadFunc(); // Polling thread function
// Display-only packet handling (B-frame reordering)
std::queue<int> m_displayQueue; // Queue of picture_index from HandlePictureDisplay
// DisplayQueueEntry: Frame information for PTS-based reordering
struct DisplayQueueEntry {
int frame_slot_index; // FrameSlot index in m_frameSlots[]
int64_t pts; // Presentation timestamp
uint64_t submission_id; // Original submission order
};
// PTSComparator: PTS ascending order (Min-heap for earliest PTS first)
struct PTSComparator {
bool operator()(const DisplayQueueEntry& a, const DisplayQueueEntry& b) const {
return a.pts > b.pts; // Min-heap: smallest PTS has highest priority
}
};
std::priority_queue<DisplayQueueEntry,
std::vector<DisplayQueueEntry>,
PTSComparator> m_displayQueue; // PTS-based priority queue
std::mutex m_displayMutex;
// Helper methods
@@ -245,8 +269,8 @@ private:
bool CopyFromCUDADPB(int pic_idx, VavCoreSurfaceType target_type,
void* target_surface, VideoFrame& output_frame);
// Initial buffering state
std::atomic<bool> m_initialBufferingComplete{false};
// Decoder state (replaces m_initialBufferingComplete and m_endOfFileReached)
DecoderState m_state = DecoderState::READY;
// NV12ToRGBAConverter reinitialization flag (set by HandleVideoSequence)
std::atomic<bool> m_converterNeedsReinit{false};
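
As a sanity check on the comparator above, a self-contained sketch (structs duplicated here purely for illustration) showing that entries pushed out of order pop in ascending PTS order:

```cpp
#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

struct DisplayQueueEntry { int frame_slot_index; int64_t pts; uint64_t submission_id; };
struct PTSComparator {
    bool operator()(const DisplayQueueEntry& a, const DisplayQueueEntry& b) const {
        return a.pts > b.pts;  // min-heap: smallest PTS has highest priority
    }
};

int main() {
    std::priority_queue<DisplayQueueEntry, std::vector<DisplayQueueEntry>,
                        PTSComparator> q;
    for (int64_t pts : {3000LL, 1000LL, 2000LL}) {
        q.push({/*frame_slot_index=*/0, pts, /*submission_id=*/0});
    }
    while (!q.empty()) {
        std::printf("pts=%lld\n", static_cast<long long>(q.top().pts));  // 1000, 2000, 3000
        q.pop();
    }
    return 0;
}
```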