diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp
index d63bf40..6b1fe64 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp
@@ -319,6 +319,11 @@ namespace winrt::Vav2Player::implementation
             LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2",
                 L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight));
 
+            // Prepare video texture before decoding (via FrameProcessor)
+            if (m_frameProcessor) {
+                m_frameProcessor->PrepareVideoTexture(videoWidth, videoHeight);
+            }
+
             // Update AspectFit
             UpdateVideoImageAspectFit(videoWidth, videoHeight);
 
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
index f4132c7..f3628bc 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
@@ -28,6 +28,13 @@ void FrameProcessor::SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::Dispa
     m_dispatcherQueue = queue;
 }
 
+void FrameProcessor::PrepareVideoTexture(uint32_t width, uint32_t height)
+{
+    if (m_renderer) {  // no-op when no renderer is attached
+        m_renderer->PrepareVideoTexture(width, height);  // NOTE(review): any result returned by the renderer is discarded here
+    }
+}
+
 bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
                                   std::function<void(bool success)> onComplete)
 {
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
index ec6a0e1..6e7be43 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
@@ -26,6 +26,9 @@ public:
     // Set dispatcher queue for UI thread callbacks
     void SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::DispatcherQueue const& queue);
 
+    // Prepare video texture before first decode
+    void PrepareVideoTexture(uint32_t width, uint32_t height);
+
     // Process single frame (called from PlaybackController timing thread)
     // Returns: true if frame processing started, false if skipped (previous frame still rendering)
     // onComplete: Callback invoked on UI thread after render completes (success flag)
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp
index 041c1f1..3ccf720 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp
@@ -62,7 +62,10 @@ HRESULT D3D12VideoRenderer::InitializeWithSwapChain(
 }
 
 void D3D12VideoRenderer::Shutdown() {
-    WaitForGPU();
+    // Only wait for GPU if we have a valid command queue
+    if (m_commandQueue && m_fence && m_fenceEvent) {
+        WaitForGPU();
+    }
 
     // Shutdown backends
     if (m_rgbaSurfaceBackend) {
@@ -224,6 +227,24 @@ void D3D12VideoRenderer::SetSwapChainPanel(winrt::Microsoft::UI::Xaml::Controls:
     m_swapChainPanel = panel;
 }
 
+HRESULT D3D12VideoRenderer::PrepareVideoTexture(uint32_t width, uint32_t height) {
+    if (!m_initialized) {
+        return E_NOT_VALID_STATE;  // refuse to create resources before the renderer is initialized
+    }
+
+    // Create the RGBA video texture up front; a failed HRESULT is returned as-is and the cached size is left untouched
+    if (m_rgbaSurfaceBackend) {
+        HRESULT hr = m_rgbaSurfaceBackend->CreateVideoTexture(width, height);
+        if (FAILED(hr)) {
+            return hr;
+        }
+        m_videoWidth = width;  // dimensions are recorded only after texture creation succeeded
+        m_videoHeight = height;
+    }
+
+    return S_OK;  // NOTE(review): also returned when no RGBA backend exists, i.e. nothing was created
+}
+
 ID3D12Resource* D3D12VideoRenderer::GetRGBATextureForCUDAInterop() const {
     if (m_rgbaSurfaceBackend) {
         return m_rgbaSurfaceBackend->GetVideoTexture();
@@ -383,13 +404,24 @@ HRESULT D3D12VideoRenderer::CreateCommandObjects() {
         }
     }
 
-    return m_device->CreateCommandList(
+    HRESULT hr = m_device->CreateCommandList(
         0,
         D3D12_COMMAND_LIST_TYPE_DIRECT,
         m_commandAllocators[0].Get(),
         nullptr,
         IID_PPV_ARGS(&m_commandList)
     );
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    // Close command list after creation (it starts in recording state)
+    hr = m_commandList->Close();
+    if (FAILED(hr)) {
+        return hr;
+    }
+
+    return S_OK;
 }
 
 HRESULT D3D12VideoRenderer::CreateSynchronizationObjects() {
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h
index bcb36b2..85c2371 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h
@@ -55,6 +55,9 @@ public:
     ID3D12Device* GetD3D12Device() const { return m_device.Get(); }
     ID3D12CommandQueue* GetCommandQueue() const { return m_commandQueue.Get(); }
 
+    // Prepare video texture before decoding
+    HRESULT PrepareVideoTexture(uint32_t width, uint32_t height);
+
     // Backend-specific texture access for CUDA interop
     ID3D12Resource* GetRGBATextureForCUDAInterop() const;
     ID3D12Resource* GetNV12TextureForCUDAInterop() const { return nullptr; }  // Future: NV12DirectBackend
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
index a4bcf88..81e63f7 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
@@ -904,34 +904,41 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
 
     DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
 
-    // Find pending submission context using most recent submission_id
-    // cuvidParseVideoData is SYNCHRONOUS - the callback is for the packet we just submitted
-    // Therefore, m_submissionCounter - 1 is the submission_id for THIS packet
+    // Find pending submission context for this decode slot
+    // CRITICAL: Use the active entry with the HIGHEST submission_id (the most recently submitted packet)
+    // Entries are not released in this callback, so older entries may still be marked in_use
     uint64_t submission_id = 0;
     size_t pending_idx = 0;
     bool found = false;
     {
         std::lock_guard<std::mutex> lock(decoder->m_submissionMutex);
 
-        // Get the most recent submission (the one that triggered this callback)
-        uint64_t current_submission_id = decoder->m_submissionCounter.load() - 1;
-        pending_idx = current_submission_id % RING_BUFFER_SIZE;
+        // Scan all pending entries and keep the active one with the largest submission_id
+        uint64_t current_counter = decoder->m_submissionCounter.load();
+        uint64_t max_submission_id = 0;
+        bool found_any = false;
 
-        auto& pending = decoder->m_pendingSubmissions[pending_idx];
+        for (size_t i = 0; i < RING_BUFFER_SIZE; i++) {
+            auto& pending = decoder->m_pendingSubmissions[i];
 
-        // Verify this pending submission is in use and matches the slot
-        if (pending.in_use.load()) {
-            // Copy pending submission context to decode slot
-            slot.target_surface = pending.target_surface;
-            slot.surface_type = pending.surface_type;
-            slot.submission_id = pending.submission_id;
-            submission_id = pending.submission_id;
+            if (pending.in_use.load()) {
+                // Found an active pending submission
+                if (!found_any || pending.submission_id > max_submission_id) {
+                    // This is the newest one so far
+                    slot.target_surface = pending.target_surface;
+                    slot.surface_type = pending.surface_type;
+                    slot.submission_id = pending.submission_id;
+                    submission_id = pending.submission_id;
+                    pending_idx = i;
+                    max_submission_id = pending.submission_id;
+                    found_any = true;
+                    found = true;
+                }
+            }
+        }
 
-            // Release pending slot for reuse
-            pending.in_use.store(false);
-            found = true;
-        } else {
-            LOGF_ERROR("[HandlePictureDecode] Pending submission slot %zu not in use!", pending_idx);
+        if (!found) {
+            LOGF_ERROR("[HandlePictureDecode] No active pending submission found for slot %d", slot_idx);
         }
     }
 
@@ -1181,12 +1188,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
         LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu",
                    my_submission_id, pending_idx);
 
-        // 2. Wait if pending slot is still in use (overflow protection)
-        while (m_pendingSubmissions[pending_idx].in_use.load()) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(1));
-        }
-
-        // 3. Store submission context in ring buffer slot
+        // 2. Store submission context in ring buffer slot (overwrite old data)
+        // No need to wait - the ring buffer cycles after RING_BUFFER_SIZE (16) submissions
+        // Old pending submissions will be overwritten, which is assumed safe because:
+        // - Decode slots already have their copy of pending data
+        // - RING_BUFFER_SIZE (16) slots is expected to be enough buffer for B-frame reordering
         {
             std::lock_guard<std::mutex> lock(m_submissionMutex);
             auto& pending = m_pendingSubmissions[pending_idx];
@@ -1194,7 +1200,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             pending.target_surface = target_surface;
             pending.surface_type = target_type;
             pending.submission_id = my_submission_id;
-            pending.in_use.store(true);
+            pending.in_use.store(true);  // Mark as active for HandlePictureDecode search
         }
 
         LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
@@ -1212,7 +1218,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                   my_submission_id);
 
         CUresult result = cuvidParseVideoData(m_parser, &packet);
-        // cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return
+        // cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return
 
         if (result != CUDA_SUCCESS) {
             LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result);
@@ -1227,7 +1233,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             return false;
         }
 
-        LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
+        LOGF_DEBUG("[DecodeToSurface] Packet submitted, synchronous callbacks completed");
+
+        // IMPORTANT: Do NOT release pending submission here!
+        // Even though cuvidParseVideoData is documented as synchronous, NVDEC's B-frame
+        // reordering means callbacks from THIS packet may execute during FUTURE packets.
+        // Pending submissions will naturally be overwritten when ring buffer wraps (16 slots).
+        LOGF_DEBUG("[DecodeToSurface] Keeping pending_idx=%zu active (will be reused after %d submissions)",
+                   pending_idx, RING_BUFFER_SIZE);
 
         // ===== Component 4: Wait and Retrieve =====
         // 5. Find which slot NVDEC used (check all slots for our submission_id)
@@ -1240,8 +1253,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
         }
 
         if (my_slot_idx == -1) {
-            LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id);
-            return false;
+            // Display-only packet: HandlePictureDisplay was called without HandlePictureDecode
+            // This happens when a packet only triggers display of a previously decoded frame
+            // No new frame was decoded, so we return false to indicate no frame is available
+            LOGF_DEBUG("[DecodeToSurface] Display-only packet (no decode) for submission_id=%llu - returning false", my_submission_id);
+
+            m_returnCounter.fetch_add(1);  // Advance counter to unblock FIFO queue
+            return false;  // No frame decoded - caller should use previous frame
         }
 
         DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
@@ -1254,14 +1272,27 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
 
         LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id);
 
-        // 7. Wait for decode to complete
+        // 7. Wait for decode to complete with adaptive timeout based on resolution
         {
             std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
 
-            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
+            // Adaptive timeout: base 500ms for 720p, scale by pixel count
+            // 720p (1280x720 = 921,600 pixels) -> 500ms
+            // 1080p (1920x1080 = 2,073,600 pixels) -> 1,125ms
+            // 4K (3840x2160 = 8,294,400 pixels) -> 4,500ms
+            const uint64_t base_pixels = 1280 * 720;  // 720p reference
+            const uint64_t base_timeout_ms = 500;
+            const uint64_t current_pixels = static_cast<uint64_t>(m_width) * m_height;
+            const uint64_t timeout_ms = std::max<uint64_t>(base_timeout_ms,
+                (current_pixels * base_timeout_ms) / base_pixels);
+
+            LOGF_DEBUG("[DecodeToSurface] Adaptive timeout: %llums for %dx%d (%llu pixels)",
+                      timeout_ms, m_width, m_height, current_pixels);
+
+            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                 [&my_slot]() { return my_slot.is_ready.load(); })) {
                 // Timeout - decode took too long
-                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx);
+                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d after %llums", my_slot_idx, timeout_ms);
                 my_slot.in_use.store(false);
                 m_returnCounter.fetch_add(1);  // Skip to avoid deadlock
                 return false;
@@ -1443,6 +1474,8 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
 
         LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx);
 
+        // Note: the pending submission is intentionally NOT released; its entry is overwritten when the ring buffer wraps
+
         // 10. Advance return counter (FIFO order)
         m_returnCounter.fetch_add(1);