diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp index d63bf40..6b1fe64 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml.cpp @@ -319,6 +319,11 @@ namespace winrt::Vav2Player::implementation LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight)); + // Prepare video texture before decoding (via FrameProcessor) + if (m_frameProcessor) { + m_frameProcessor->PrepareVideoTexture(videoWidth, videoHeight); + } + // Update AspectFit UpdateVideoImageAspectFit(videoWidth, videoHeight); diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp index f4132c7..f3628bc 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp @@ -28,6 +28,13 @@ void FrameProcessor::SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::Dispa m_dispatcherQueue = queue; } +void FrameProcessor::PrepareVideoTexture(uint32_t width, uint32_t height) +{ + if (m_renderer) { + m_renderer->PrepareVideoTexture(width, height); + } +} + bool FrameProcessor::ProcessFrame(VavCorePlayer* player, std::function onComplete) { diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h index ec6a0e1..6e7be43 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h +++ 
b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h @@ -26,6 +26,9 @@ public: // Set dispatcher queue for UI thread callbacks void SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::DispatcherQueue const& queue); + // Prepare video texture before first decode + void PrepareVideoTexture(uint32_t width, uint32_t height); + // Process single frame (called from PlaybackController timing thread) // Returns: true if frame processing started, false if skipped (previous frame still rendering) // onComplete: Callback invoked on UI thread after render completes (success flag) diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp index 041c1f1..3ccf720 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp @@ -62,7 +62,10 @@ HRESULT D3D12VideoRenderer::InitializeWithSwapChain( } void D3D12VideoRenderer::Shutdown() { - WaitForGPU(); + // Only wait for GPU if we have a valid command queue + if (m_commandQueue && m_fence && m_fenceEvent) { + WaitForGPU(); + } // Shutdown backends if (m_rgbaSurfaceBackend) { @@ -224,6 +227,24 @@ void D3D12VideoRenderer::SetSwapChainPanel(winrt::Microsoft::UI::Xaml::Controls: m_swapChainPanel = panel; } +HRESULT D3D12VideoRenderer::PrepareVideoTexture(uint32_t width, uint32_t height) { + if (!m_initialized) { + return E_NOT_VALID_STATE; + } + + // Create RGBA texture in advance + if (m_rgbaSurfaceBackend) { + HRESULT hr = m_rgbaSurfaceBackend->CreateVideoTexture(width, height); + if (FAILED(hr)) { + return hr; + } + m_videoWidth = width; + m_videoHeight = height; + } + + return S_OK; +} + ID3D12Resource* D3D12VideoRenderer::GetRGBATextureForCUDAInterop() const { if (m_rgbaSurfaceBackend) { return 
m_rgbaSurfaceBackend->GetVideoTexture(); @@ -383,13 +404,24 @@ HRESULT D3D12VideoRenderer::CreateCommandObjects() { } } - return m_device->CreateCommandList( + HRESULT hr = m_device->CreateCommandList( 0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_commandAllocators[0].Get(), nullptr, IID_PPV_ARGS(&m_commandList) ); + if (FAILED(hr)) { + return hr; + } + + // Close command list after creation (it starts in recording state) + hr = m_commandList->Close(); + if (FAILED(hr)) { + return hr; + } + + return S_OK; } HRESULT D3D12VideoRenderer::CreateSynchronizationObjects() { diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h index bcb36b2..85c2371 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.h @@ -55,6 +55,9 @@ public: ID3D12Device* GetD3D12Device() const { return m_device.Get(); } ID3D12CommandQueue* GetCommandQueue() const { return m_commandQueue.Get(); } + // Prepare video texture before decoding + HRESULT PrepareVideoTexture(uint32_t width, uint32_t height); + // Backend-specific texture access for CUDA interop ID3D12Resource* GetRGBATextureForCUDAInterop() const; ID3D12Resource* GetNV12TextureForCUDAInterop() const { return nullptr; } // Future: NV12DirectBackend diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index a4bcf88..81e63f7 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -904,34 +904,41 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; - // Find pending submission context using most recent 
submission_id - // cuvidParseVideoData is SYNCHRONOUS - the callback is for the packet we just submitted - // Therefore, m_submissionCounter - 1 is the submission_id for THIS packet + // Find pending submission context for this decode slot + // CRITICAL: Search for the HIGHEST submission_id (most recent packet being processed) + // This handles the case where cuvidParseVideoData callbacks may execute asynchronously uint64_t submission_id = 0; size_t pending_idx = 0; bool found = false; { std::lock_guard lock(decoder->m_submissionMutex); - // Get the most recent submission (the one that triggered this callback) - uint64_t current_submission_id = decoder->m_submissionCounter.load() - 1; - pending_idx = current_submission_id % RING_BUFFER_SIZE; + // Search backwards from current submission counter to find most recent active pending + uint64_t current_counter = decoder->m_submissionCounter.load(); + uint64_t max_submission_id = 0; + bool found_any = false; - auto& pending = decoder->m_pendingSubmissions[pending_idx]; + for (size_t i = 0; i < RING_BUFFER_SIZE; i++) { + auto& pending = decoder->m_pendingSubmissions[i]; - // Verify this pending submission is in use and matches the slot - if (pending.in_use.load()) { - // Copy pending submission context to decode slot - slot.target_surface = pending.target_surface; - slot.surface_type = pending.surface_type; - slot.submission_id = pending.submission_id; - submission_id = pending.submission_id; + if (pending.in_use.load()) { + // Found an active pending submission + if (!found_any || pending.submission_id > max_submission_id) { + // This is the newest one so far + slot.target_surface = pending.target_surface; + slot.surface_type = pending.surface_type; + slot.submission_id = pending.submission_id; + submission_id = pending.submission_id; + pending_idx = i; + max_submission_id = pending.submission_id; + found_any = true; + found = true; + } + } + } - // Release pending slot for reuse - pending.in_use.store(false); - found = 
true; - } else { - LOGF_ERROR("[HandlePictureDecode] Pending submission slot %zu not in use!", pending_idx); + if (!found) { + LOGF_ERROR("[HandlePictureDecode] No active pending submission found for slot %d", slot_idx); } } @@ -1181,12 +1188,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu", my_submission_id, pending_idx); - // 2. Wait if pending slot is still in use (overflow protection) - while (m_pendingSubmissions[pending_idx].in_use.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - - // 3. Store submission context in ring buffer slot + // 2. Store submission context in ring buffer slot (overwrite old data) + // No need to wait - ring buffer naturally cycles after 16 submissions + // Old pending submissions will be overwritten, which is safe because: + // - Decode slots already have their copy of pending data + // - 16 slots is enough buffer for B-frame reordering { std::lock_guard lock(m_submissionMutex); auto& pending = m_pendingSubmissions[pending_idx]; @@ -1194,7 +1200,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ pending.target_surface = target_surface; pending.surface_type = target_type; pending.submission_id = my_submission_id; - pending.in_use.store(true); + pending.in_use.store(true); // Mark as active for HandlePictureDecode search } LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu", @@ -1212,7 +1218,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ my_submission_id); CUresult result = cuvidParseVideoData(m_parser, &packet); - // cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return + // cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return if (result != CUDA_SUCCESS) { LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result); @@ -1227,7 +1233,14 
@@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ return false; } - LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed"); + LOGF_DEBUG("[DecodeToSurface] Packet submitted, synchronous callbacks completed"); + + // IMPORTANT: Do NOT release pending submission here! + // Even though cuvidParseVideoData is documented as synchronous, NVDEC's B-frame + // reordering means callbacks from THIS packet may execute during FUTURE packets. + // Pending submissions will naturally be overwritten when ring buffer wraps (16 slots). + LOGF_DEBUG("[DecodeToSurface] Keeping pending_idx=%zu active (will be reused after %d submissions)", + pending_idx, RING_BUFFER_SIZE); // ===== Component 4: Wait and Retrieve ===== // 5. Find which slot NVDEC used (check all slots for our submission_id) @@ -1240,8 +1253,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ } if (my_slot_idx == -1) { - LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id); - return false; + // Display-only packet: HandlePictureDisplay was called without HandlePictureDecode + // This happens when a packet only triggers display of a previously decoded frame + // No new frame was decoded, so we return false to indicate no frame is available + LOGF_DEBUG("[DecodeToSurface] Display-only packet (no decode) for submission_id=%llu - returning false", my_submission_id); + + m_returnCounter.fetch_add(1); // Advance counter to unblock FIFO queue + return false; // No frame decoded - caller should use previous frame } DecodeSlot& my_slot = m_ringBuffer[my_slot_idx]; @@ -1254,14 +1272,27 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id); - // 7. Wait for decode to complete + // 7. 
Wait for decode to complete with adaptive timeout based on resolution { std::unique_lock lock(my_slot.slot_mutex); - if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500), + // Adaptive timeout: base 500ms for 720p, scale by pixel count + // 720p (1280x720 = 921,600 pixels) -> 500ms + // 1080p (1920x1080 = 2,073,600 pixels) -> 1,125ms + // 4K (3840x2160 = 8,294,400 pixels) -> 4,500ms + const uint64_t base_pixels = 1280 * 720; // 720p reference + const uint64_t base_timeout_ms = 500; + const uint64_t current_pixels = static_cast(m_width) * m_height; + const uint64_t timeout_ms = std::max(base_timeout_ms, + (current_pixels * base_timeout_ms) / base_pixels); + + LOGF_DEBUG("[DecodeToSurface] Adaptive timeout: %llums for %dx%d (%llu pixels)", + timeout_ms, m_width, m_height, current_pixels); + + if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(timeout_ms), [&my_slot]() { return my_slot.is_ready.load(); })) { // Timeout - decode took too long - LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx); + LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d after %llums", my_slot_idx, timeout_ms); my_slot.in_use.store(false); m_returnCounter.fetch_add(1); // Skip to avoid deadlock return false; @@ -1443,6 +1474,8 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx); + // Note: the pending submission is intentionally NOT released here; its entry stays marked in_use until the pending ring buffer wraps and overwrites it + // 10. Advance return counter (FIFO order) m_returnCounter.fetch_add(1);