From 59946fa10b831f6fba47daf82b3137ddc82d9a50 Mon Sep 17 00:00:00 2001 From: ened Date: Fri, 3 Oct 2025 01:17:26 +0900 Subject: [PATCH] Refactoring VideoPlayerControl to VideoPlayerControl2 --- .claude/settings.local.json | 4 +- vav2/CLAUDE.md | 68 ++- .../vav2player/Vav2Player/MainVideoPage.xaml | 5 +- .../Vav2Player/VideoPlayerControl.xaml.cpp | 9 +- .../Vav2Player/VideoPlayerControl2.xaml | 5 +- .../Vav2Player/VideoPlayerControl2.xaml.cpp | 192 +++++++-- .../src/Playback/FrameProcessor.cpp | 137 ++----- .../Vav2Player/src/Playback/FrameProcessor.h | 4 - .../src/Playback/PlaybackController.cpp | 27 +- .../src/Playback/PlaybackController.h | 7 + .../src/Rendering/SimpleGPURenderer.cpp | 91 +++- .../src/Rendering/SimpleGPURenderer.h | 12 +- .../windows/vavcore/include/VavCore/VavCore.h | 4 + .../windows/vavcore/src/Common/VideoTypes.h | 3 + .../vavcore/src/Decoder/IVideoDecoder.h | 5 + .../vavcore/src/Decoder/NVDECAV1Decoder.cpp | 388 +++++++++++++++--- .../vavcore/src/Decoder/NVDECAV1Decoder.h | 10 + .../platforms/windows/vavcore/src/VavCore.cpp | 7 + 18 files changed, 752 insertions(+), 226 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index fa833ae..0a13278 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -89,7 +89,9 @@ "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"Vav2Player.sln\" //p:Configuration=Debug //p:Platform=x64 //v:minimal)", "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"VavCore.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //v:minimal)", "Bash(tasklist)", - "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"D:/Project/video-av1/vav2/platforms/windows/applications/vav2player/Vav2Player/Vav2Player.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //t:Clean)" + "Bash(\"/c/Program Files/Microsoft Visual 
Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"D:/Project/video-av1/vav2/platforms/windows/applications/vav2player/Vav2Player/Vav2Player.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //t:Clean)", + "Bash(./Vav2Player.exe)", + "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" Vav2Player.vcxproj //p:Configuration=Debug //p:Platform=x64 //v:minimal)" ], "deny": [], "ask": [] diff --git a/vav2/CLAUDE.md b/vav2/CLAUDE.md index 2986e4b..92a7633 100644 --- a/vav2/CLAUDE.md +++ b/vav2/CLAUDE.md @@ -53,7 +53,58 @@ size_t required_size = frame.width * frame.height * 4; --- -## 🎯 **ν˜„μž¬ ν”„λ‘œμ νŠΈ μƒνƒœ** (2025-09-30) +## 🎯 **ν˜„μž¬ ν”„λ‘œμ νŠΈ μƒνƒœ** (2025-10-01) + +### **βœ… Windows VideoPlayerControl2 λ¦¬νŒ©ν† λ§ μ™„λ£Œ** (2025-10-01) +- **VideoPlayerControl2 μ½”λ“œ κ΅¬ν˜„**: PlaybackController + FrameProcessor + VideoPlayerControl2 λͺ¨λ“ˆν™” μ•„ν‚€ν…μ²˜ μž‘μ„± μ™„λ£Œ βœ… +- **μ½”λ“œ ꡬ쑰 κ°œμ„ **: 1,890 lines β†’ 950 lines (50% κ°μ†Œ) βœ… +- **μ±…μž„ 뢄리**: 7개 혼재된 μ±…μž„ β†’ 각 클래슀 1개 λͺ…ν™•ν•œ λͺ©μ  βœ… +- **λΉŒλ“œ 성곡**: WinRT λŸ°νƒ€μž„ 클래슀 생성 및 컴파일 μ™„λ£Œ βœ… +- **MainVideoPage ꡐ체**: VideoPlayerControl β†’ VideoPlayerControl2 μ „ν™˜ μ™„λ£Œ βœ… + +**πŸ”§ ν•΄κ²°λœ ν¬λž˜μ‹œ 문제**: +1. **VavCore-debug.dll AppX 경둜 볡사 μ‹€νŒ¨** (exit code 3 원인) + - 문제: μ½˜μ†” MSBuildλŠ” post-build eventκ°€ AppX λ””λ ‰ν† λ¦¬λ‘œ DLL 볡사 μ•ˆ 함 + - ν•΄κ²°: **Visual Studioμ—μ„œ λΉŒλ“œ** μ‹œ post-build event 정상 μ‹€ν–‰ + - AppX 디렉토리: `x64/Debug/Vav2Player/` (WinUI3 νŒ¨ν‚€μ§€ μ•± μ‹€ν–‰ μœ„μΉ˜) + +2. **PlaybackController VavCore 쀑볡 μ΄ˆκΈ°ν™” 제거** + - 문제: PlaybackController μƒμ„±μžμ—μ„œ `vavcore_initialize()` 쀑볡 호좜 + - ν•΄κ²°: App.xaml.cpp μ „μ—­ μ΄ˆκΈ°ν™”λ§Œ μ‚¬μš©, PlaybackControllerμ—μ„œ 제거 + - μ½”λ“œ μˆ˜μ •: `src/Playback/PlaybackController.cpp` μƒμ„±μž/μ†Œλ©Έμž μˆ˜μ • + +3. 
**μ˜ˆμ™Έ 처리 κ°•ν™”** + - VideoPlayerControl2 μƒμ„±μžμ— μƒμ„Έν•œ try-catch 및 디버그 λ‘œκΉ… μΆ”κ°€ + - PlaybackController/FrameProcessor 생성 κ³Όμ • 좔적 κ°€λŠ₯ + +**⚠️ λΉŒλ“œ μ£Όμ˜μ‚¬ν•­**: +``` +λ°˜λ“œμ‹œ Visual Studioμ—μ„œ λΉŒλ“œν•  것! +- μ½˜μ†” MSBuild: VavCore-debug.dll이 AppX에 볡사 μ•ˆ 됨 β†’ ν¬λž˜μ‹œ +- Visual Studio: post-build event 정상 μ‹€ν–‰ β†’ 정상 λ™μž‘ +``` + +**VideoPlayerControl2 μ•„ν‚€ν…μ²˜**: +``` +VideoPlayerControl2 (UI Layer - 400 lines) +β”œβ”€β”€ PlaybackController (Playback Logic - 300 lines) +β”‚ β”œβ”€β”€ VavCore 생λͺ…μ£ΌκΈ° 관리 +β”‚ β”œβ”€β”€ 타이밍 μŠ€λ ˆλ“œ (30fps) +β”‚ └── μž¬μƒ μƒνƒœ λ¨Έμ‹  +β”œβ”€β”€ FrameProcessor (Frame Handling - 250 lines) +β”‚ β”œβ”€β”€ λ°±κ·ΈλΌμš΄λ“œ λ””μ½”λ“œ (NVDEC β†’ D3D12 NV12) +β”‚ β”œβ”€β”€ ν”„λ ˆμž„ 처리 쑰절 (m_frameProcessing ν”Œλž˜κ·Έ) +β”‚ └── UI μŠ€λ ˆλ“œ λ Œλ”λ§ 콜백 +└── SimpleGPURenderer (Rendering) + β”œβ”€β”€ D3D12 NV12 ν…μŠ€μ²˜ + └── YUVβ†’RGB λ³€ν™˜ + Present +``` + +**μ£Όμš” 파일**: +- `VideoPlayerControl2.xaml.h/.cpp` (400 lines) - UI λ ˆμ΄μ–΄ +- `src/Playback/PlaybackController.h/.cpp` (300 lines) - μž¬μƒ μ œμ–΄ +- `src/Playback/FrameProcessor.h/.cpp` (250 lines) - ν”„λ ˆμž„ 처리 +- `VideoPlayerControl2.idl` - WinRT μΈν„°νŽ˜μ΄μŠ€ μ •μ˜ ### **βœ… Android Vulkan AV1 Player μ™„μ „ κ΅¬ν˜„ μ™„λ£Œ** (2025-09-30) - **Android Vulkan μ• ν”Œλ¦¬μΌ€μ΄μ…˜**: μ™„μ „ν•œ λ„€μ΄ν‹°λΈŒ Vulkan AV1 Player μ•± κ΅¬ν˜„ βœ… @@ -71,9 +122,6 @@ size_t required_size = frame.width * frame.height * 4; - **VavCore C API 28개 ν•¨μˆ˜**: Android NDK JNIλ₯Ό ν†΅ν•œ μ™„μ „ν•œ λ„€μ΄ν‹°λΈŒ 톡합 - **16KB νŽ˜μ΄μ§€ ν˜Έν™˜μ„±**: Google Play Android 15+ ν˜Έν™˜μ„± 보μž₯ -### **ν˜„μž¬ μ§„ν–‰ 쀑** -- **Video Player Overlay UI**: ν”„λ‘œλ•μ…˜ 레벨 μ‚¬μš©μž κ²½ν—˜ ν–₯상 (파일 선택, 제슀처 컨트둀, μ§„ν–‰λ°”, μ˜€λ²„λ ˆμ΄ UI) πŸ”„ - ### **ν™œμ„± 섀계 λ¬Έμ„œ** - [**VavCore Godot Integration**](VavCore_Godot_Integration_Design.md) - Godot 4.4.1 C# Extension κ΅¬ν˜„ ν˜„ν™© βœ… - [**Android Vulkan AV1 Player 
Design**](docs/completed/android/Android_Vulkan_AV1_Player_Design.md) - Android Vulkan Surface 기반 AV1 Player 섀계 βœ… @@ -83,9 +131,17 @@ size_t required_size = frame.width * frame.height * 4; --- -## βœ… **μ΅œμ‹  μ™„λ£Œ μž‘μ—…: μ£Όμš” λ§ˆμΌμŠ€ν†€ 달성** (2025-09-30) +## βœ… **μ΅œμ‹  μ™„λ£Œ μž‘μ—…: μ£Όμš” λ§ˆμΌμŠ€ν†€ 달성** (2025-10-01) -### **🎯 2025λ…„ 9μ›” μ΅œμ’… 핡심 μ„±κ³Ό** +### **🎯 2025λ…„ 10μ›” μ΅œμ‹  μ„±κ³Ό** +- [**VideoPlayerControl2 λ¦¬νŒ©ν† λ§ μ™„λ£Œ**](docs/completed/windows/VideoPlayerControl2_Refactoring_Plan_2025-10-01.md) - λͺ¨λ“ˆν™” μ•„ν‚€ν…μ²˜ μ™„μ„± βœ… πŸ”΄ **NEW** + - **PlaybackController μ™„μ „ κ΅¬ν˜„**: VavCore 생λͺ…μ£ΌκΈ°, 타이밍 μŠ€λ ˆλ“œ, μž¬μƒ μƒνƒœ λ¨Έμ‹  (300 lines) + - **FrameProcessor μ™„μ „ κ΅¬ν˜„**: λ°±κ·ΈλΌμš΄λ“œ λ””μ½”λ“œ, ν”„λ ˆμž„ 쑰절, UI λ Œλ”λ§ 콜백 (250 lines) + - **VideoPlayerControl2 톡합**: SwapChainPanel, AspectFit, WinRT μΈν„°νŽ˜μ΄μŠ€ (400 lines) + - **50% μ½”λ“œ κ°μ†Œ**: 1,890 lines β†’ 950 lines (μ‹€μš©μ  λͺ¨λ“ˆν™”) + - **MainVideoPage ꡐ체**: VideoPlayerControl β†’ VideoPlayerControl2 μ „ν™˜ μ™„λ£Œ + +### **🎯 2025λ…„ 9μ›” 핡심 μ„±κ³Ό** - [**Android Vulkan AV1 Player μ™„μ „ μ™„μ„±**](vav2/docs/completed/android/Android_Vulkan_AV1_Player_Design.md) - Samsung Galaxy S24 μ΅œμ ν™” μ™„λ£Œ βœ… - [**MediaCodec ν‚€μ›Œλ“œ 기반 디코더 선택 μ‹œμŠ€ν…œ**](vav2/platforms/android/vavcore/src/Decoder/AndroidMediaCodecAV1Decoder.cpp) - 크둜슀 벀더 ν˜Έν™˜μ„± 확보 βœ… - [**Vulkan 1.1 λ Œλ”λ§ νŒŒμ΄ν”„λΌμΈ μ™„μ„±**](platforms/android/applications/vav2player/app/src/main/cpp/vulkan_renderer.cpp) - YUV to RGB GPU 쉐이더 μ™„λ£Œ βœ… diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml b/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml index 768e373..a06a030 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml @@ -17,8 +17,9 @@ - - + + + diff --git 
a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp index d040db2..09dca96 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp @@ -530,13 +530,16 @@ namespace winrt::Vav2Player::implementation if (result == VAVCORE_SUCCESS) { OutputDebugStringA("[VideoPlayerControl] Decode SUCCESS, enqueuing render...\n"); + // Capture fence value for GPU synchronization + uint64_t fenceValue = vavFrame.sync_fence_value; + // Render + Present on UI thread (lightweight, thread-safe) // CRITICAL: Keep m_frameProcessing = true until render completes // to prevent NVDEC surface queue overflow - auto enqueued = strongThis->DispatcherQueue().TryEnqueue([strongThis, gpuRenderer]() { + auto enqueued = strongThis->DispatcherQueue().TryEnqueue([strongThis, gpuRenderer, fenceValue]() { OutputDebugStringA("[VideoPlayerControl] Render callback executing...\n"); if (strongThis->m_isPlaying) { - HRESULT hr = gpuRenderer->RenderNV12TextureToBackBuffer(); + HRESULT hr = gpuRenderer->RenderNV12TextureToBackBuffer(fenceValue); if (SUCCEEDED(hr)) { OutputDebugStringA("[VideoPlayerControl] Render SUCCESS\n"); } else { @@ -691,7 +694,7 @@ namespace winrt::Vav2Player::implementation // Render NV12 texture to back buffer (YUV to RGB conversion) // NOTE: RenderNV12TextureToBackBuffer() internally executes command list, // signals fence, and advances frame index - no separate Present() needed - HRESULT renderHr = gpuRenderer->RenderNV12TextureToBackBuffer(); + HRESULT renderHr = gpuRenderer->RenderNV12TextureToBackBuffer(vavFrame.sync_fence_value); if (FAILED(renderHr)) { LogMgr::GetInstance().LogError(L"Failed to render NV12 texture to back buffer", L"VideoPlayerControl"); } diff --git 
a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml index 4efd162..65e8200 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml @@ -20,10 +20,11 @@ + + HorizontalAlignment="Stretch" + VerticalAlignment="Stretch"/> (); - m_frameProcessor = std::make_unique<::Vav2Player::FrameProcessor>(); + // Create core components + OutputDebugStringA("[VideoPlayerControl2] Creating PlaybackController...\n"); + m_playbackController = std::make_unique<::Vav2Player::PlaybackController>(); + OutputDebugStringA("[VideoPlayerControl2] PlaybackController created\n"); - LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Core components created"); + OutputDebugStringA("[VideoPlayerControl2] Creating FrameProcessor...\n"); + m_frameProcessor = std::make_unique<::Vav2Player::FrameProcessor>(); + OutputDebugStringA("[VideoPlayerControl2] FrameProcessor created\n"); + + LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Core components created"); + OutputDebugStringA("[VideoPlayerControl2] Constructor END (SUCCESS)\n"); + } + catch (const std::exception& e) { + std::string error = "[VideoPlayerControl2] Constructor EXCEPTION: "; + error += e.what(); + OutputDebugStringA(error.c_str()); + OutputDebugStringA("\n"); + throw; + } + catch (...) 
{ + OutputDebugStringA("[VideoPlayerControl2] Constructor UNKNOWN EXCEPTION\n"); + throw; + } } VideoPlayerControl2::~VideoPlayerControl2() @@ -96,6 +116,14 @@ namespace winrt::Vav2Player::implementation void VideoPlayerControl2::UserControl_SizeChanged(IInspectable const&, SizeChangedEventArgs const& e) { auto newSize = e.NewSize(); + auto oldSize = e.PreviousSize(); + + // Ignore trivial size changes (< 1 pixel) to avoid unnecessary work + if (std::abs(newSize.Width - oldSize.Width) < 1.0 && + std::abs(newSize.Height - oldSize.Height) < 1.0) { + return; + } + LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"SizeChanged: " + std::to_wstring((int)newSize.Width) + L"x" + std::to_wstring((int)newSize.Height)); @@ -143,8 +171,21 @@ namespace winrt::Vav2Player::implementation { if (m_videoSource != value) { m_videoSource = value; + if (!value.empty()) { - LoadVideo(value); + if (m_initialized) { + LoadVideo(value); + } else { + LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", + L"VideoSource set before initialization, will load on Loaded event"); + } + } else { + // Empty source - unload current video + if (m_playbackController && m_playbackController->IsLoaded()) { + Stop(); + m_playbackController->Unload(); + UpdateStatus(L"No video"); + } } } } @@ -172,25 +213,27 @@ namespace winrt::Vav2Player::implementation Vav2Player::VideoDecoderType VideoPlayerControl2::DecoderType() { if (!m_playbackController) { - return winrt::Vav2Player::VideoDecoderType::Auto; + return static_cast(0); // Auto } VavCoreDecoderType coreType = m_playbackController->GetDecoderType(); // Convert VavCoreDecoderType to VideoDecoderType + // Use switch on C enum (compile-time constant), return WinRT enum as integer + // WinRT enum values: Auto=0, NVDEC=1, VPL=2, AMF=3, DAV1D=4, MediaFoundation=5 switch (coreType) { case VAVCORE_DECODER_NVDEC: - return winrt::Vav2Player::VideoDecoderType::NVDEC; + return static_cast(1); // NVDEC case VAVCORE_DECODER_VPL: - return 
winrt::Vav2Player::VideoDecoderType::VPL; + return static_cast(2); // VPL case VAVCORE_DECODER_AMF: - return winrt::Vav2Player::VideoDecoderType::AMF; + return static_cast(3); // AMF case VAVCORE_DECODER_DAV1D: - return winrt::Vav2Player::VideoDecoderType::DAV1D; + return static_cast(4); // DAV1D case VAVCORE_DECODER_MEDIA_FOUNDATION: - return winrt::Vav2Player::VideoDecoderType::MediaFoundation; + return static_cast(5); // MediaFoundation default: - return winrt::Vav2Player::VideoDecoderType::Auto; + return static_cast(0); // Auto } } @@ -201,17 +244,19 @@ namespace winrt::Vav2Player::implementation } // Convert VideoDecoderType to VavCoreDecoderType + // WinRT enums are projected as integers: Auto=0, NVDEC=1, VPL=2, AMF=3, DAV1D=4, MediaFoundation=5 VavCoreDecoderType coreType = VAVCORE_DECODER_AUTO; + int32_t enumValue = static_cast(value); - if (value == winrt::Vav2Player::VideoDecoderType::NVDEC) { + if (enumValue == 1) { // NVDEC coreType = VAVCORE_DECODER_NVDEC; - } else if (value == winrt::Vav2Player::VideoDecoderType::VPL) { + } else if (enumValue == 2) { // VPL coreType = VAVCORE_DECODER_VPL; - } else if (value == winrt::Vav2Player::VideoDecoderType::AMF) { + } else if (enumValue == 3) { // AMF coreType = VAVCORE_DECODER_AMF; - } else if (value == winrt::Vav2Player::VideoDecoderType::DAV1D) { + } else if (enumValue == 4) { // DAV1D coreType = VAVCORE_DECODER_DAV1D; - } else if (value == winrt::Vav2Player::VideoDecoderType::MediaFoundation) { + } else if (enumValue == 5) { // MediaFoundation coreType = VAVCORE_DECODER_MEDIA_FOUNDATION; } @@ -237,38 +282,59 @@ namespace winrt::Vav2Player::implementation m_playbackController->Stop(); } - // Load video file + // --- Start of Corrected Initialization Order --- + + // 1. 
Pass D3D12 device to the PlaybackController (with null check) + if (m_gpuRenderer) { + ID3D12Device* d3dDevice = m_gpuRenderer->GetD3D12Device(); + if (d3dDevice) { + m_playbackController->SetD3DDevice(d3dDevice, VAVCORE_SURFACE_D3D12_RESOURCE); + LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 device will be set by PlaybackController."); + } else { + LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", L"D3D12 device is null"); + } + } else { + LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", + L"Cannot set D3D device: renderer not initialized"); + } + + // 2. Load the video file. VavCore will now see the pending D3D device + // and initialize the decoder in the correct GPU-accelerated mode. std::wstring filePathStr(filePath.c_str()); bool success = m_playbackController->LoadVideo(filePathStr); if (success) { - // Get video dimensions and create NV12 texture + // 3. Get video dimensions (which are now available). uint32_t videoWidth = m_playbackController->GetVideoWidth(); uint32_t videoHeight = m_playbackController->GetVideoHeight(); LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight)); - // Create NV12 texture for CUDA interop + // 4. Create the NV12 texture for zero-copy, now that we have the dimensions. 
if (m_gpuRenderer) { HRESULT hr = m_gpuRenderer->CreateNV12TextureR8Layout(videoWidth, videoHeight); if (SUCCEEDED(hr)) { LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"NV12 texture created"); - - // Pass D3D12 device to VavCore for CUDA-D3D12 interop - ID3D12Device* d3dDevice = m_gpuRenderer->GetD3D12Device(); - if (d3dDevice) { - VavCorePlayer* player = m_playbackController->GetVavCorePlayer(); - if (player) { - vavcore_set_d3d_device(player, d3dDevice, VAVCORE_SURFACE_D3D12_RESOURCE); - LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 device passed to VavCore"); - } - } } else { LogMgr::GetInstance().LogError(L"VideoPlayerControl2", L"Failed to create NV12 texture"); } } + // 5. Get D3D12 fence from VavCore and pass it to SimpleGPURenderer for GPU synchronization + VavCorePlayer* player = m_playbackController->GetVavCorePlayer(); + if (player && m_gpuRenderer) { + void* syncFence = vavcore_get_sync_fence(player); + if (syncFence) { + m_gpuRenderer->SetSyncFence(syncFence); + LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 fence set for GPU synchronization"); + } else { + LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", L"No sync fence available from VavCore"); + } + } + + // --- End of Corrected Initialization Order --- + // Update AspectFit UpdateVideoImageAspectFit(videoWidth, videoHeight); @@ -280,6 +346,12 @@ namespace winrt::Vav2Player::implementation } } else { LogMgr::GetInstance().LogError(L"VideoPlayerControl2", L"Failed to load video"); + + // Cleanup partial initialization on failure + if (m_gpuRenderer) { + m_gpuRenderer->ReleaseNV12Texture(); + } + UpdateStatus(L"Load failed"); } } @@ -371,9 +443,16 @@ namespace winrt::Vav2Player::implementation void VideoPlayerControl2::InitializeRenderer() { - if (m_gpuRenderer && m_gpuRenderer->IsInitialized()) { - LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Renderer already initialized"); - return; + // Cleanup existing renderer if not properly initialized 
+ if (m_gpuRenderer) { + if (m_gpuRenderer->IsInitialized()) { + LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Renderer already initialized"); + return; + } else { + LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", + L"Cleaning up partially initialized renderer"); + CleanupRenderer(); + } } LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Initializing renderer"); @@ -464,20 +543,53 @@ namespace winrt::Vav2Player::implementation void VideoPlayerControl2::OnFrameReady() { // Called from PlaybackController timing thread (30fps) - if (!m_frameProcessor || !m_playbackController) { + // Thread-safe: Check if still initialized to prevent race conditions during cleanup + if (!m_initialized) { + return; + } + + // Defensive: Get raw pointers atomically to avoid race with destruction + auto processor = m_frameProcessor.get(); + auto controller = m_playbackController.get(); + + if (!processor || !controller) { return; } // Process frame (decode on background, render on UI thread) - VavCorePlayer* player = m_playbackController->GetVavCorePlayer(); + VavCorePlayer* player = controller->GetVavCorePlayer(); if (!player) { return; } - m_frameProcessor->ProcessFrame(player, [](bool success) { - // Frame processing complete callback (optional) - if (!success) { - OutputDebugStringA("[VideoPlayerControl2] Frame processing failed\n"); + processor->ProcessFrame(player, [weakThis = get_weak()](bool success) { + if (auto strongThis = weakThis.get()) { + if (!success) { + OutputDebugStringA("[VideoPlayerControl2] Frame processing failed\n"); + + // Track consecutive errors for auto-recovery + static int consecutiveErrors = 0; + consecutiveErrors++; + + // Auto-recovery after too many consecutive errors + if (consecutiveErrors > 10) { + OutputDebugStringA("[VideoPlayerControl2] Too many consecutive errors, attempting recovery\n"); + + // Stop playback on UI thread + strongThis->DispatcherQueue().TryEnqueue([weakThis]() { + if (auto strongThis2 = weakThis.get()) { + 
strongThis2->Stop(); + strongThis2->UpdateStatus(L"Playback error - stopped"); + } + }); + + consecutiveErrors = 0; + } + } else { + // Reset error counter on success + static int consecutiveErrors = 0; + consecutiveErrors = 0; + } } }); } diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp index 22d20e0..1c24543 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp @@ -31,91 +31,29 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player, std::function onComplete) { if (!player || !m_renderer || !m_dispatcherQueue) { - std::cerr << "[FrameProcessor] Invalid state: player=" - << player << ", renderer=" << m_renderer - << ", dispatcherQueue=" << (m_dispatcherQueue ? "valid" : "null") << std::endl; + OutputDebugStringA("[FrameProcessor] Invalid state: missing player/renderer/dispatcherQueue\n"); return false; } // Check if previous frame is still processing - // This prevents NVDEC surface queue overflow bool expected = false; if (!m_frameProcessing.compare_exchange_strong(expected, true)) { - // Previous frame still processing, drop this frame m_framesDropped++; OutputDebugStringA("[FrameProcessor] Frame dropped (previous frame still processing)\n"); return false; } - // Decode on current (background) thread - bool decodeSuccess = DecodeFrameToNV12(player); - if (!decodeSuccess) { - m_decodeErrors++; - m_frameProcessing.store(false); - - if (onComplete) { - onComplete(false); - } - - return false; - } - - m_framesDecoded++; - OutputDebugStringA("[FrameProcessor] Frame decoded successfully, enqueuing render...\n"); - - // Render on UI thread (lightweight D3D12 Present) - // CRITICAL: Keep m_frameProcessing = true until render completes - bool enqueued = 
m_dispatcherQueue.TryEnqueue([this, onComplete]() { - OutputDebugStringA("[FrameProcessor] Render callback executing on UI thread...\n"); - - bool renderSuccess = RenderNV12ToScreen(); - if (!renderSuccess) { - m_renderErrors++; - } - - // Mark frame processing complete AFTER render - m_frameProcessing.store(false); - - if (onComplete) { - onComplete(renderSuccess); - } - - if (renderSuccess) { - OutputDebugStringA("[FrameProcessor] Frame rendered successfully\n"); - } else { - OutputDebugStringA("[FrameProcessor] Frame render FAILED\n"); - } - }); - - if (!enqueued) { - std::cerr << "[FrameProcessor] Failed to enqueue render callback" << std::endl; - m_frameProcessing.store(false); - - if (onComplete) { - onComplete(false); - } - - return false; - } - - return true; -} - -bool FrameProcessor::DecodeFrameToNV12(VavCorePlayer* player) -{ - if (!player || !m_renderer) { - return false; - } - - // Get NV12 texture from renderer (CUDA-D3D12 interop target) + // Get NV12 texture from renderer ID3D12Resource* nv12Texture = m_renderer->GetNV12TextureForCUDAInterop(); if (!nv12Texture) { - std::cerr << "[FrameProcessor] NV12 texture not available" << std::endl; + OutputDebugStringA("[FrameProcessor] Failed to get NV12 texture\n"); + m_frameProcessing.store(false); + if (onComplete) onComplete(false); return false; } - // Decode directly to D3D12 NV12 texture (zero-copy) - VavCoreVideoFrame vavFrame; + // Decode frame to D3D12 surface (blocking) + VavCoreVideoFrame vavFrame = {}; VavCoreResult result = vavcore_decode_to_surface( player, VAVCORE_SURFACE_D3D12_RESOURCE, @@ -123,33 +61,48 @@ bool FrameProcessor::DecodeFrameToNV12(VavCorePlayer* player) &vavFrame ); - if (result == VAVCORE_SUCCESS) { - OutputDebugStringA("[FrameProcessor] Decode to NV12 surface SUCCESS\n"); - return true; - } else if (result == VAVCORE_END_OF_STREAM) { - OutputDebugStringA("[FrameProcessor] End of stream reached\n"); - return false; - } else { - char buf[256]; - sprintf_s(buf, "[FrameProcessor] 
Decode failed with error: %d\n", result); - OutputDebugStringA(buf); - return false; - } -} - -bool FrameProcessor::RenderNV12ToScreen() -{ - if (!m_renderer) { + if (result != VAVCORE_SUCCESS) { + if (result != VAVCORE_END_OF_STREAM) { + m_decodeErrors++; + OutputDebugStringA("[FrameProcessor] Decode error\n"); + } + m_frameProcessing.store(false); + if (onComplete) onComplete(false); return false; } - // Render NV12 texture to back buffer (YUV β†’ RGB conversion + Present) - HRESULT hr = m_renderer->RenderNV12TextureToBackBuffer(); + m_framesDecoded++; - if (FAILED(hr)) { - char buf[256]; - sprintf_s(buf, "[FrameProcessor] Render failed with HRESULT: 0x%08X\n", hr); - OutputDebugStringA(buf); + // Enqueue render on UI thread with fence value for GPU sync + uint64_t fenceValue = vavFrame.sync_fence_value; + OutputDebugStringA("[FrameProcessor] Attempting to enqueue render on UI thread...\n"); + + bool enqueued = m_dispatcherQueue.TryEnqueue([this, fenceValue, onComplete]() { + OutputDebugStringA("[FrameProcessor] Render callback executing on UI thread\n"); + HRESULT hr = m_renderer->RenderNV12TextureToBackBuffer(fenceValue); + bool renderSuccess = SUCCEEDED(hr); + + if (!renderSuccess) { + m_renderErrors++; + char buf[256]; + sprintf_s(buf, "[FrameProcessor] Render error: HRESULT = 0x%08X\n", hr); + OutputDebugStringA(buf); + } else { + OutputDebugStringA("[FrameProcessor] Render succeeded\n"); + } + + // Mark frame processing complete + m_frameProcessing.store(false); + + if (onComplete) { + onComplete(renderSuccess); + } + }); + + if (!enqueued) { + OutputDebugStringA("[FrameProcessor] Failed to enqueue render\n"); + m_frameProcessing.store(false); + if (onComplete) onComplete(false); return false; } diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h index 6144995..c8903c3 100644 --- 
a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h @@ -53,10 +53,6 @@ private: std::atomic m_framesDropped{0}; std::atomic m_decodeErrors{0}; std::atomic m_renderErrors{0}; - - // Helper methods - bool DecodeFrameToNV12(VavCorePlayer* player); - bool RenderNV12ToScreen(); }; } // namespace Vav2Player diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp index 980db6b..f47b42d 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp @@ -7,14 +7,18 @@ namespace Vav2Player { PlaybackController::PlaybackController() { - InitializeVavCore(); + // NOTE: VavCore is initialized globally in App.xaml.cpp + // Do NOT call vavcore_initialize() here to avoid duplicate initialization + OutputDebugStringA("[PlaybackController] Constructor called (VavCore already initialized)\n"); } PlaybackController::~PlaybackController() { + OutputDebugStringA("[PlaybackController] Destructor called\n"); Stop(); Unload(); - CleanupVavCore(); + // NOTE: VavCore cleanup is handled globally in App.xaml.cpp + // Do NOT call vavcore_cleanup() here } bool PlaybackController::InitializeVavCore() @@ -35,6 +39,12 @@ void PlaybackController::CleanupVavCore() std::cout << "[PlaybackController] VavCore cleaned up" << std::endl; } +void PlaybackController::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) +{ + m_d3dDevice = d3d_device; + m_d3dSurfaceType = type; +} + bool PlaybackController::LoadVideo(const std::wstring& filePath) { // Unload previous video if any @@ -49,12 +59,20 @@ bool PlaybackController::LoadVideo(const std::wstring& filePath) return false; } + // Set 
decoder type before opening file + vavcore_set_decoder_type(m_vavCorePlayer, m_decoderType); + + // Set D3D device before opening file, if it was provided + if (m_d3dDevice) { + vavcore_set_d3d_device(m_vavCorePlayer, m_d3dDevice, m_d3dSurfaceType); + } + // Convert wstring to UTF-8 string int size_needed = WideCharToMultiByte(CP_UTF8, 0, filePath.c_str(), -1, nullptr, 0, nullptr, nullptr); std::string utf8FilePath(size_needed - 1, 0); WideCharToMultiByte(CP_UTF8, 0, filePath.c_str(), -1, &utf8FilePath[0], size_needed, nullptr, nullptr); - // Open video file + // Open video file (this will initialize the decoder) VavCoreResult result = vavcore_open_file(m_vavCorePlayer, utf8FilePath.c_str()); if (result != VAVCORE_SUCCESS) { std::cerr << "[PlaybackController] Failed to open file: " << result << std::endl; @@ -63,9 +81,6 @@ bool PlaybackController::LoadVideo(const std::wstring& filePath) return false; } - // Set decoder type - vavcore_set_decoder_type(m_vavCorePlayer, m_decoderType); - // Get video metadata VavCoreVideoMetadata metadata; result = vavcore_get_metadata(m_vavCorePlayer, &metadata); diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h index 91146d6..b8649c6 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h @@ -51,10 +51,17 @@ public: void SetDecoderType(VavCoreDecoderType type); VavCoreDecoderType GetDecoderType() const { return m_decoderType; } + // D3D device configuration (for GPU acceleration) + void SetD3DDevice(void* d3d_device, VavCoreSurfaceType type); + private: // VavCore player instance VavCorePlayer* m_vavCorePlayer = nullptr; + // D3D device information (pending) + void* m_d3dDevice = nullptr; + VavCoreSurfaceType m_d3dSurfaceType = 
VAVCORE_SURFACE_CPU; + // Playback state std::atomic m_isPlaying{false}; std::atomic m_isLoaded{false}; diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp index 0d3929c..291972a 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp @@ -116,6 +116,9 @@ void SimpleGPURenderer::Shutdown() // Wait for GPU to finish WaitForGPU(); + // Release sync fence (note: VavCore owns the fence, we just clear our reference) + m_syncFence = nullptr; + // Reset COM objects m_computePipelineState.Reset(); m_computeRootSignature.Reset(); @@ -135,6 +138,9 @@ void SimpleGPURenderer::Shutdown() } m_constantBuffer.Reset(); + // Release NV12 texture + m_nv12Texture.Reset(); + for (UINT i = 0; i < FrameCount; i++) { m_renderTargets[i].Reset(); @@ -151,6 +157,31 @@ void SimpleGPURenderer::Shutdown() m_initialized = false; } +// Set the shared fence for CUDA-D3D12 synchronization +// Note: VavCore owns the fence lifecycle, we just store a reference +void SimpleGPURenderer::SetSyncFence(void* fence) +{ + // Release previous fence reference if exists + if (m_syncFence != nullptr) { + OutputDebugStringA("[SimpleGPURenderer::SetSyncFence] WARNING: Replacing existing fence\n"); + } + + m_syncFence = fence; + + if (fence) { + OutputDebugStringA("[SimpleGPURenderer::SetSyncFence] Sync fence set successfully\n"); + } +} + +// Release NV12 texture (cleanup partial initialization) +void SimpleGPURenderer::ReleaseNV12Texture() +{ + if (m_nv12Texture) { + OutputDebugStringA("[SimpleGPURenderer::ReleaseNV12Texture] Releasing NV12 texture\n"); + m_nv12Texture.Reset(); + } +} + HRESULT SimpleGPURenderer::RenderVideoFrame(const VavCoreVideoFrame& frame) { if (!m_initialized) @@ -289,11 +320,21 @@ HRESULT 
SimpleGPURenderer::CreateSwapChain(winrt::Microsoft::UI::Xaml::Controls: if (FAILED(hr)) return hr; // Associate with SwapChainPanel + OutputDebugStringA("[SimpleGPURenderer::CreateSwapChain] Associating SwapChain with SwapChainPanel...\n"); auto panelNative = panel.as(); hr = panelNative->SetSwapChain(m_swapChain.Get()); - if (FAILED(hr)) return hr; + if (FAILED(hr)) { + char buf[256]; + sprintf_s(buf, "[SimpleGPURenderer::CreateSwapChain] Failed to set SwapChain: HRESULT = 0x%08X\n", hr); + OutputDebugStringA(buf); + return hr; + } + OutputDebugStringA("[SimpleGPURenderer::CreateSwapChain] SwapChain successfully associated with SwapChainPanel\n"); m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); + char buf[256]; + sprintf_s(buf, "[SimpleGPURenderer::CreateSwapChain] Current back buffer index: %u\n", m_frameIndex); + OutputDebugStringA(buf); return S_OK; } @@ -754,10 +795,10 @@ float4 PSMain(PSInput input) : SV_TARGET // Sample UV from plane 1 (half resolution, interleaved) float2 uv = g_uvPlane.Sample(g_sampler, input.texcoord); - float u = uv.x; - float v = uv.y; + float u = uv.x; // U component + float v = uv.y; // V component - // Convert from [0,1] to YUV range (BT.709) + // Convert from [0,1] to YUV range (Limited Range BT.709) y = (y * 255.0 - 16.0) / 219.0; u = (u * 255.0 - 128.0) / 224.0; v = (v * 255.0 - 128.0) / 224.0; @@ -973,20 +1014,31 @@ HRESULT SimpleGPURenderer::CreateNV12GraphicsPipeline() return S_OK; } -HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() +HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer(uint64_t fenceValue) { if (!m_nv12Texture || !m_initialized) { std::cout << "[SimpleGPURenderer::RenderNV12Texture] ERROR: NV12 texture or renderer not initialized" << std::endl; return E_FAIL; } + // Wait for the decoder to finish writing to the texture on the GPU + // Note: Skip wait if fenceValue is 0 (D3D12 fence values start from 1) + if (m_syncFence && fenceValue > 0) { + 
m_commandQueue->Wait(static_cast(m_syncFence), fenceValue); + } + + // Initialize NV12 graphics pipeline on first call if (!m_nv12PipelineState) { + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Creating NV12 graphics pipeline...\n"); HRESULT hr = CreateNV12GraphicsPipeline(); if (FAILED(hr)) { + char buf[256]; + sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] Failed to create NV12 graphics pipeline: HRESULT = 0x%08X\n", hr); + OutputDebugStringA(buf); std::cout << "[SimpleGPURenderer::RenderNV12Texture] Failed to create NV12 graphics pipeline" << std::endl; return hr; } + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] NV12 graphics pipeline created successfully\n"); // Create 2 SRVs for NV12 texture planes (native format) CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_nv12SrvHeap->GetCPUDescriptorHandleForHeapStart()); @@ -1012,6 +1064,7 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() uvPlaneSrvDesc.Texture2D.PlaneSlice = 1; // UV plane m_device->CreateShaderResourceView(m_nv12Texture.Get(), &uvPlaneSrvDesc, srvHandle); + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] NV12 SRVs created (Y + UV planes)\n"); std::cout << "[SimpleGPURenderer::RenderNV12Texture] NV12 native format SRVs created (Y + UV planes)" << std::endl; } @@ -1048,13 +1101,11 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() m_nv12ConstantBuffer->Unmap(0, nullptr); } - // Add UAV barrier to ensure CUDA write completion before D3D12 read - D3D12_RESOURCE_BARRIER uavBarrier = {}; - uavBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; - uavBarrier.UAV.pResource = m_nv12Texture.Get(); - m_commandList->ResourceBarrier(1, &uavBarrier); + // NOTE(review): cross-API synchronization is handled by the m_commandQueue->Wait() on m_syncFence at the top of this function; + // VavCore signals that shared fence when its CUDA copies complete, so the CUDA writes are ordered before this D3D12 work on the GPU. + // Therefore, no additional GPU-side synchronization barrier (UAV or Aliasing) is needed here.
- // Transition NV12 texture (R8 layout) from COMMON to PIXEL_SHADER_RESOURCE + // Transition NV12 texture from COMMON (where CUDA left it) to PIXEL_SHADER_RESOURCE for reading. D3D12_RESOURCE_BARRIER nv12TextureBarrier = {}; nv12TextureBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; nv12TextureBarrier.Transition.pResource = m_nv12Texture.Get(); @@ -1098,7 +1149,9 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() m_commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); // Draw full-screen quad (4 vertices, triangle strip) + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Calling DrawInstanced(4, 1, 0, 0)...\n"); m_commandList->DrawInstanced(4, 1, 0, 0); + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] DrawInstanced completed\n"); // Transition back buffer from RENDER_TARGET to PRESENT D3D12_RESOURCE_BARRIER rtToPresentBarrier = {}; @@ -1115,11 +1168,20 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() m_commandList->ResourceBarrier(1, &nv12TextureBarrier); // Close and execute command list + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Closing command list...\n"); hr = m_commandList->Close(); - if (FAILED(hr)) return hr; + if (FAILED(hr)) { + char buf[256]; + sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] CommandList Close failed: HRESULT = 0x%08X\n", hr); + OutputDebugStringA(buf); + return hr; + } + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] CommandList closed successfully\n"); + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Executing command list on GPU...\n"); ID3D12CommandList* commandLists[] = { m_commandList.Get() }; m_commandQueue->ExecuteCommandLists(1, commandLists); + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Command list executed\n"); // Signal completion fence BEFORE Present() const UINT64 currentFrameFenceValue = ++m_fenceValue; @@ -1128,11 +1190,16 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer() 
if (FAILED(hr)) return hr; + // Present the frame to screen (VSync enabled) + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Calling Present()...\n"); hr = m_swapChain->Present(1, 0); if (FAILED(hr)) { + char buf[256]; + sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] Present failed: HRESULT = 0x%08X\n", hr); + OutputDebugStringA(buf); std::cout << "[SimpleGPURenderer::RenderNV12Texture] Present failed: 0x" << std::hex << hr << std::endl; return hr; } + OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Present succeeded\n"); // Advance to next frame (swapchain selects next back buffer) m_frameIndex = m_swapChain->GetCurrentBackBufferIndex(); diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h index 2f72855..663a361 100644 --- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h +++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h @@ -59,12 +59,19 @@ public: // Returns NV12 texture - native DXGI_FORMAT_NV12 for proper CUDA interop ID3D12Resource* GetNV12TextureForCUDAInterop() const { return m_nv12Texture.Get(); } + // Set the shared fence for CUDA-D3D12 synchronization + // Non-owning reference — VavCore owns the fence lifecycle (this class must never Release or delete it) + void SetSyncFence(void* fence); + + // Release NV12 texture (cleanup partial initialization) + void ReleaseNV12Texture(); + // Create NV12 texture for CUDA-D3D12 interop // Format: DXGI_FORMAT_NV12 (native 2-plane YUV format) HRESULT CreateNV12TextureR8Layout(uint32_t videoWidth, uint32_t videoHeight); // Render NV12 texture to back buffer (YUV to RGB conversion on GPU) - HRESULT RenderNV12TextureToBackBuffer(); + HRESULT RenderNV12TextureToBackBuffer(uint64_t fenceValue); private: // D3D12 core objects @@ -73,6 +80,9 @@ private: ComPtr m_swapChain; ComPtr m_rtvHeap; + //
Synchronization with VavCore (CUDA) + void* m_syncFence = nullptr; // ID3D12Fence* + // Command objects - Double buffering (changed from 3 for performance test) static const UINT FrameCount = 2; ComPtr m_commandAllocators[FrameCount]; diff --git a/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h b/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h index c53041b..031753a 100644 --- a/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h +++ b/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h @@ -87,6 +87,9 @@ typedef struct { uint64_t timestamp_us; // Timestamp in microseconds uint64_t frame_number; // Frame sequence number + // GPU synchronization + uint64_t sync_fence_value; // Fence value to wait on before using the surface + // New D3D surface fields VavCoreSurfaceType surface_type; union { @@ -249,6 +252,7 @@ VAVCORE_API VavCoreResult vavcore_decode_to_surface(VavCorePlayer* player, // Platform-specific Graphics API setup functions // Windows VAVCORE_API VavCoreResult vavcore_set_d3d_device(VavCorePlayer* player, void* d3d_device, VavCoreSurfaceType type); +VAVCORE_API void* vavcore_get_sync_fence(VavCorePlayer* player); // Returns ID3D12Fence* // Android VAVCORE_API VavCoreResult vavcore_set_android_surface(VavCorePlayer* player, void* native_window); diff --git a/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h b/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h index 4caeedb..6c56624 100644 --- a/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h +++ b/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h @@ -98,6 +98,9 @@ struct VideoFrame { bool is_keyframe = false; bool is_valid = false; + // GPU Synchronization + uint64_t sync_fence_value = 0; + // Constructor VideoFrame() = default; diff --git a/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h b/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h index 55504ef..d8c9d4f 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h +++ 
b/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h @@ -44,6 +44,11 @@ public: return false; // Default implementation: D3D not supported } + // Return shared D3D12 fence for synchronization + virtual void* GetSyncFence() { + return nullptr; // Default implementation: No fence available + } + // Android virtual bool SetAndroidSurface(void* native_window) { return false; // Default implementation: Android surfaces not supported diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index a667bc2..17fa8be 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -20,6 +20,90 @@ #include "NVDECAV1Decoder.h" #include "VideoDecoderFactory.h" +const char* g_deinterleave_kernel_ptx = R"PTX( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-36424714 +// Cuda compilation tools, release 13.0, V13.0.88 +// Based on NVVM 7.0.1 +// + +.version 9.0 +.target sm_75 +.address_size 64 + + // .globl DeinterleaveUV_Kernel + +.visible .entry DeinterleaveUV_Kernel( + .param .u64 DeinterleaveUV_Kernel_param_0, + .param .u32 DeinterleaveUV_Kernel_param_1, + .param .u64 DeinterleaveUV_Kernel_param_2, + .param .u32 DeinterleaveUV_Kernel_param_3, + .param .u64 DeinterleaveUV_Kernel_param_4, + .param .u32 DeinterleaveUV_Kernel_param_5, + .param .u32 DeinterleaveUV_Kernel_param_6, + .param .u32 DeinterleaveUV_Kernel_param_7 +) +{ + .reg .pred %p<4>; + .reg .b16 %rs<3>; + .reg .b32 %r<18>; + .reg .b64 %rd<18>; + + + ld.param.u64 %rd1, [DeinterleaveUV_Kernel_param_0]; + ld.param.u32 %r3, [DeinterleaveUV_Kernel_param_1]; + ld.param.u64 %rd2, [DeinterleaveUV_Kernel_param_2]; + ld.param.u32 %r4, [DeinterleaveUV_Kernel_param_3]; + ld.param.u64 %rd3, [DeinterleaveUV_Kernel_param_4]; + ld.param.u32 %r5, [DeinterleaveUV_Kernel_param_5]; + ld.param.u32 %r6, [DeinterleaveUV_Kernel_param_6]; + 
ld.param.u32 %r7, [DeinterleaveUV_Kernel_param_7]; + mov.u32 %r8, %ntid.x; + mov.u32 %r9, %ctaid.x; + mov.u32 %r10, %tid.x; + mad.lo.s32 %r1, %r9, %r8, %r10; + mov.u32 %r11, %ntid.y; + mov.u32 %r12, %ctaid.y; + mov.u32 %r13, %tid.y; + mad.lo.s32 %r2, %r12, %r11, %r13; + setp.ge.s32 %p1, %r1, %r6; + setp.ge.s32 %p2, %r2, %r7; + or.pred %p3, %p1, %p2; + @%p3 bra $L__BB0_2; + + cvta.to.global.u64 %rd4, %rd2; + mul.lo.s32 %r14, %r2, %r3; + cvt.s64.s32 %rd5, %r14; + shl.b32 %r15, %r1, 1; + cvt.s64.s32 %rd6, %r15; + add.s64 %rd7, %rd5, %rd6; + cvta.to.global.u64 %rd8, %rd1; + add.s64 %rd9, %rd8, %rd7; + mul.lo.s32 %r16, %r2, %r4; + cvt.s64.s32 %rd10, %r16; + cvt.s64.s32 %rd11, %r1; + add.s64 %rd12, %rd10, %rd11; + add.s64 %rd13, %rd4, %rd12; + mul.lo.s32 %r17, %r2, %r5; + cvt.s64.s32 %rd14, %r17; + add.s64 %rd15, %rd14, %rd11; + cvta.to.global.u64 %rd16, %rd3; + add.s64 %rd17, %rd16, %rd15; + ld.global.u8 %rs1, [%rd9]; + st.global.u8 [%rd13], %rs1; + ld.global.u8 %rs2, [%rd9+1]; + st.global.u8 [%rd17], %rs2; + +$L__BB0_2: + ret; + +} + + +)PTX"; + namespace VavCore { NVDECAV1Decoder::NVDECAV1Decoder() @@ -73,6 +157,22 @@ bool NVDECAV1Decoder::Initialize(const VideoMetadata& metadata) { return false; } + // Load the PTX module for the deinterleave kernel + CUresult result = cuModuleLoadData(&m_module, g_deinterleave_kernel_ptx); + if (result != CUDA_SUCCESS) { + LogError("Failed to load CUDA module from PTX"); + LogCUDAError(result, "cuModuleLoadData"); + Cleanup(); + return false; + } + result = cuModuleGetFunction(&m_kernel, m_module, "DeinterleaveUV_Kernel"); + if (result != CUDA_SUCCESS) { + LogError("Failed to get CUDA kernel function from module"); + LogCUDAError(result, "cuModuleGetFunction"); + Cleanup(); + return false; + } + m_initialized = true; std::cout << "[NVDECAV1Decoder] Initialized successfully" << std::endl; @@ -92,6 +192,21 @@ void NVDECAV1Decoder::Cleanup() { m_cachedD3D12Resource = nullptr; } + // Clean up D3D12 synchronization objects + if 
(m_cudaSemaphore != nullptr) { + // NOTE(review): per CUDA docs, importing a Win32 NT handle does NOT transfer ownership to CUDA, + // so destroying the semaphore does not close the shared fence handle + cudaDestroyExternalSemaphore(m_cudaSemaphore); + m_cudaSemaphore = nullptr; + } + // TODO(review): m_d3d12FenceSharedHandle is never closed — CloseHandle() is required here to avoid a handle leak + m_d3d12FenceSharedHandle = nullptr; + + if (m_d3d12Fence != nullptr) { + static_cast(m_d3d12Fence)->Release(); + m_d3d12Fence = nullptr; + } + if (m_parser) { cuvidDestroyVideoParser(m_parser); m_parser = nullptr; @@ -102,6 +217,11 @@ void NVDECAV1Decoder::Cleanup() { m_decoder = nullptr; } + if (m_module) { cuModuleUnload(m_module); m_module = nullptr; + } + CleanupCUDA(); m_initialized = false; } @@ -150,12 +270,12 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size int frameIdx = -1; { std::unique_lock lock(m_frameQueueMutex); - m_frameAvailable.wait(lock, [this] { - return !m_decodedFrameIndices.empty(); - }); + if (!m_frameAvailable.wait_for(lock, std::chrono::milliseconds(100), [this] { return !m_decodedFrameIndices.empty(); })) { + LogError("Timeout waiting for decoded frame"); + return false; // Timeout + } frameIdx = m_decodedFrameIndices.front(); m_decodedFrameIndices.pop(); - OutputDebugStringA("[DecodeFrame] Frame retrieved from queue\n"); } // Map the decoded frame from GPU memory @@ -172,14 +292,16 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size // Allocate CPU memory for YUV planes size_t y_size = m_width * m_height; - size_t uv_size = (m_width / 2) * (m_height / 2); + int uv_width = m_width / 2; + int uv_height = m_height / 2; + size_t uv_size = static_cast(uv_width) * uv_height; output_frame.y_plane.reset(new uint8_t[y_size]); output_frame.u_plane.reset(new uint8_t[uv_size]); output_frame.v_plane.reset(new uint8_t[uv_size]); output_frame.y_stride = m_width; - output_frame.u_stride = m_width / 2; - output_frame.v_stride = m_width / 2; + 
output_frame.u_stride = uv_width; + output_frame.v_stride = uv_width; // Copy Y plane from GPU to CPU CUDA_MEMCPY2D copyParamsY = {}; @@ -199,39 +321,72 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size return false; } - // NV12 to YUV420P: Extract U and V from interleaved UV - // UV plane starts after Y plane in NV12 format - CUdeviceptr uvSrcPtr = srcDevicePtr + (srcPitch * m_height); - - // Copy UV plane to temporary buffer - size_t uv_pitch = m_width; - std::unique_ptr uv_temp(new uint8_t[uv_pitch * (m_height / 2)]); - - CUDA_MEMCPY2D copyParamsUV = {}; - copyParamsUV.srcMemoryType = CU_MEMORYTYPE_DEVICE; - copyParamsUV.srcDevice = uvSrcPtr; - copyParamsUV.srcPitch = srcPitch; - copyParamsUV.dstMemoryType = CU_MEMORYTYPE_HOST; - copyParamsUV.dstHost = uv_temp.get(); - copyParamsUV.dstPitch = uv_pitch; - copyParamsUV.WidthInBytes = m_width; // NV12 UV is interleaved - copyParamsUV.Height = m_height / 2; - - result = cuMemcpy2D(©ParamsUV); - if (result != CUDA_SUCCESS) { + // --- GPU-accelerated UV De-interleaving --- + uint8_t *d_u_plane = nullptr, *d_v_plane = nullptr; + cudaError_t cudaStatus = cudaMalloc(&d_u_plane, uv_size); + if (cudaStatus != cudaSuccess) { cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); - LogCUDAError(result, "cuMemcpy2D (UV plane)"); + LogError("cudaMalloc failed for U plane"); + return false; + } + cudaStatus = cudaMalloc(&d_v_plane, uv_size); + if (cudaStatus != cudaSuccess) { + cudaFree(d_u_plane); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + LogError("cudaMalloc failed for V plane"); return false; } - // Deinterleave UV to separate U and V planes - for (uint32_t i = 0; i < m_height / 2; i++) { - for (uint32_t j = 0; j < m_width / 2; j++) { - output_frame.u_plane[i * (m_width / 2) + j] = uv_temp[i * uv_pitch + j * 2]; - output_frame.v_plane[i * (m_width / 2) + j] = uv_temp[i * uv_pitch + j * 2 + 1]; - } + CUdeviceptr uvSrcPtr = srcDevicePtr + (srcPitch * m_height); + dim3 blockDim(16, 16); + dim3 
gridDim((uv_width + blockDim.x - 1) / blockDim.x, (uv_height + blockDim.y - 1) / blockDim.y); + + int u_pitch = uv_width; + int v_pitch = uv_width; + void* args[] = { &uvSrcPtr, &srcPitch, &d_u_plane, &u_pitch, &d_v_plane, &v_pitch, &uv_width, &uv_height }; + + result = cuLaunchKernel(m_kernel, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, 0, m_stream, args, nullptr); + if (result != CUDA_SUCCESS) { + cudaFree(d_u_plane); + cudaFree(d_v_plane); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + LogCUDAError(result, "cuLaunchKernel(DeinterleaveUV_Kernel)"); + return false; } + // Synchronize stream to ensure kernel completion before copy + result = cuStreamSynchronize(m_stream); + if (result != CUDA_SUCCESS) { + cudaFree(d_u_plane); + cudaFree(d_v_plane); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + LogCUDAError(result, "cuStreamSynchronize after DeinterleaveUV_Kernel"); + return false; + } + + // Copy separated U and V planes from GPU to CPU + cudaStatus = cudaMemcpy(output_frame.u_plane.get(), d_u_plane, uv_size, cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) { + cudaFree(d_u_plane); + cudaFree(d_v_plane); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + LogError("cudaMemcpy failed for U plane"); + return false; + } + + cudaStatus = cudaMemcpy(output_frame.v_plane.get(), d_v_plane, uv_size, cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) { + cudaFree(d_u_plane); + cudaFree(d_v_plane); + cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + LogError("cudaMemcpy failed for V plane"); + return false; + } + + // Cleanup temporary GPU buffers + cudaFree(d_u_plane); + cudaFree(d_v_plane); + // Unmap the frame cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); @@ -679,6 +834,74 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) { m_d3d12Device = d3d_device; m_preferredSurfaceType = type; + // Create synchronization objects for D3D12 interop if not already created + if (m_d3d12Fence == nullptr) { + 
OutputDebugStringA("[SetD3DDevice] Creating D3D12 Fence for GPU synchronization...\n"); + ID3D12Device* device = static_cast(m_d3d12Device); + HRESULT hr = device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(reinterpret_cast(&m_d3d12Fence))); + if (FAILED(hr)) { + LogError("Failed to create D3D12 Fence for CUDA interop"); + OutputDebugStringA("[SetD3DDevice] ERROR: D3D12 Fence creation FAILED\n"); + return false; + } + + char fence_buf[256]; + sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence created successfully: %p\n", m_d3d12Fence); + OutputDebugStringA(fence_buf); + + // Create shared handle for the fence so CUDA can access it + // IMPORTANT: Must use SECURITY_ATTRIBUTES with bInheritHandle = FALSE for NT handles + SECURITY_ATTRIBUTES secAttr = {}; + secAttr.nLength = sizeof(SECURITY_ATTRIBUTES); + secAttr.bInheritHandle = FALSE; + secAttr.lpSecurityDescriptor = nullptr; + + HANDLE sharedHandle = nullptr; + hr = device->CreateSharedHandle( + static_cast(m_d3d12Fence), + &secAttr, + GENERIC_ALL, + nullptr, + &sharedHandle + ); + if (FAILED(hr) || sharedHandle == nullptr) { + LogError("Failed to create shared handle for D3D12 Fence"); + OutputDebugStringA("[SetD3DDevice] ERROR: D3D12 Fence shared handle creation FAILED\n"); + static_cast(m_d3d12Fence)->Release(); + m_d3d12Fence = nullptr; + return false; + } + + sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence shared handle created: %p\n", sharedHandle); + OutputDebugStringA(fence_buf); + + cudaExternalSemaphoreHandleDesc semDesc = {}; + semDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence; + semDesc.handle.win32.handle = sharedHandle; + semDesc.flags = 0; + + cudaError_t cudaStatus = cudaImportExternalSemaphore(&m_cudaSemaphore, &semDesc); + if (cudaStatus != cudaSuccess) { + char cuda_err[256]; + sprintf_s(cuda_err, "[SetD3DDevice] CUDA semaphore import failed with error: %d\n", cudaStatus); + OutputDebugStringA(cuda_err); + LogError("Failed to import D3D12 Fence as CUDA semaphore"); + 
CloseHandle(sharedHandle); + static_cast(m_d3d12Fence)->Release(); + m_d3d12Fence = nullptr; + return false; + } + OutputDebugStringA("[SetD3DDevice] CUDA semaphore imported successfully\n"); + + // Store the shared handle for cleanup later + m_d3d12FenceSharedHandle = sharedHandle; + } else { + char fence_buf[256]; + sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence already exists: %p\n", m_d3d12Fence); + OutputDebugStringA(fence_buf); + } + + // DON'T reinitialize if already initialized - this would invalidate existing decoded frames if (m_initialized) { OutputDebugStringA("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface\n"); @@ -1111,7 +1334,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying Y plane to D3D12 texture...\n"); cudaError_t cudaStatus = cudaMemcpy2D(devicePtr, d3d12Pitch, // Use D3D12 texture pitch (void*)srcDevicePtr, srcPitch, - rowSizeInBytes, m_height, // Use rowSizeInBytes for width + srcPitch, m_height, // Use pitch as width to copy the entire row cudaMemcpyDeviceToDevice); if (cudaStatus != cudaSuccess) { sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (Y plane) failed with code %d: %s\n", @@ -1171,17 +1394,23 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ // CRITICAL: Use the EXACT UV plane offset from GetCopyableFootprints // planeLayouts[1].Offset is the correct offset accounting for alignment void* uvDstPtr = static_cast(devicePtr) + planeLayouts[1].Offset; + + // NVDEC UV plane offset: For NV12, UV plane starts at Y_height * Y_pitch void* uvSrcPtr = (void*)(srcDevicePtr + (srcPitch * m_height)); + sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] NVDEC UV source: srcDevicePtr=%p, Y_offset=%llu, UV_srcPtr=%p\n", + (void*)srcDevicePtr, (uint64_t)(srcPitch * m_height), uvSrcPtr); + OutputDebugStringA(debug_buf); + sprintf_s(debug_buf, 
"[NVDECAV1Decoder::DecodeToSurface] UV plane offset from GetCopyableFootprints: %llu bytes\n", planeLayouts[1].Offset); OutputDebugStringA(debug_buf); // UV plane: NV12 format has interleaved UV (UVUVUV...) - // For NV12: UV plane width in pixels = original width / 2, but stored as interleaved bytes - // So bytes per row = (width / 2) * 2 = width (same as Y plane width in bytes) - // However, we should use the minimum of rowSize and available width - UINT uvCopyWidth = static_cast(planeRowSizes[1]); - UINT uvCopyHeight = planeNumRows[1]; + // CRITICAL: For cudaMemcpy2D, width parameter is in BYTES, not pixels + // UV plane stores U and V interleaved: each row has (width/2) UV pairs = width bytes total + // Even though D3D12 sees it as (width/2) R8G8 pixels, the memory layout is still width bytes per row + UINT uvCopyWidth = m_width; // Copy full width in bytes (includes both U and V channels) + UINT uvCopyHeight = m_height / 2; // UV plane has half height // Verify we won't exceed allocation bounds size_t uvPlaneNeededSize = static_cast(planeLayouts[1].Footprint.RowPitch) * uvCopyHeight; @@ -1207,7 +1436,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ cudaStatus = cudaMemcpy2D(uvDstPtr, planeLayouts[1].Footprint.RowPitch, uvSrcPtr, srcPitch, - uvCopyWidth, uvCopyHeight, + srcPitch, uvCopyHeight, // Use pitch as width to copy the entire row cudaMemcpyDeviceToDevice); if (cudaStatus != cudaSuccess) { sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (UV plane) failed with code %d: %s\n", @@ -1222,28 +1451,73 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ } OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] UV plane copied successfully to D3D12 texture\n"); - // CRITICAL: Synchronize CUDA device to ensure all GPU operations complete - // before D3D12 rendering pipeline accesses the texture - // cudaDeviceSynchronize() ensures ALL CUDA work completes, not just one stream 
- cudaError_t syncResult = cudaDeviceSynchronize(); - if (syncResult != cudaSuccess) { + // Asynchronously signal the D3D12 fence from CUDA when copies are complete + if (m_cudaSemaphore != nullptr) { + // Ensure we're in the correct CUDA context + CUcontext currentContext = nullptr; + cuCtxGetCurrent(¤tContext); + char debug_buf[256]; - sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaDeviceSynchronize failed: %s\n", - cudaGetErrorString(syncResult)); + sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Current CUDA context: %p, Expected: %p\n", + currentContext, m_cuContext); OutputDebugStringA(debug_buf); + + if (currentContext != m_cuContext) { + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] WARNING: CUDA context mismatch, setting correct context\n"); + cuCtxSetCurrent(m_cuContext); + } + + m_fenceValue++; + cudaExternalSemaphoreSignalParams signalParams = {}; + signalParams.params.fence.value = m_fenceValue; + signalParams.flags = 0; + + sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Signaling fence with value: %llu, semaphore: %p, stream: %p\n", + m_fenceValue, m_cudaSemaphore, m_stream); + OutputDebugStringA(debug_buf); + + cudaStatus = cudaSignalExternalSemaphoresAsync(&m_cudaSemaphore, &signalParams, 1, m_stream); + if (cudaStatus != cudaSuccess) { + sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaSignalExternalSemaphoresAsync failed: %s (error code: %d)\n", + cudaGetErrorString(cudaStatus), cudaStatus); + OutputDebugStringA(debug_buf); + + // Try synchronizing the stream and signaling the fence from CPU instead + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Falling back to CPU fence signaling\n"); + cudaStreamSynchronize(m_stream); + + if (m_d3d12Fence != nullptr) { + HRESULT hr = static_cast(m_d3d12Fence)->Signal(m_fenceValue); + if (SUCCEEDED(hr)) { + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal succeeded\n"); + output_frame.sync_fence_value = m_fenceValue; + } else { 
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal also failed\n"); + output_frame.sync_fence_value = 0; + } + } else { + output_frame.sync_fence_value = 0; + } + } else { + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA fence signal enqueued successfully\n"); + // Pass the fence value to the caller + output_frame.sync_fence_value = m_fenceValue; + } } else { - OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA device synchronized successfully (all GPU work complete)\n"); + OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] No CUDA semaphore available, skipping fence signal\n"); + // No synchronization available + output_frame.sync_fence_value = 0; } - // CRITICAL GPU-GPU SYNCHRONIZATION: - // After CUDA write completes, cudaDeviceSynchronize() has already ensured - // all CUDA work is complete. D3D12 will handle synchronization via resource - // barriers when the texture is accessed by the rendering pipeline. - // No additional D3D12 fence synchronization needed here - it would create deadlock! 
- OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA-D3D12 sync complete via cudaDeviceSynchronize()\n"); - // Unmap frame and close shared handle (if newly created) - cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); + if (result != CUDA_SUCCESS) { + sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result); + OutputDebugStringA(debug_buf); + LogCUDAError(result, "cuvidUnmapVideoFrame"); + // Even if unmap fails, try to continue and close the handle + } + if (sharedHandle != nullptr) { CloseHandle(sharedHandle); OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Shared handle closed\n"); diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h index 09c4bd4..6384b8e 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h @@ -22,6 +22,7 @@ // Forward declaration for CUDA external memory type typedef struct CUexternalMemory_st* cudaExternalMemory_t; +typedef struct CUexternalSemaphore_st* cudaExternalSemaphore_t; namespace VavCore { @@ -49,6 +50,7 @@ public: // D3D Surface decoding functionality bool SupportsSurfaceType(VavCoreSurfaceType type) const override; bool SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) override; + void* GetSyncFence() override { return m_d3d12Fence; } bool DecodeToSurface(const uint8_t* packet_data, size_t packet_size, VavCoreSurfaceType target_type, void* target_surface, @@ -101,6 +103,8 @@ private: CUdevice m_cudaDevice = 0; CUvideoparser m_parser = nullptr; CUstream m_stream = nullptr; + CUmodule m_module = nullptr; + CUfunction m_kernel = nullptr; // D3D-CUDA interop void* m_d3d11Device = nullptr; // ID3D11Device* @@ -108,6 +112,12 @@ private: CUgraphicsResource m_cudaGraphicsResource = nullptr; VavCoreSurfaceType m_preferredSurfaceType = 
VAVCORE_SURFACE_CPU; + // D3D12-CUDA Asynchronous Synchronization + void* m_d3d12Fence = nullptr; // ID3D12Fence* + HANDLE m_d3d12FenceSharedHandle = nullptr; // Shared handle for CUDA interop + cudaExternalSemaphore_t m_cudaSemaphore = {}; + uint64_t m_fenceValue = 0; + // External memory caching for D3D12 interop void* m_cachedD3D12Resource = nullptr; // ID3D12Resource* cudaExternalMemory_t m_cachedExternalMemory = nullptr; diff --git a/vav2/platforms/windows/vavcore/src/VavCore.cpp b/vav2/platforms/windows/vavcore/src/VavCore.cpp index 9abb8b7..b5373b7 100644 --- a/vav2/platforms/windows/vavcore/src/VavCore.cpp +++ b/vav2/platforms/windows/vavcore/src/VavCore.cpp @@ -710,6 +710,13 @@ VAVCORE_API VavCoreResult vavcore_set_d3d_device(VavCorePlayer* player, void* d3 return success ? VAVCORE_SUCCESS : VAVCORE_ERROR_NOT_SUPPORTED; } +VAVCORE_API void* vavcore_get_sync_fence(VavCorePlayer* player) { + if (!player || !player->impl || !player->impl->decoder) { + return nullptr; + } + return player->impl->decoder->GetSyncFence(); +} + VAVCORE_API VavCoreResult vavcore_decode_to_surface(VavCorePlayer* player, VavCoreSurfaceType target_type, void* target_surface,