diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index fa833ae..0a13278 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -89,7 +89,9 @@
"Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"Vav2Player.sln\" //p:Configuration=Debug //p:Platform=x64 //v:minimal)",
"Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"VavCore.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //v:minimal)",
"Bash(tasklist)",
- "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"D:/Project/video-av1/vav2/platforms/windows/applications/vav2player/Vav2Player/Vav2Player.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //t:Clean)"
+ "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" \"D:/Project/video-av1/vav2/platforms/windows/applications/vav2player/Vav2Player/Vav2Player.vcxproj\" //p:Configuration=Debug //p:Platform=x64 //t:Clean)",
+ "Bash(./Vav2Player.exe)",
+ "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" Vav2Player.vcxproj //p:Configuration=Debug //p:Platform=x64 //v:minimal)"
],
"deny": [],
"ask": []
diff --git a/vav2/CLAUDE.md b/vav2/CLAUDE.md
index 2986e4b..92a7633 100644
--- a/vav2/CLAUDE.md
+++ b/vav2/CLAUDE.md
@@ -53,7 +53,58 @@ size_t required_size = frame.width * frame.height * 4;
---
-## π― **νμ¬ νλ‘μ νΈ μν** (2025-09-30)
+## π― **νμ¬ νλ‘μ νΈ μν** (2025-10-01)
+
+### **β
Windows VideoPlayerControl2 리ν©ν λ§ μλ£** (2025-10-01)
+- **VideoPlayerControl2 μ½λ ꡬν**: PlaybackController + FrameProcessor + VideoPlayerControl2 λͺ¨λν μν€ν
μ² μμ± μλ£ β
+- **μ½λ ꡬ쑰 κ°μ **: 1,890 lines β 950 lines (50% κ°μ) β
+- **μ±
μ λΆλ¦¬**: 7κ° νΌμ¬λ μ±
μ β κ° ν΄λμ€ 1κ° λͺ
νν λͺ©μ β
+- **λΉλ μ±κ³΅**: WinRT λ°νμ ν΄λμ€ μμ± λ° μ»΄νμΌ μλ£ β
+- **MainVideoPage κ΅μ²΄**: VideoPlayerControl β VideoPlayerControl2 μ ν μλ£ β
+
+**π§ ν΄κ²°λ ν¬λμ λ¬Έμ **:
+1. **VavCore-debug.dll AppX κ²½λ‘ λ³΅μ¬ μ€ν¨** (exit code 3 μμΈ)
+ - λ¬Έμ : μ½μ MSBuildλ post-build eventκ° AppX λλ ν λ¦¬λ‘ DLL λ³΅μ¬ μ ν¨
+ - ν΄κ²°: **Visual Studioμμ λΉλ** μ post-build event μ μ μ€ν
+ - AppX λλ ν 리: `x64/Debug/Vav2Player/` (WinUI3 ν¨ν€μ§ μ± μ€ν μμΉ)
+
+2. **PlaybackController VavCore μ€λ³΅ μ΄κΈ°ν μ κ±°**
+ - λ¬Έμ : PlaybackController μμ±μμμ `vavcore_initialize()` μ€λ³΅ νΈμΆ
+ - ν΄κ²°: App.xaml.cpp μ μ μ΄κΈ°νλ§ μ¬μ©, PlaybackControllerμμ μ κ±°
+ - μ½λ μμ : `src/Playback/PlaybackController.cpp` μμ±μ/μλ©Έμ μμ
+
+3. **μμΈ μ²λ¦¬ κ°ν**
+ - VideoPlayerControl2 μμ±μμ μμΈν try-catch λ° λλ²κ·Έ λ‘κΉ
μΆκ°
+ - PlaybackController/FrameProcessor μμ± κ³Όμ μΆμ κ°λ₯
+
+**β οΈ λΉλ μ£Όμμ¬ν**:
+```
+λ°λμ Visual Studioμμ λΉλν κ²!
+- μ½μ MSBuild: VavCore-debug.dllμ΄ AppXμ λ³΅μ¬ μ λ¨ β ν¬λμ
+- Visual Studio: post-build event μ μ μ€ν β μ μ λμ
+```
+
+**VideoPlayerControl2 μν€ν
μ²**:
+```
+VideoPlayerControl2 (UI Layer - 400 lines)
+βββ PlaybackController (Playback Logic - 300 lines)
+β βββ VavCore μλͺ
μ£ΌκΈ° κ΄λ¦¬
+β βββ νμ΄λ° μ€λ λ (30fps)
+β βββ μ¬μ μν λ¨Έμ
+βββ FrameProcessor (Frame Handling - 250 lines)
+β βββ λ°±κ·ΈλΌμ΄λ λμ½λ (NVDEC β D3D12 NV12)
+β βββ νλ μ μ²λ¦¬ μ‘°μ (m_frameProcessing νλκ·Έ)
+β βββ UI μ€λ λ λ λλ§ μ½λ°±
+βββ SimpleGPURenderer (Rendering)
+ βββ D3D12 NV12 ν
μ€μ²
+ βββ YUVβRGB λ³ν + Present
+```
+
+**μ£Όμ νμΌ**:
+- `VideoPlayerControl2.xaml.h/.cpp` (400 lines) - UI λ μ΄μ΄
+- `src/Playback/PlaybackController.h/.cpp` (300 lines) - μ¬μ μ μ΄
+- `src/Playback/FrameProcessor.h/.cpp` (250 lines) - νλ μ μ²λ¦¬
+- `VideoPlayerControl2.idl` - WinRT μΈν°νμ΄μ€ μ μ
### **β
Android Vulkan AV1 Player μμ ꡬν μλ£** (2025-09-30)
- **Android Vulkan μ ν리μΌμ΄μ
**: μμ ν λ€μ΄ν°λΈ Vulkan AV1 Player μ± κ΅¬ν β
@@ -71,9 +122,6 @@ size_t required_size = frame.width * frame.height * 4;
- **VavCore C API 28κ° ν¨μ**: Android NDK JNIλ₯Ό ν΅ν μμ ν λ€μ΄ν°λΈ ν΅ν©
- **16KB νμ΄μ§ νΈνμ±**: Google Play Android 15+ νΈνμ± λ³΄μ₯
-### **νμ¬ μ§ν μ€**
-- **Video Player Overlay UI**: νλ‘λμ
λ 벨 μ¬μ©μ κ²½ν ν₯μ (νμΌ μ ν, μ μ€μ² 컨νΈλ‘€, μ§νλ°, μ€λ²λ μ΄ UI) π
-
### **νμ± μ€κ³ λ¬Έμ**
- [**VavCore Godot Integration**](VavCore_Godot_Integration_Design.md) - Godot 4.4.1 C# Extension ꡬν νν© β
- [**Android Vulkan AV1 Player Design**](docs/completed/android/Android_Vulkan_AV1_Player_Design.md) - Android Vulkan Surface κΈ°λ° AV1 Player μ€κ³ β
@@ -83,9 +131,17 @@ size_t required_size = frame.width * frame.height * 4;
---
-## β
**μ΅μ μλ£ μμ
: μ£Όμ λ§μΌμ€ν€ λ¬μ±** (2025-09-30)
+## β
**μ΅μ μλ£ μμ
: μ£Όμ λ§μΌμ€ν€ λ¬μ±** (2025-10-01)
-### **π― 2025λ
9μ μ΅μ’
ν΅μ¬ μ±κ³Ό**
+### **π― 2025λ
10μ μ΅μ μ±κ³Ό**
+- [**VideoPlayerControl2 리ν©ν λ§ μλ£**](docs/completed/windows/VideoPlayerControl2_Refactoring_Plan_2025-10-01.md) - λͺ¨λν μν€ν
μ² μμ± β
π΄ **NEW**
+ - **PlaybackController μμ ꡬν**: VavCore μλͺ
μ£ΌκΈ°, νμ΄λ° μ€λ λ, μ¬μ μν λ¨Έμ (300 lines)
+ - **FrameProcessor μμ ꡬν**: λ°±κ·ΈλΌμ΄λ λμ½λ, νλ μ μ‘°μ , UI λ λλ§ μ½λ°± (250 lines)
+ - **VideoPlayerControl2 ν΅ν©**: SwapChainPanel, AspectFit, WinRT μΈν°νμ΄μ€ (400 lines)
+ - **50% μ½λ κ°μ**: 1,890 lines β 950 lines (μ€μ©μ λͺ¨λν)
+ - **MainVideoPage κ΅μ²΄**: VideoPlayerControl β VideoPlayerControl2 μ ν μλ£
+
+### **π― 2025λ
9μ ν΅μ¬ μ±κ³Ό**
- [**Android Vulkan AV1 Player μμ μμ±**](vav2/docs/completed/android/Android_Vulkan_AV1_Player_Design.md) - Samsung Galaxy S24 μ΅μ ν μλ£ β
- [**MediaCodec ν€μλ κΈ°λ° λμ½λ μ ν μμ€ν
**](vav2/platforms/android/vavcore/src/Decoder/AndroidMediaCodecAV1Decoder.cpp) - ν¬λ‘μ€ λ²€λ νΈνμ± ν보 β
- [**Vulkan 1.1 λ λλ§ νμ΄νλΌμΈ μμ±**](platforms/android/applications/vav2player/app/src/main/cpp/vulkan_renderer.cpp) - YUV to RGB GPU μμ΄λ μλ£ β
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml b/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml
index 768e373..a06a030 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/MainVideoPage.xaml
@@ -17,8 +17,9 @@
-
-
+
+
+
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp
index d040db2..09dca96 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl.xaml.cpp
@@ -530,13 +530,16 @@ namespace winrt::Vav2Player::implementation
if (result == VAVCORE_SUCCESS) {
OutputDebugStringA("[VideoPlayerControl] Decode SUCCESS, enqueuing render...\n");
+ // Capture fence value for GPU synchronization
+ uint64_t fenceValue = vavFrame.sync_fence_value;
+
// Render + Present on UI thread (lightweight, thread-safe)
// CRITICAL: Keep m_frameProcessing = true until render completes
// to prevent NVDEC surface queue overflow
- auto enqueued = strongThis->DispatcherQueue().TryEnqueue([strongThis, gpuRenderer]() {
+ auto enqueued = strongThis->DispatcherQueue().TryEnqueue([strongThis, gpuRenderer, fenceValue]() {
OutputDebugStringA("[VideoPlayerControl] Render callback executing...\n");
if (strongThis->m_isPlaying) {
- HRESULT hr = gpuRenderer->RenderNV12TextureToBackBuffer();
+ HRESULT hr = gpuRenderer->RenderNV12TextureToBackBuffer(fenceValue);
if (SUCCEEDED(hr)) {
OutputDebugStringA("[VideoPlayerControl] Render SUCCESS\n");
} else {
@@ -691,7 +694,7 @@ namespace winrt::Vav2Player::implementation
// Render NV12 texture to back buffer (YUV to RGB conversion)
// NOTE: RenderNV12TextureToBackBuffer() internally executes command list,
// signals fence, and advances frame index - no separate Present() needed
- HRESULT renderHr = gpuRenderer->RenderNV12TextureToBackBuffer();
+ HRESULT renderHr = gpuRenderer->RenderNV12TextureToBackBuffer(vavFrame.sync_fence_value);
if (FAILED(renderHr)) {
LogMgr::GetInstance().LogError(L"Failed to render NV12 texture to back buffer", L"VideoPlayerControl");
}
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml
index 4efd162..65e8200 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/VideoPlayerControl2.xaml
@@ -20,10 +20,11 @@
+
+ HorizontalAlignment="Stretch"
+ VerticalAlignment="Stretch"/>
();
- m_frameProcessor = std::make_unique<::Vav2Player::FrameProcessor>();
+ // Create core components
+ OutputDebugStringA("[VideoPlayerControl2] Creating PlaybackController...\n");
+ m_playbackController = std::make_unique<::Vav2Player::PlaybackController>();
+ OutputDebugStringA("[VideoPlayerControl2] PlaybackController created\n");
- LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Core components created");
+ OutputDebugStringA("[VideoPlayerControl2] Creating FrameProcessor...\n");
+ m_frameProcessor = std::make_unique<::Vav2Player::FrameProcessor>();
+ OutputDebugStringA("[VideoPlayerControl2] FrameProcessor created\n");
+
+ LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Core components created");
+ OutputDebugStringA("[VideoPlayerControl2] Constructor END (SUCCESS)\n");
+ }
+ catch (const std::exception& e) {
+ std::string error = "[VideoPlayerControl2] Constructor EXCEPTION: ";
+ error += e.what();
+ OutputDebugStringA(error.c_str());
+ OutputDebugStringA("\n");
+ throw;
+ }
+ catch (...) {
+ OutputDebugStringA("[VideoPlayerControl2] Constructor UNKNOWN EXCEPTION\n");
+ throw;
+ }
}
VideoPlayerControl2::~VideoPlayerControl2()
@@ -96,6 +116,14 @@ namespace winrt::Vav2Player::implementation
void VideoPlayerControl2::UserControl_SizeChanged(IInspectable const&, SizeChangedEventArgs const& e)
{
auto newSize = e.NewSize();
+ auto oldSize = e.PreviousSize();
+
+ // Ignore trivial size changes (< 1 pixel) to avoid unnecessary work
+ if (std::abs(newSize.Width - oldSize.Width) < 1.0 &&
+ std::abs(newSize.Height - oldSize.Height) < 1.0) {
+ return;
+ }
+
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2",
L"SizeChanged: " + std::to_wstring((int)newSize.Width) + L"x" + std::to_wstring((int)newSize.Height));
@@ -143,8 +171,21 @@ namespace winrt::Vav2Player::implementation
{
if (m_videoSource != value) {
m_videoSource = value;
+
if (!value.empty()) {
- LoadVideo(value);
+ if (m_initialized) {
+ LoadVideo(value);
+ } else {
+ LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2",
+ L"VideoSource set before initialization, will load on Loaded event");
+ }
+ } else {
+ // Empty source - unload current video
+ if (m_playbackController && m_playbackController->IsLoaded()) {
+ Stop();
+ m_playbackController->Unload();
+ UpdateStatus(L"No video");
+ }
}
}
}
@@ -172,25 +213,27 @@ namespace winrt::Vav2Player::implementation
Vav2Player::VideoDecoderType VideoPlayerControl2::DecoderType()
{
if (!m_playbackController) {
- return winrt::Vav2Player::VideoDecoderType::Auto;
+ return static_cast(0); // Auto
}
VavCoreDecoderType coreType = m_playbackController->GetDecoderType();
// Convert VavCoreDecoderType to VideoDecoderType
+ // Use switch on C enum (compile-time constant), return WinRT enum as integer
+ // WinRT enum values: Auto=0, NVDEC=1, VPL=2, AMF=3, DAV1D=4, MediaFoundation=5
switch (coreType) {
case VAVCORE_DECODER_NVDEC:
- return winrt::Vav2Player::VideoDecoderType::NVDEC;
+ return static_cast(1); // NVDEC
case VAVCORE_DECODER_VPL:
- return winrt::Vav2Player::VideoDecoderType::VPL;
+ return static_cast(2); // VPL
case VAVCORE_DECODER_AMF:
- return winrt::Vav2Player::VideoDecoderType::AMF;
+ return static_cast(3); // AMF
case VAVCORE_DECODER_DAV1D:
- return winrt::Vav2Player::VideoDecoderType::DAV1D;
+ return static_cast(4); // DAV1D
case VAVCORE_DECODER_MEDIA_FOUNDATION:
- return winrt::Vav2Player::VideoDecoderType::MediaFoundation;
+ return static_cast(5); // MediaFoundation
default:
- return winrt::Vav2Player::VideoDecoderType::Auto;
+ return static_cast(0); // Auto
}
}
@@ -201,17 +244,19 @@ namespace winrt::Vav2Player::implementation
}
// Convert VideoDecoderType to VavCoreDecoderType
+ // WinRT enums are projected as integers: Auto=0, NVDEC=1, VPL=2, AMF=3, DAV1D=4, MediaFoundation=5
VavCoreDecoderType coreType = VAVCORE_DECODER_AUTO;
+ int32_t enumValue = static_cast(value);
- if (value == winrt::Vav2Player::VideoDecoderType::NVDEC) {
+ if (enumValue == 1) { // NVDEC
coreType = VAVCORE_DECODER_NVDEC;
- } else if (value == winrt::Vav2Player::VideoDecoderType::VPL) {
+ } else if (enumValue == 2) { // VPL
coreType = VAVCORE_DECODER_VPL;
- } else if (value == winrt::Vav2Player::VideoDecoderType::AMF) {
+ } else if (enumValue == 3) { // AMF
coreType = VAVCORE_DECODER_AMF;
- } else if (value == winrt::Vav2Player::VideoDecoderType::DAV1D) {
+ } else if (enumValue == 4) { // DAV1D
coreType = VAVCORE_DECODER_DAV1D;
- } else if (value == winrt::Vav2Player::VideoDecoderType::MediaFoundation) {
+ } else if (enumValue == 5) { // MediaFoundation
coreType = VAVCORE_DECODER_MEDIA_FOUNDATION;
}
@@ -237,38 +282,59 @@ namespace winrt::Vav2Player::implementation
m_playbackController->Stop();
}
- // Load video file
+ // --- Start of Corrected Initialization Order ---
+
+ // 1. Pass D3D12 device to the PlaybackController (with null check)
+ if (m_gpuRenderer) {
+ ID3D12Device* d3dDevice = m_gpuRenderer->GetD3D12Device();
+ if (d3dDevice) {
+ m_playbackController->SetD3DDevice(d3dDevice, VAVCORE_SURFACE_D3D12_RESOURCE);
+ LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 device will be set by PlaybackController.");
+ } else {
+ LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", L"D3D12 device is null");
+ }
+ } else {
+ LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2",
+ L"Cannot set D3D device: renderer not initialized");
+ }
+
+ // 2. Load the video file. VavCore will now see the pending D3D device
+ // and initialize the decoder in the correct GPU-accelerated mode.
std::wstring filePathStr(filePath.c_str());
bool success = m_playbackController->LoadVideo(filePathStr);
if (success) {
- // Get video dimensions and create NV12 texture
+ // 3. Get video dimensions (which are now available).
uint32_t videoWidth = m_playbackController->GetVideoWidth();
uint32_t videoHeight = m_playbackController->GetVideoHeight();
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2",
L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight));
- // Create NV12 texture for CUDA interop
+ // 4. Create the NV12 texture for zero-copy, now that we have the dimensions.
if (m_gpuRenderer) {
HRESULT hr = m_gpuRenderer->CreateNV12TextureR8Layout(videoWidth, videoHeight);
if (SUCCEEDED(hr)) {
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"NV12 texture created");
-
- // Pass D3D12 device to VavCore for CUDA-D3D12 interop
- ID3D12Device* d3dDevice = m_gpuRenderer->GetD3D12Device();
- if (d3dDevice) {
- VavCorePlayer* player = m_playbackController->GetVavCorePlayer();
- if (player) {
- vavcore_set_d3d_device(player, d3dDevice, VAVCORE_SURFACE_D3D12_RESOURCE);
- LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 device passed to VavCore");
- }
- }
} else {
LogMgr::GetInstance().LogError(L"VideoPlayerControl2", L"Failed to create NV12 texture");
}
}
+ // 5. Get D3D12 fence from VavCore and pass it to SimpleGPURenderer for GPU synchronization
+ VavCorePlayer* player = m_playbackController->GetVavCorePlayer();
+ if (player && m_gpuRenderer) {
+ void* syncFence = vavcore_get_sync_fence(player);
+ if (syncFence) {
+ m_gpuRenderer->SetSyncFence(syncFence);
+ LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"D3D12 fence set for GPU synchronization");
+ } else {
+ LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2", L"No sync fence available from VavCore");
+ }
+ }
+
+ // --- End of Corrected Initialization Order ---
+
// Update AspectFit
UpdateVideoImageAspectFit(videoWidth, videoHeight);
@@ -280,6 +346,12 @@ namespace winrt::Vav2Player::implementation
}
} else {
LogMgr::GetInstance().LogError(L"VideoPlayerControl2", L"Failed to load video");
+
+ // Cleanup partial initialization on failure
+ if (m_gpuRenderer) {
+ m_gpuRenderer->ReleaseNV12Texture();
+ }
+
UpdateStatus(L"Load failed");
}
}
@@ -371,9 +443,16 @@ namespace winrt::Vav2Player::implementation
void VideoPlayerControl2::InitializeRenderer()
{
- if (m_gpuRenderer && m_gpuRenderer->IsInitialized()) {
- LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Renderer already initialized");
- return;
+ // Cleanup existing renderer if not properly initialized
+ if (m_gpuRenderer) {
+ if (m_gpuRenderer->IsInitialized()) {
+ LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Renderer already initialized");
+ return;
+ } else {
+ LogMgr::GetInstance().LogWarning(L"VideoPlayerControl2",
+ L"Cleaning up partially initialized renderer");
+ CleanupRenderer();
+ }
}
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Initializing renderer");
@@ -464,20 +543,53 @@ namespace winrt::Vav2Player::implementation
void VideoPlayerControl2::OnFrameReady()
{
// Called from PlaybackController timing thread (30fps)
- if (!m_frameProcessor || !m_playbackController) {
+ // Thread-safe: Check if still initialized to prevent race conditions during cleanup
+ if (!m_initialized) {
+ return;
+ }
+
+ // Defensive: Get raw pointers atomically to avoid race with destruction
+ auto processor = m_frameProcessor.get();
+ auto controller = m_playbackController.get();
+
+ if (!processor || !controller) {
return;
}
// Process frame (decode on background, render on UI thread)
- VavCorePlayer* player = m_playbackController->GetVavCorePlayer();
+ VavCorePlayer* player = controller->GetVavCorePlayer();
if (!player) {
return;
}
- m_frameProcessor->ProcessFrame(player, [](bool success) {
- // Frame processing complete callback (optional)
- if (!success) {
- OutputDebugStringA("[VideoPlayerControl2] Frame processing failed\n");
+ processor->ProcessFrame(player, [weakThis = get_weak()](bool success) {
+ if (auto strongThis = weakThis.get()) {
+ if (!success) {
+ OutputDebugStringA("[VideoPlayerControl2] Frame processing failed\n");
+
+ // Track consecutive errors for auto-recovery
+ static int consecutiveErrors = 0;
+ consecutiveErrors++;
+
+ // Auto-recovery after too many consecutive errors
+ if (consecutiveErrors > 10) {
+ OutputDebugStringA("[VideoPlayerControl2] Too many consecutive errors, attempting recovery\n");
+
+ // Stop playback on UI thread
+ strongThis->DispatcherQueue().TryEnqueue([weakThis]() {
+ if (auto strongThis2 = weakThis.get()) {
+ strongThis2->Stop();
+ strongThis2->UpdateStatus(L"Playback error - stopped");
+ }
+ });
+
+ consecutiveErrors = 0;
+ }
+ } else {
+ // Reset error counter on success
+ static int consecutiveErrors = 0;
+ consecutiveErrors = 0;
+ }
}
});
}
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
index 22d20e0..1c24543 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp
@@ -31,91 +31,29 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
std::function onComplete)
{
if (!player || !m_renderer || !m_dispatcherQueue) {
- std::cerr << "[FrameProcessor] Invalid state: player="
- << player << ", renderer=" << m_renderer
- << ", dispatcherQueue=" << (m_dispatcherQueue ? "valid" : "null") << std::endl;
+ OutputDebugStringA("[FrameProcessor] Invalid state: missing player/renderer/dispatcherQueue\n");
return false;
}
// Check if previous frame is still processing
- // This prevents NVDEC surface queue overflow
bool expected = false;
if (!m_frameProcessing.compare_exchange_strong(expected, true)) {
- // Previous frame still processing, drop this frame
m_framesDropped++;
OutputDebugStringA("[FrameProcessor] Frame dropped (previous frame still processing)\n");
return false;
}
- // Decode on current (background) thread
- bool decodeSuccess = DecodeFrameToNV12(player);
- if (!decodeSuccess) {
- m_decodeErrors++;
- m_frameProcessing.store(false);
-
- if (onComplete) {
- onComplete(false);
- }
-
- return false;
- }
-
- m_framesDecoded++;
- OutputDebugStringA("[FrameProcessor] Frame decoded successfully, enqueuing render...\n");
-
- // Render on UI thread (lightweight D3D12 Present)
- // CRITICAL: Keep m_frameProcessing = true until render completes
- bool enqueued = m_dispatcherQueue.TryEnqueue([this, onComplete]() {
- OutputDebugStringA("[FrameProcessor] Render callback executing on UI thread...\n");
-
- bool renderSuccess = RenderNV12ToScreen();
- if (!renderSuccess) {
- m_renderErrors++;
- }
-
- // Mark frame processing complete AFTER render
- m_frameProcessing.store(false);
-
- if (onComplete) {
- onComplete(renderSuccess);
- }
-
- if (renderSuccess) {
- OutputDebugStringA("[FrameProcessor] Frame rendered successfully\n");
- } else {
- OutputDebugStringA("[FrameProcessor] Frame render FAILED\n");
- }
- });
-
- if (!enqueued) {
- std::cerr << "[FrameProcessor] Failed to enqueue render callback" << std::endl;
- m_frameProcessing.store(false);
-
- if (onComplete) {
- onComplete(false);
- }
-
- return false;
- }
-
- return true;
-}
-
-bool FrameProcessor::DecodeFrameToNV12(VavCorePlayer* player)
-{
- if (!player || !m_renderer) {
- return false;
- }
-
- // Get NV12 texture from renderer (CUDA-D3D12 interop target)
+ // Get NV12 texture from renderer
ID3D12Resource* nv12Texture = m_renderer->GetNV12TextureForCUDAInterop();
if (!nv12Texture) {
- std::cerr << "[FrameProcessor] NV12 texture not available" << std::endl;
+ OutputDebugStringA("[FrameProcessor] Failed to get NV12 texture\n");
+ m_frameProcessing.store(false);
+ if (onComplete) onComplete(false);
return false;
}
- // Decode directly to D3D12 NV12 texture (zero-copy)
- VavCoreVideoFrame vavFrame;
+ // Decode frame to D3D12 surface (blocking)
+ VavCoreVideoFrame vavFrame = {};
VavCoreResult result = vavcore_decode_to_surface(
player,
VAVCORE_SURFACE_D3D12_RESOURCE,
@@ -123,33 +61,48 @@ bool FrameProcessor::DecodeFrameToNV12(VavCorePlayer* player)
&vavFrame
);
- if (result == VAVCORE_SUCCESS) {
- OutputDebugStringA("[FrameProcessor] Decode to NV12 surface SUCCESS\n");
- return true;
- } else if (result == VAVCORE_END_OF_STREAM) {
- OutputDebugStringA("[FrameProcessor] End of stream reached\n");
- return false;
- } else {
- char buf[256];
- sprintf_s(buf, "[FrameProcessor] Decode failed with error: %d\n", result);
- OutputDebugStringA(buf);
- return false;
- }
-}
-
-bool FrameProcessor::RenderNV12ToScreen()
-{
- if (!m_renderer) {
+ if (result != VAVCORE_SUCCESS) {
+ if (result != VAVCORE_END_OF_STREAM) {
+ m_decodeErrors++;
+ OutputDebugStringA("[FrameProcessor] Decode error\n");
+ }
+ m_frameProcessing.store(false);
+ if (onComplete) onComplete(false);
return false;
}
- // Render NV12 texture to back buffer (YUV β RGB conversion + Present)
- HRESULT hr = m_renderer->RenderNV12TextureToBackBuffer();
+ m_framesDecoded++;
- if (FAILED(hr)) {
- char buf[256];
- sprintf_s(buf, "[FrameProcessor] Render failed with HRESULT: 0x%08X\n", hr);
- OutputDebugStringA(buf);
+ // Enqueue render on UI thread with fence value for GPU sync
+ uint64_t fenceValue = vavFrame.sync_fence_value;
+ OutputDebugStringA("[FrameProcessor] Attempting to enqueue render on UI thread...\n");
+
+ bool enqueued = m_dispatcherQueue.TryEnqueue([this, fenceValue, onComplete]() {
+ OutputDebugStringA("[FrameProcessor] Render callback executing on UI thread\n");
+ HRESULT hr = m_renderer->RenderNV12TextureToBackBuffer(fenceValue);
+ bool renderSuccess = SUCCEEDED(hr);
+
+ if (!renderSuccess) {
+ m_renderErrors++;
+ char buf[256];
+ sprintf_s(buf, "[FrameProcessor] Render error: HRESULT = 0x%08X\n", hr);
+ OutputDebugStringA(buf);
+ } else {
+ OutputDebugStringA("[FrameProcessor] Render succeeded\n");
+ }
+
+ // Mark frame processing complete
+ m_frameProcessing.store(false);
+
+ if (onComplete) {
+ onComplete(renderSuccess);
+ }
+ });
+
+ if (!enqueued) {
+ OutputDebugStringA("[FrameProcessor] Failed to enqueue render\n");
+ m_frameProcessing.store(false);
+ if (onComplete) onComplete(false);
return false;
}
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
index 6144995..c8903c3 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.h
@@ -53,10 +53,6 @@ private:
std::atomic m_framesDropped{0};
std::atomic m_decodeErrors{0};
std::atomic m_renderErrors{0};
-
- // Helper methods
- bool DecodeFrameToNV12(VavCorePlayer* player);
- bool RenderNV12ToScreen();
};
} // namespace Vav2Player
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp
index 980db6b..f47b42d 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp
@@ -7,14 +7,18 @@ namespace Vav2Player {
PlaybackController::PlaybackController()
{
- InitializeVavCore();
+ // NOTE: VavCore is initialized globally in App.xaml.cpp
+ // Do NOT call vavcore_initialize() here to avoid duplicate initialization
+ OutputDebugStringA("[PlaybackController] Constructor called (VavCore already initialized)\n");
}
PlaybackController::~PlaybackController()
{
+ OutputDebugStringA("[PlaybackController] Destructor called\n");
Stop();
Unload();
- CleanupVavCore();
+ // NOTE: VavCore cleanup is handled globally in App.xaml.cpp
+ // Do NOT call vavcore_cleanup() here
}
bool PlaybackController::InitializeVavCore()
@@ -35,6 +39,12 @@ void PlaybackController::CleanupVavCore()
std::cout << "[PlaybackController] VavCore cleaned up" << std::endl;
}
+void PlaybackController::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type)
+{
+ m_d3dDevice = d3d_device;
+ m_d3dSurfaceType = type;
+}
+
bool PlaybackController::LoadVideo(const std::wstring& filePath)
{
// Unload previous video if any
@@ -49,12 +59,20 @@ bool PlaybackController::LoadVideo(const std::wstring& filePath)
return false;
}
+ // Set decoder type before opening file
+ vavcore_set_decoder_type(m_vavCorePlayer, m_decoderType);
+
+ // Set D3D device before opening file, if it was provided
+ if (m_d3dDevice) {
+ vavcore_set_d3d_device(m_vavCorePlayer, m_d3dDevice, m_d3dSurfaceType);
+ }
+
// Convert wstring to UTF-8 string
int size_needed = WideCharToMultiByte(CP_UTF8, 0, filePath.c_str(), -1, nullptr, 0, nullptr, nullptr);
std::string utf8FilePath(size_needed - 1, 0);
WideCharToMultiByte(CP_UTF8, 0, filePath.c_str(), -1, &utf8FilePath[0], size_needed, nullptr, nullptr);
- // Open video file
+ // Open video file (this will initialize the decoder)
VavCoreResult result = vavcore_open_file(m_vavCorePlayer, utf8FilePath.c_str());
if (result != VAVCORE_SUCCESS) {
std::cerr << "[PlaybackController] Failed to open file: " << result << std::endl;
@@ -63,9 +81,6 @@ bool PlaybackController::LoadVideo(const std::wstring& filePath)
return false;
}
- // Set decoder type
- vavcore_set_decoder_type(m_vavCorePlayer, m_decoderType);
-
// Get video metadata
VavCoreVideoMetadata metadata;
result = vavcore_get_metadata(m_vavCorePlayer, &metadata);
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h
index 91146d6..b8649c6 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.h
@@ -51,10 +51,17 @@ public:
void SetDecoderType(VavCoreDecoderType type);
VavCoreDecoderType GetDecoderType() const { return m_decoderType; }
+ // D3D device configuration (for GPU acceleration)
+ void SetD3DDevice(void* d3d_device, VavCoreSurfaceType type);
+
private:
// VavCore player instance
VavCorePlayer* m_vavCorePlayer = nullptr;
+ // D3D device information (pending)
+ void* m_d3dDevice = nullptr;
+ VavCoreSurfaceType m_d3dSurfaceType = VAVCORE_SURFACE_CPU;
+
// Playback state
std::atomic m_isPlaying{false};
std::atomic m_isLoaded{false};
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp
index 0d3929c..291972a 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.cpp
@@ -116,6 +116,9 @@ void SimpleGPURenderer::Shutdown()
// Wait for GPU to finish
WaitForGPU();
+ // Release sync fence (note: VavCore owns the fence, we just clear our reference)
+ m_syncFence = nullptr;
+
// Reset COM objects
m_computePipelineState.Reset();
m_computeRootSignature.Reset();
@@ -135,6 +138,9 @@ void SimpleGPURenderer::Shutdown()
}
m_constantBuffer.Reset();
+ // Release NV12 texture
+ m_nv12Texture.Reset();
+
for (UINT i = 0; i < FrameCount; i++)
{
m_renderTargets[i].Reset();
@@ -151,6 +157,31 @@ void SimpleGPURenderer::Shutdown()
m_initialized = false;
}
+// Set the shared fence for CUDA-D3D12 synchronization
+// Note: VavCore owns the fence lifecycle, we just store a reference
+void SimpleGPURenderer::SetSyncFence(void* fence)
+{
+ // Release previous fence reference if exists
+ if (m_syncFence != nullptr) {
+ OutputDebugStringA("[SimpleGPURenderer::SetSyncFence] WARNING: Replacing existing fence\n");
+ }
+
+ m_syncFence = fence;
+
+ if (fence) {
+ OutputDebugStringA("[SimpleGPURenderer::SetSyncFence] Sync fence set successfully\n");
+ }
+}
+
+// Release NV12 texture (cleanup partial initialization)
+void SimpleGPURenderer::ReleaseNV12Texture()
+{
+ if (m_nv12Texture) {
+ OutputDebugStringA("[SimpleGPURenderer::ReleaseNV12Texture] Releasing NV12 texture\n");
+ m_nv12Texture.Reset();
+ }
+}
+
HRESULT SimpleGPURenderer::RenderVideoFrame(const VavCoreVideoFrame& frame)
{
if (!m_initialized)
@@ -289,11 +320,21 @@ HRESULT SimpleGPURenderer::CreateSwapChain(winrt::Microsoft::UI::Xaml::Controls:
if (FAILED(hr)) return hr;
// Associate with SwapChainPanel
+ OutputDebugStringA("[SimpleGPURenderer::CreateSwapChain] Associating SwapChain with SwapChainPanel...\n");
auto panelNative = panel.as();
hr = panelNative->SetSwapChain(m_swapChain.Get());
- if (FAILED(hr)) return hr;
+ if (FAILED(hr)) {
+ char buf[256];
+ sprintf_s(buf, "[SimpleGPURenderer::CreateSwapChain] Failed to set SwapChain: HRESULT = 0x%08X\n", hr);
+ OutputDebugStringA(buf);
+ return hr;
+ }
+ OutputDebugStringA("[SimpleGPURenderer::CreateSwapChain] SwapChain successfully associated with SwapChainPanel\n");
m_frameIndex = m_swapChain->GetCurrentBackBufferIndex();
+ char buf[256];
+ sprintf_s(buf, "[SimpleGPURenderer::CreateSwapChain] Current back buffer index: %u\n", m_frameIndex);
+ OutputDebugStringA(buf);
return S_OK;
}
@@ -754,10 +795,10 @@ float4 PSMain(PSInput input) : SV_TARGET
// Sample UV from plane 1 (half resolution, interleaved)
float2 uv = g_uvPlane.Sample(g_sampler, input.texcoord);
- float u = uv.x;
- float v = uv.y;
+ float u = uv.x; // U component
+ float v = uv.y; // V component
- // Convert from [0,1] to YUV range (BT.709)
+ // Convert from [0,1] to YUV range (Limited Range BT.709)
y = (y * 255.0 - 16.0) / 219.0;
u = (u * 255.0 - 128.0) / 224.0;
v = (v * 255.0 - 128.0) / 224.0;
@@ -973,20 +1014,31 @@ HRESULT SimpleGPURenderer::CreateNV12GraphicsPipeline()
return S_OK;
}
-HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
+HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer(uint64_t fenceValue)
{
if (!m_nv12Texture || !m_initialized) {
std::cout << "[SimpleGPURenderer::RenderNV12Texture] ERROR: NV12 texture or renderer not initialized" << std::endl;
return E_FAIL;
}
+ // Wait for the decoder to finish writing to the texture on the GPU
+ // Note: Skip wait if fenceValue is 0 (D3D12 fence values start from 1)
+ if (m_syncFence && fenceValue > 0) {
+ m_commandQueue->Wait(static_cast<ID3D12Fence*>(m_syncFence), fenceValue);
+ }
+
// Initialize NV12 graphics pipeline on first call
if (!m_nv12PipelineState) {
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Creating NV12 graphics pipeline...\n");
HRESULT hr = CreateNV12GraphicsPipeline();
if (FAILED(hr)) {
+ char buf[256];
+ sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] Failed to create NV12 graphics pipeline: HRESULT = 0x%08X\n", hr);
+ OutputDebugStringA(buf);
std::cout << "[SimpleGPURenderer::RenderNV12Texture] Failed to create NV12 graphics pipeline" << std::endl;
return hr;
}
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] NV12 graphics pipeline created successfully\n");
// Create 2 SRVs for NV12 texture planes (native format)
CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(m_nv12SrvHeap->GetCPUDescriptorHandleForHeapStart());
@@ -1012,6 +1064,7 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
uvPlaneSrvDesc.Texture2D.PlaneSlice = 1; // UV plane
m_device->CreateShaderResourceView(m_nv12Texture.Get(), &uvPlaneSrvDesc, srvHandle);
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] NV12 SRVs created (Y + UV planes)\n");
std::cout << "[SimpleGPURenderer::RenderNV12Texture] NV12 native format SRVs created (Y + UV planes)" << std::endl;
}
@@ -1048,13 +1101,11 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
m_nv12ConstantBuffer->Unmap(0, nullptr);
}
- // Add UAV barrier to ensure CUDA write completion before D3D12 read
- D3D12_RESOURCE_BARRIER uavBarrier = {};
- uavBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
- uavBarrier.UAV.pResource = m_nv12Texture.Get();
- m_commandList->ResourceBarrier(1, &uavBarrier);
+ // GPU-side synchronization is handled by the shared D3D12 fence: the m_commandQueue->Wait()
+ // at the top of this function stalls the queue until the decoder's CUDA copy signals fenceValue.
+ // Therefore, no additional GPU-side synchronization barrier (UAV or Aliasing) is needed here.
- // Transition NV12 texture (R8 layout) from COMMON to PIXEL_SHADER_RESOURCE
+ // Transition NV12 texture from COMMON (where CUDA left it) to PIXEL_SHADER_RESOURCE for reading.
D3D12_RESOURCE_BARRIER nv12TextureBarrier = {};
nv12TextureBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
nv12TextureBarrier.Transition.pResource = m_nv12Texture.Get();
@@ -1098,7 +1149,9 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
m_commandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
// Draw full-screen quad (4 vertices, triangle strip)
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Calling DrawInstanced(4, 1, 0, 0)...\n");
m_commandList->DrawInstanced(4, 1, 0, 0);
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] DrawInstanced completed\n");
// Transition back buffer from RENDER_TARGET to PRESENT
D3D12_RESOURCE_BARRIER rtToPresentBarrier = {};
@@ -1115,11 +1168,20 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
m_commandList->ResourceBarrier(1, &nv12TextureBarrier);
// Close and execute command list
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Closing command list...\n");
hr = m_commandList->Close();
- if (FAILED(hr)) return hr;
+ if (FAILED(hr)) {
+ char buf[256];
+ sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] CommandList Close failed: HRESULT = 0x%08X\n", hr);
+ OutputDebugStringA(buf);
+ return hr;
+ }
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] CommandList closed successfully\n");
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Executing command list on GPU...\n");
ID3D12CommandList* commandLists[] = { m_commandList.Get() };
m_commandQueue->ExecuteCommandLists(1, commandLists);
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Command list executed\n");
// Signal completion fence BEFORE Present()
const UINT64 currentFrameFenceValue = ++m_fenceValue;
@@ -1128,11 +1190,16 @@ HRESULT SimpleGPURenderer::RenderNV12TextureToBackBuffer()
if (FAILED(hr)) return hr;
// Present the frame to screen (VSync enabled)
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Calling Present()...\n");
hr = m_swapChain->Present(1, 0);
if (FAILED(hr)) {
+ char buf[256];
+ sprintf_s(buf, "[SimpleGPURenderer::RenderNV12Texture] Present failed: HRESULT = 0x%08X\n", hr);
+ OutputDebugStringA(buf);
std::cout << "[SimpleGPURenderer::RenderNV12Texture] Present failed: 0x" << std::hex << hr << std::endl;
return hr;
}
+ OutputDebugStringA("[SimpleGPURenderer::RenderNV12Texture] Present succeeded\n");
// Advance to next frame (swapchain selects next back buffer)
m_frameIndex = m_swapChain->GetCurrentBackBufferIndex();
diff --git a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h
index 2f72855..663a361 100644
--- a/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h
+++ b/vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/SimpleGPURenderer.h
@@ -59,12 +59,19 @@ public:
// Returns NV12 texture - native DXGI_FORMAT_NV12 for proper CUDA interop
ID3D12Resource* GetNV12TextureForCUDAInterop() const { return m_nv12Texture.Get(); }
+ // Set the shared fence for CUDA-D3D12 synchronization
+ // NOTE: stores the raw ID3D12Fence* without AddRef/Release — the caller must keep the fence alive for the renderer's lifetime
+ void SetSyncFence(void* fence);
+
+ // Release NV12 texture (cleanup partial initialization)
+ void ReleaseNV12Texture();
+
// Create NV12 texture for CUDA-D3D12 interop
// Format: DXGI_FORMAT_NV12 (native 2-plane YUV format)
HRESULT CreateNV12TextureR8Layout(uint32_t videoWidth, uint32_t videoHeight);
// Render NV12 texture to back buffer (YUV to RGB conversion on GPU)
- HRESULT RenderNV12TextureToBackBuffer();
+ HRESULT RenderNV12TextureToBackBuffer(uint64_t fenceValue);
private:
// D3D12 core objects
@@ -73,6 +80,9 @@ private:
ComPtr<IDXGISwapChain3> m_swapChain;
ComPtr<ID3D12DescriptorHeap> m_rtvHeap;
+ // Synchronization with VavCore (CUDA)
+ void* m_syncFence = nullptr; // ID3D12Fence*
+
// Command objects - Double buffering (changed from 3 for performance test)
static const UINT FrameCount = 2;
ComPtr<ID3D12CommandAllocator> m_commandAllocators[FrameCount];
diff --git a/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h b/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h
index c53041b..031753a 100644
--- a/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h
+++ b/vav2/platforms/windows/vavcore/include/VavCore/VavCore.h
@@ -87,6 +87,9 @@ typedef struct {
uint64_t timestamp_us; // Timestamp in microseconds
uint64_t frame_number; // Frame sequence number
+ // GPU synchronization
+ uint64_t sync_fence_value; // Fence value to wait on before using the surface
+
// New D3D surface fields
VavCoreSurfaceType surface_type;
union {
@@ -249,6 +252,7 @@ VAVCORE_API VavCoreResult vavcore_decode_to_surface(VavCorePlayer* player,
// Platform-specific Graphics API setup functions
// Windows
VAVCORE_API VavCoreResult vavcore_set_d3d_device(VavCorePlayer* player, void* d3d_device, VavCoreSurfaceType type);
+VAVCORE_API void* vavcore_get_sync_fence(VavCorePlayer* player); // Returns ID3D12Fence*
// Android
VAVCORE_API VavCoreResult vavcore_set_android_surface(VavCorePlayer* player, void* native_window);
diff --git a/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h b/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h
index 4caeedb..6c56624 100644
--- a/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h
+++ b/vav2/platforms/windows/vavcore/src/Common/VideoTypes.h
@@ -98,6 +98,9 @@ struct VideoFrame {
bool is_keyframe = false;
bool is_valid = false;
+ // GPU Synchronization
+ uint64_t sync_fence_value = 0;
+
// Constructor
VideoFrame() = default;
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h b/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h
index 55504ef..d8c9d4f 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/IVideoDecoder.h
@@ -44,6 +44,11 @@ public:
return false; // Default implementation: D3D not supported
}
+ // Return shared D3D12 fence for synchronization
+ virtual void* GetSyncFence() {
+ return nullptr; // Default implementation: No fence available
+ }
+
// Android
virtual bool SetAndroidSurface(void* native_window) {
return false; // Default implementation: Android surfaces not supported
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
index a667bc2..17fa8be 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
@@ -20,6 +20,90 @@
#include "NVDECAV1Decoder.h"
#include "VideoDecoderFactory.h"
+const char* g_deinterleave_kernel_ptx = R"PTX(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36424714
+// Cuda compilation tools, release 13.0, V13.0.88
+// Based on NVVM 7.0.1
+//
+
+.version 9.0
+.target sm_75
+.address_size 64
+
+ // .globl DeinterleaveUV_Kernel
+
+.visible .entry DeinterleaveUV_Kernel(
+ .param .u64 DeinterleaveUV_Kernel_param_0,
+ .param .u32 DeinterleaveUV_Kernel_param_1,
+ .param .u64 DeinterleaveUV_Kernel_param_2,
+ .param .u32 DeinterleaveUV_Kernel_param_3,
+ .param .u64 DeinterleaveUV_Kernel_param_4,
+ .param .u32 DeinterleaveUV_Kernel_param_5,
+ .param .u32 DeinterleaveUV_Kernel_param_6,
+ .param .u32 DeinterleaveUV_Kernel_param_7
+)
+{
+ .reg .pred %p<4>;
+ .reg .b16 %rs<3>;
+ .reg .b32 %r<18>;
+ .reg .b64 %rd<18>;
+
+
+ ld.param.u64 %rd1, [DeinterleaveUV_Kernel_param_0];
+ ld.param.u32 %r3, [DeinterleaveUV_Kernel_param_1];
+ ld.param.u64 %rd2, [DeinterleaveUV_Kernel_param_2];
+ ld.param.u32 %r4, [DeinterleaveUV_Kernel_param_3];
+ ld.param.u64 %rd3, [DeinterleaveUV_Kernel_param_4];
+ ld.param.u32 %r5, [DeinterleaveUV_Kernel_param_5];
+ ld.param.u32 %r6, [DeinterleaveUV_Kernel_param_6];
+ ld.param.u32 %r7, [DeinterleaveUV_Kernel_param_7];
+ mov.u32 %r8, %ntid.x;
+ mov.u32 %r9, %ctaid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r1, %r9, %r8, %r10;
+ mov.u32 %r11, %ntid.y;
+ mov.u32 %r12, %ctaid.y;
+ mov.u32 %r13, %tid.y;
+ mad.lo.s32 %r2, %r12, %r11, %r13;
+ setp.ge.s32 %p1, %r1, %r6;
+ setp.ge.s32 %p2, %r2, %r7;
+ or.pred %p3, %p1, %p2;
+ @%p3 bra $L__BB0_2;
+
+ cvta.to.global.u64 %rd4, %rd2;
+ mul.lo.s32 %r14, %r2, %r3;
+ cvt.s64.s32 %rd5, %r14;
+ shl.b32 %r15, %r1, 1;
+ cvt.s64.s32 %rd6, %r15;
+ add.s64 %rd7, %rd5, %rd6;
+ cvta.to.global.u64 %rd8, %rd1;
+ add.s64 %rd9, %rd8, %rd7;
+ mul.lo.s32 %r16, %r2, %r4;
+ cvt.s64.s32 %rd10, %r16;
+ cvt.s64.s32 %rd11, %r1;
+ add.s64 %rd12, %rd10, %rd11;
+ add.s64 %rd13, %rd4, %rd12;
+ mul.lo.s32 %r17, %r2, %r5;
+ cvt.s64.s32 %rd14, %r17;
+ add.s64 %rd15, %rd14, %rd11;
+ cvta.to.global.u64 %rd16, %rd3;
+ add.s64 %rd17, %rd16, %rd15;
+ ld.global.u8 %rs1, [%rd9];
+ st.global.u8 [%rd13], %rs1;
+ ld.global.u8 %rs2, [%rd9+1];
+ st.global.u8 [%rd17], %rs2;
+
+$L__BB0_2:
+ ret;
+
+}
+
+
+)PTX";
+
namespace VavCore {
NVDECAV1Decoder::NVDECAV1Decoder()
@@ -73,6 +157,22 @@ bool NVDECAV1Decoder::Initialize(const VideoMetadata& metadata) {
return false;
}
+ // Load the PTX module for the deinterleave kernel
+ CUresult result = cuModuleLoadData(&m_module, g_deinterleave_kernel_ptx);
+ if (result != CUDA_SUCCESS) {
+ LogError("Failed to load CUDA module from PTX");
+ LogCUDAError(result, "cuModuleLoadData");
+ Cleanup();
+ return false;
+ }
+ result = cuModuleGetFunction(&m_kernel, m_module, "DeinterleaveUV_Kernel");
+ if (result != CUDA_SUCCESS) {
+ LogError("Failed to get CUDA kernel function from module");
+ LogCUDAError(result, "cuModuleGetFunction");
+ Cleanup();
+ return false;
+ }
+
m_initialized = true;
std::cout << "[NVDECAV1Decoder] Initialized successfully" << std::endl;
@@ -92,6 +192,21 @@ void NVDECAV1Decoder::Cleanup() {
m_cachedD3D12Resource = nullptr;
}
+ // Clean up D3D12 synchronization objects
+ if (m_cudaSemaphore != nullptr) {
+ // NOTE(review): per CUDA docs, importing an NT handle does NOT transfer ownership to CUDA;
+ // destroying the semaphore only releases CUDA's reference — the handle itself must still be closed by the app
+ cudaDestroyExternalSemaphore(m_cudaSemaphore);
+ m_cudaSemaphore = nullptr;
+ }
+ // NOTE(review): CUDA does not take ownership of imported NT handles — m_d3d12FenceSharedHandle should be CloseHandle()'d here or it leaks; TODO confirm against cudaImportExternalSemaphore docs
+ m_d3d12FenceSharedHandle = nullptr;
+
+ if (m_d3d12Fence != nullptr) {
+ static_cast<ID3D12Fence*>(m_d3d12Fence)->Release();
+ m_d3d12Fence = nullptr;
+ }
+
if (m_parser) {
cuvidDestroyVideoParser(m_parser);
m_parser = nullptr;
@@ -102,6 +217,11 @@ void NVDECAV1Decoder::Cleanup() {
m_decoder = nullptr;
}
+ if (m_module) {
+ cuModuleUnload(m_module);
+ m_module = nullptr;
+ }
+
CleanupCUDA();
m_initialized = false;
}
@@ -150,12 +270,12 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size
int frameIdx = -1;
{
std::unique_lock lock(m_frameQueueMutex);
- m_frameAvailable.wait(lock, [this] {
- return !m_decodedFrameIndices.empty();
- });
+ if (!m_frameAvailable.wait_for(lock, std::chrono::milliseconds(100), [this] { return !m_decodedFrameIndices.empty(); })) {
+ LogError("Timeout waiting for decoded frame");
+ return false; // Timeout
+ }
frameIdx = m_decodedFrameIndices.front();
m_decodedFrameIndices.pop();
- OutputDebugStringA("[DecodeFrame] Frame retrieved from queue\n");
}
// Map the decoded frame from GPU memory
@@ -172,14 +292,16 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size
// Allocate CPU memory for YUV planes
size_t y_size = m_width * m_height;
- size_t uv_size = (m_width / 2) * (m_height / 2);
+ int uv_width = m_width / 2;
+ int uv_height = m_height / 2;
+ size_t uv_size = static_cast<size_t>(uv_width) * uv_height;
output_frame.y_plane.reset(new uint8_t[y_size]);
output_frame.u_plane.reset(new uint8_t[uv_size]);
output_frame.v_plane.reset(new uint8_t[uv_size]);
output_frame.y_stride = m_width;
- output_frame.u_stride = m_width / 2;
- output_frame.v_stride = m_width / 2;
+ output_frame.u_stride = uv_width;
+ output_frame.v_stride = uv_width;
// Copy Y plane from GPU to CPU
CUDA_MEMCPY2D copyParamsY = {};
@@ -199,39 +321,72 @@ bool NVDECAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size
return false;
}
- // NV12 to YUV420P: Extract U and V from interleaved UV
- // UV plane starts after Y plane in NV12 format
- CUdeviceptr uvSrcPtr = srcDevicePtr + (srcPitch * m_height);
-
- // Copy UV plane to temporary buffer
- size_t uv_pitch = m_width;
- std::unique_ptr uv_temp(new uint8_t[uv_pitch * (m_height / 2)]);
-
- CUDA_MEMCPY2D copyParamsUV = {};
- copyParamsUV.srcMemoryType = CU_MEMORYTYPE_DEVICE;
- copyParamsUV.srcDevice = uvSrcPtr;
- copyParamsUV.srcPitch = srcPitch;
- copyParamsUV.dstMemoryType = CU_MEMORYTYPE_HOST;
- copyParamsUV.dstHost = uv_temp.get();
- copyParamsUV.dstPitch = uv_pitch;
- copyParamsUV.WidthInBytes = m_width; // NV12 UV is interleaved
- copyParamsUV.Height = m_height / 2;
-
- result = cuMemcpy2D(©ParamsUV);
- if (result != CUDA_SUCCESS) {
+ // --- GPU-accelerated UV De-interleaving ---
+ uint8_t *d_u_plane = nullptr, *d_v_plane = nullptr;
+ cudaError_t cudaStatus = cudaMalloc(&d_u_plane, uv_size);
+ if (cudaStatus != cudaSuccess) {
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
- LogCUDAError(result, "cuMemcpy2D (UV plane)");
+ LogError("cudaMalloc failed for U plane");
+ return false;
+ }
+ cudaStatus = cudaMalloc(&d_v_plane, uv_size);
+ if (cudaStatus != cudaSuccess) {
+ cudaFree(d_u_plane);
+ cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ LogError("cudaMalloc failed for V plane");
return false;
}
- // Deinterleave UV to separate U and V planes
- for (uint32_t i = 0; i < m_height / 2; i++) {
- for (uint32_t j = 0; j < m_width / 2; j++) {
- output_frame.u_plane[i * (m_width / 2) + j] = uv_temp[i * uv_pitch + j * 2];
- output_frame.v_plane[i * (m_width / 2) + j] = uv_temp[i * uv_pitch + j * 2 + 1];
- }
+ CUdeviceptr uvSrcPtr = srcDevicePtr + (srcPitch * m_height);
+ dim3 blockDim(16, 16);
+ dim3 gridDim((uv_width + blockDim.x - 1) / blockDim.x, (uv_height + blockDim.y - 1) / blockDim.y);
+
+ int u_pitch = uv_width;
+ int v_pitch = uv_width;
+ void* args[] = { &uvSrcPtr, &srcPitch, &d_u_plane, &u_pitch, &d_v_plane, &v_pitch, &uv_width, &uv_height };
+
+ result = cuLaunchKernel(m_kernel, gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, 0, m_stream, args, nullptr);
+ if (result != CUDA_SUCCESS) {
+ cudaFree(d_u_plane);
+ cudaFree(d_v_plane);
+ cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ LogCUDAError(result, "cuLaunchKernel(DeinterleaveUV_Kernel)");
+ return false;
}
+ // Synchronize stream to ensure kernel completion before copy
+ result = cuStreamSynchronize(m_stream);
+ if (result != CUDA_SUCCESS) {
+ cudaFree(d_u_plane);
+ cudaFree(d_v_plane);
+ cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ LogCUDAError(result, "cuStreamSynchronize after DeinterleaveUV_Kernel");
+ return false;
+ }
+
+ // Copy separated U and V planes from GPU to CPU
+ cudaStatus = cudaMemcpy(output_frame.u_plane.get(), d_u_plane, uv_size, cudaMemcpyDeviceToHost);
+ if (cudaStatus != cudaSuccess) {
+ cudaFree(d_u_plane);
+ cudaFree(d_v_plane);
+ cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ LogError("cudaMemcpy failed for U plane");
+ return false;
+ }
+
+ cudaStatus = cudaMemcpy(output_frame.v_plane.get(), d_v_plane, uv_size, cudaMemcpyDeviceToHost);
+ if (cudaStatus != cudaSuccess) {
+ cudaFree(d_u_plane);
+ cudaFree(d_v_plane);
+ cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ LogError("cudaMemcpy failed for V plane");
+ return false;
+ }
+
+ // Cleanup temporary GPU buffers
+ cudaFree(d_u_plane);
+ cudaFree(d_v_plane);
+
// Unmap the frame
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
@@ -679,6 +834,74 @@ bool NVDECAV1Decoder::SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) {
m_d3d12Device = d3d_device;
m_preferredSurfaceType = type;
+ // Create synchronization objects for D3D12 interop if not already created
+ if (m_d3d12Fence == nullptr) {
+ OutputDebugStringA("[SetD3DDevice] Creating D3D12 Fence for GPU synchronization...\n");
+ ID3D12Device* device = static_cast<ID3D12Device*>(m_d3d12Device);
+ HRESULT hr = device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(reinterpret_cast<ID3D12Fence**>(&m_d3d12Fence)));
+ if (FAILED(hr)) {
+ LogError("Failed to create D3D12 Fence for CUDA interop");
+ OutputDebugStringA("[SetD3DDevice] ERROR: D3D12 Fence creation FAILED\n");
+ return false;
+ }
+
+ char fence_buf[256];
+ sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence created successfully: %p\n", m_d3d12Fence);
+ OutputDebugStringA(fence_buf);
+
+ // Create shared handle for the fence so CUDA can access it
+ // IMPORTANT: Must use SECURITY_ATTRIBUTES with bInheritHandle = FALSE for NT handles
+ SECURITY_ATTRIBUTES secAttr = {};
+ secAttr.nLength = sizeof(SECURITY_ATTRIBUTES);
+ secAttr.bInheritHandle = FALSE;
+ secAttr.lpSecurityDescriptor = nullptr;
+
+ HANDLE sharedHandle = nullptr;
+ hr = device->CreateSharedHandle(
+ static_cast<ID3D12Fence*>(m_d3d12Fence),
+ &secAttr,
+ GENERIC_ALL,
+ nullptr,
+ &sharedHandle
+ );
+ if (FAILED(hr) || sharedHandle == nullptr) {
+ LogError("Failed to create shared handle for D3D12 Fence");
+ OutputDebugStringA("[SetD3DDevice] ERROR: D3D12 Fence shared handle creation FAILED\n");
+ static_cast<ID3D12Fence*>(m_d3d12Fence)->Release();
+ m_d3d12Fence = nullptr;
+ return false;
+ }
+
+ sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence shared handle created: %p\n", sharedHandle);
+ OutputDebugStringA(fence_buf);
+
+ cudaExternalSemaphoreHandleDesc semDesc = {};
+ semDesc.type = cudaExternalSemaphoreHandleTypeD3D12Fence;
+ semDesc.handle.win32.handle = sharedHandle;
+ semDesc.flags = 0;
+
+ cudaError_t cudaStatus = cudaImportExternalSemaphore(&m_cudaSemaphore, &semDesc);
+ if (cudaStatus != cudaSuccess) {
+ char cuda_err[256];
+ sprintf_s(cuda_err, "[SetD3DDevice] CUDA semaphore import failed with error: %d\n", cudaStatus);
+ OutputDebugStringA(cuda_err);
+ LogError("Failed to import D3D12 Fence as CUDA semaphore");
+ CloseHandle(sharedHandle);
+ static_cast<ID3D12Fence*>(m_d3d12Fence)->Release();
+ m_d3d12Fence = nullptr;
+ return false;
+ }
+ OutputDebugStringA("[SetD3DDevice] CUDA semaphore imported successfully\n");
+
+ // Store the shared handle for cleanup later
+ m_d3d12FenceSharedHandle = sharedHandle;
+ } else {
+ char fence_buf[256];
+ sprintf_s(fence_buf, "[SetD3DDevice] D3D12 Fence already exists: %p\n", m_d3d12Fence);
+ OutputDebugStringA(fence_buf);
+ }
+
+
// DON'T reinitialize if already initialized - this would invalidate existing decoded frames
if (m_initialized) {
OutputDebugStringA("[SetD3DDevice] Decoder already initialized, D3D12 device set for DecodeToSurface\n");
@@ -1111,7 +1334,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Copying Y plane to D3D12 texture...\n");
cudaError_t cudaStatus = cudaMemcpy2D(devicePtr, d3d12Pitch, // Use D3D12 texture pitch
(void*)srcDevicePtr, srcPitch,
- rowSizeInBytes, m_height, // Use rowSizeInBytes for width
+ srcPitch, m_height, // NOTE(review): copies srcPitch bytes/row — overruns destination rows if srcPitch > d3d12Pitch; verify the pitches match
cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (Y plane) failed with code %d: %s\n",
@@ -1171,17 +1394,23 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
// CRITICAL: Use the EXACT UV plane offset from GetCopyableFootprints
// planeLayouts[1].Offset is the correct offset accounting for alignment
void* uvDstPtr = static_cast(devicePtr) + planeLayouts[1].Offset;
+
+ // NVDEC UV plane offset: assumes UV starts at m_height * pitch — NVDEC may pad surface height, so the chroma offset can be surfaceHeight * pitch; TODO confirm m_height equals the decoder's surface height
void* uvSrcPtr = (void*)(srcDevicePtr + (srcPitch * m_height));
+ sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] NVDEC UV source: srcDevicePtr=%p, Y_offset=%llu, UV_srcPtr=%p\n",
+ (void*)srcDevicePtr, (uint64_t)(srcPitch * m_height), uvSrcPtr);
+ OutputDebugStringA(debug_buf);
+
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] UV plane offset from GetCopyableFootprints: %llu bytes\n", planeLayouts[1].Offset);
OutputDebugStringA(debug_buf);
// UV plane: NV12 format has interleaved UV (UVUVUV...)
- // For NV12: UV plane width in pixels = original width / 2, but stored as interleaved bytes
- // So bytes per row = (width / 2) * 2 = width (same as Y plane width in bytes)
- // However, we should use the minimum of rowSize and available width
- UINT uvCopyWidth = static_cast(planeRowSizes[1]);
- UINT uvCopyHeight = planeNumRows[1];
+ // CRITICAL: For cudaMemcpy2D, width parameter is in BYTES, not pixels
+ // UV plane stores U and V interleaved: each row has (width/2) UV pairs = width bytes total
+ // Even though D3D12 sees it as (width/2) R8G8 pixels, the memory layout is still width bytes per row
+ UINT uvCopyWidth = m_width; // Copy full width in bytes (includes both U and V channels)
+ UINT uvCopyHeight = m_height / 2; // UV plane has half height
// Verify we won't exceed allocation bounds
size_t uvPlaneNeededSize = static_cast(planeLayouts[1].Footprint.RowPitch) * uvCopyHeight;
@@ -1207,7 +1436,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
cudaStatus = cudaMemcpy2D(uvDstPtr, planeLayouts[1].Footprint.RowPitch,
uvSrcPtr, srcPitch,
- uvCopyWidth, uvCopyHeight,
+ srcPitch, uvCopyHeight, // NOTE(review): copies srcPitch bytes/row, ignoring uvCopyWidth computed above — overruns destination rows if srcPitch > RowPitch; verify
cudaMemcpyDeviceToDevice);
if (cudaStatus != cudaSuccess) {
sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaMemcpy2D (UV plane) failed with code %d: %s\n",
@@ -1222,28 +1451,73 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
}
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] UV plane copied successfully to D3D12 texture\n");
- // CRITICAL: Synchronize CUDA device to ensure all GPU operations complete
- // before D3D12 rendering pipeline accesses the texture
- // cudaDeviceSynchronize() ensures ALL CUDA work completes, not just one stream
- cudaError_t syncResult = cudaDeviceSynchronize();
- if (syncResult != cudaSuccess) {
+ // Asynchronously signal the D3D12 fence from CUDA when copies are complete
+ if (m_cudaSemaphore != nullptr) {
+ // Ensure we're in the correct CUDA context
+ CUcontext currentContext = nullptr;
+ cuCtxGetCurrent(¤tContext);
+
char debug_buf[256];
- sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaDeviceSynchronize failed: %s\n",
- cudaGetErrorString(syncResult));
+ sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Current CUDA context: %p, Expected: %p\n",
+ currentContext, m_cuContext);
OutputDebugStringA(debug_buf);
+
+ if (currentContext != m_cuContext) {
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] WARNING: CUDA context mismatch, setting correct context\n");
+ cuCtxSetCurrent(m_cuContext);
+ }
+
+ m_fenceValue++;
+ cudaExternalSemaphoreSignalParams signalParams = {};
+ signalParams.params.fence.value = m_fenceValue;
+ signalParams.flags = 0;
+
+ sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Signaling fence with value: %llu, semaphore: %p, stream: %p\n",
+ m_fenceValue, m_cudaSemaphore, m_stream);
+ OutputDebugStringA(debug_buf);
+
+ cudaStatus = cudaSignalExternalSemaphoresAsync(&m_cudaSemaphore, &signalParams, 1, m_stream);
+ if (cudaStatus != cudaSuccess) {
+ sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cudaSignalExternalSemaphoresAsync failed: %s (error code: %d)\n",
+ cudaGetErrorString(cudaStatus), cudaStatus);
+ OutputDebugStringA(debug_buf);
+
+ // Try synchronizing the stream and signaling the fence from CPU instead
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Falling back to CPU fence signaling\n");
+ cudaStreamSynchronize(m_stream);
+
+ if (m_d3d12Fence != nullptr) {
+ HRESULT hr = static_cast<ID3D12Fence*>(m_d3d12Fence)->Signal(m_fenceValue);
+ if (SUCCEEDED(hr)) {
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal succeeded\n");
+ output_frame.sync_fence_value = m_fenceValue;
+ } else {
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CPU fence signal also failed\n");
+ output_frame.sync_fence_value = 0;
+ }
+ } else {
+ output_frame.sync_fence_value = 0;
+ }
+ } else {
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA fence signal enqueued successfully\n");
+ // Pass the fence value to the caller
+ output_frame.sync_fence_value = m_fenceValue;
+ }
} else {
- OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA device synchronized successfully (all GPU work complete)\n");
+ OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] No CUDA semaphore available, skipping fence signal\n");
+ // No synchronization available
+ output_frame.sync_fence_value = 0;
}
- // CRITICAL GPU-GPU SYNCHRONIZATION:
- // After CUDA write completes, cudaDeviceSynchronize() has already ensured
- // all CUDA work is complete. D3D12 will handle synchronization via resource
- // barriers when the texture is accessed by the rendering pipeline.
- // No additional D3D12 fence synchronization needed here - it would create deadlock!
- OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] CUDA-D3D12 sync complete via cudaDeviceSynchronize()\n");
-
// Unmap frame and close shared handle (if newly created)
- cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
+ if (result != CUDA_SUCCESS) {
+ sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result);
+ OutputDebugStringA(debug_buf);
+ LogCUDAError(result, "cuvidUnmapVideoFrame");
+ // Even if unmap fails, try to continue and close the handle
+ }
+
if (sharedHandle != nullptr) {
CloseHandle(sharedHandle);
OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] Shared handle closed\n");
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
index 09c4bd4..6384b8e 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
@@ -22,6 +22,7 @@
// Forward declaration for CUDA external memory type
typedef struct CUexternalMemory_st* cudaExternalMemory_t;
+typedef struct CUexternalSemaphore_st* cudaExternalSemaphore_t;
namespace VavCore {
@@ -49,6 +50,7 @@ public:
// D3D Surface decoding functionality
bool SupportsSurfaceType(VavCoreSurfaceType type) const override;
bool SetD3DDevice(void* d3d_device, VavCoreSurfaceType type) override;
+ void* GetSyncFence() override { return m_d3d12Fence; }
bool DecodeToSurface(const uint8_t* packet_data, size_t packet_size,
VavCoreSurfaceType target_type,
void* target_surface,
@@ -101,6 +103,8 @@ private:
CUdevice m_cudaDevice = 0;
CUvideoparser m_parser = nullptr;
CUstream m_stream = nullptr;
+ CUmodule m_module = nullptr;
+ CUfunction m_kernel = nullptr;
// D3D-CUDA interop
void* m_d3d11Device = nullptr; // ID3D11Device*
@@ -108,6 +112,12 @@ private:
CUgraphicsResource m_cudaGraphicsResource = nullptr;
VavCoreSurfaceType m_preferredSurfaceType = VAVCORE_SURFACE_CPU;
+ // D3D12-CUDA Asynchronous Synchronization
+ void* m_d3d12Fence = nullptr; // ID3D12Fence*
+ HANDLE m_d3d12FenceSharedHandle = nullptr; // Shared handle for CUDA interop
+ cudaExternalSemaphore_t m_cudaSemaphore = {};
+ uint64_t m_fenceValue = 0;
+
// External memory caching for D3D12 interop
void* m_cachedD3D12Resource = nullptr; // ID3D12Resource*
cudaExternalMemory_t m_cachedExternalMemory = nullptr;
diff --git a/vav2/platforms/windows/vavcore/src/VavCore.cpp b/vav2/platforms/windows/vavcore/src/VavCore.cpp
index 9abb8b7..b5373b7 100644
--- a/vav2/platforms/windows/vavcore/src/VavCore.cpp
+++ b/vav2/platforms/windows/vavcore/src/VavCore.cpp
@@ -710,6 +710,13 @@ VAVCORE_API VavCoreResult vavcore_set_d3d_device(VavCorePlayer* player, void* d3
return success ? VAVCORE_SUCCESS : VAVCORE_ERROR_NOT_SUPPORTED;
}
+VAVCORE_API void* vavcore_get_sync_fence(VavCorePlayer* player) {
+ if (!player || !player->impl || !player->impl->decoder) {
+ return nullptr;
+ }
+ return player->impl->decoder->GetSyncFence();
+}
+
VAVCORE_API VavCoreResult vavcore_decode_to_surface(VavCorePlayer* player,
VavCoreSurfaceType target_type,
void* target_surface,