WIP - Playback timing jerky
This commit is contained in:
1012
todo21.txt
1012
todo21.txt
File diff suppressed because it is too large
Load Diff
262
vav2/docs/working/Phase1_Implementation_Summary.md
Normal file
262
vav2/docs/working/Phase1_Implementation_Summary.md
Normal file
@@ -0,0 +1,262 @@
|
||||
# Phase 1 Implementation Summary - Playback Timing Fix
|
||||
|
||||
**Date**: 2025-10-07
|
||||
**Status**: ✅ Implemented, Ready for Testing
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. PlaybackController.h
|
||||
**Added**:
|
||||
- Forward declaration: `class FrameProcessor;`
|
||||
- Public method: `void SetFrameProcessor(FrameProcessor* processor)`
|
||||
- Public method: `bool IsFrameProcessing() const`
|
||||
- Private member: `FrameProcessor* m_frameProcessor = nullptr`
|
||||
|
||||
**Lines changed**: +5 lines
|
||||
|
||||
### 2. PlaybackController.cpp
|
||||
**Added**:
|
||||
- Include: `#include "FrameProcessor.h"`
|
||||
- Implementation of `IsFrameProcessing()` method
|
||||
- Frame completion wait logic in `TimingThreadLoop()`
|
||||
|
||||
**Modified**:
|
||||
```cpp
|
||||
void PlaybackController::TimingThreadLoop()
|
||||
{
|
||||
// ... existing code ...
|
||||
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Signal frame processing
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback();
|
||||
}
|
||||
|
||||
// ✅ NEW: Wait for frame processing completion (max 100ms timeout)
|
||||
int waitCount = 0;
|
||||
while (IsFrameProcessing() && waitCount < 100) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
waitCount++;
|
||||
}
|
||||
|
||||
if (waitCount >= 100) {
|
||||
LOGF_WARNING("[PlaybackController] Frame processing timeout after 100ms");
|
||||
}
|
||||
|
||||
// Update frame counter
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
|
||||
// Sleep until next frame
|
||||
std::this_thread::sleep_until(nextFrame);
|
||||
}
|
||||
}
|
||||
|
||||
bool PlaybackController::IsFrameProcessing() const
|
||||
{
|
||||
return m_frameProcessor && m_frameProcessor->IsProcessing();
|
||||
}
|
||||
```
|
||||
|
||||
**Lines changed**: +20 lines
|
||||
|
||||
### 3. VideoPlayerControl2.xaml.cpp
|
||||
**Modified**:
|
||||
```cpp
|
||||
void VideoPlayerControl2::LoadVideo(...)
|
||||
{
|
||||
// ... existing code ...
|
||||
|
||||
if (success) {
|
||||
// ✅ NEW: Link FrameProcessor to PlaybackController
|
||||
if (m_frameProcessor) {
|
||||
m_playbackController->SetFrameProcessor(m_frameProcessor.get());
|
||||
m_frameProcessor->PrepareVideoTexture(videoWidth, videoHeight);
|
||||
}
|
||||
|
||||
// ... rest of code ...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Lines changed**: +1 line
|
||||
|
||||
---
|
||||
|
||||
## Total Changes
|
||||
- **Files modified**: 3
|
||||
- **Lines added**: ~26 lines
|
||||
- **Complexity**: Very Low
|
||||
- **Risk**: Minimal
|
||||
|
||||
---
|
||||
|
||||
## How It Works
|
||||
|
||||
### Before Fix (Broken)
|
||||
```
|
||||
Timeline:
|
||||
0ms : Timing thread signals "process frame N"
|
||||
0ms : FrameProcessor starts decode
|
||||
10-15ms : CUDA decode completes
|
||||
20-25ms : UI render completes
|
||||
33.33ms : Timing thread signals "process frame N+1" (DOESN'T WAIT!)
|
||||
→ If frame N not done, m_frameProcessing check FAILS
|
||||
→ Frame N+1 DROPPED
|
||||
→ Result: Jerky playback
|
||||
```
|
||||
|
||||
### After Fix (Working)
|
||||
```
|
||||
Timeline:
|
||||
0ms : Timing thread signals "process frame N"
|
||||
0ms : FrameProcessor starts decode (m_frameProcessing = true)
|
||||
10-15ms : CUDA decode completes
|
||||
20-25ms : UI render completes
|
||||
25ms : FrameProcessor done (m_frameProcessing = false)
|
||||
25ms : Timing thread detects completion via IsFrameProcessing()
|
||||
25ms : Timing thread advances frame counter
|
||||
33.33ms : Sleep completes, signal "process frame N+1"
|
||||
→ NO FRAME DROPS
|
||||
→ Result: Smooth playback
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Build Status
|
||||
✅ **Build successful**
|
||||
- VavCore-debug.dll: Built
|
||||
- Vav2Player.exe: Built
|
||||
- No errors, no warnings (related to changes)
|
||||
|
||||
---
|
||||
|
||||
## Testing Instructions
|
||||
|
||||
### 1. Launch Application
|
||||
```bash
|
||||
cd "D:/Project/video-av1/vav2/platforms/windows/applications/vav2player/Vav2Player/x64/Debug/Vav2Player"
|
||||
./Vav2Player.exe
|
||||
```
|
||||
|
||||
### 2. Load Test Video
|
||||
- Use file picker to load: `D:/Project/video-av1/sample/simple_test.webm`
|
||||
- Or any 30fps AV1/VP9 video
|
||||
|
||||
### 3. Observe Playback
|
||||
**Expected Results**:
|
||||
- ✅ Smooth playback (no jerking)
|
||||
- ✅ Consistent 30fps
|
||||
- ✅ Zero frame drops
|
||||
|
||||
**Check Logs**:
|
||||
```
|
||||
[PlaybackController] Frame processing timeout after 100ms ← Should NOT appear
|
||||
[FrameProcessor] Frame dropped (#X) ← Should be 0
|
||||
```
|
||||
|
||||
### 4. Long Duration Test
|
||||
- Play for 5 minutes
|
||||
- Monitor for any jerking
|
||||
- Check frame drop counter: should remain 0
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
### Primary
|
||||
- [ ] Video plays smoothly (no "통통 튐")
|
||||
- [ ] No frame drops during 5-minute playback
|
||||
- [ ] User confirms smooth visual playback
|
||||
|
||||
### Secondary
|
||||
- [ ] Logs show no timeout warnings
|
||||
- [ ] Frame intervals consistent at 33.33ms
|
||||
- [ ] CPU usage same as before
|
||||
|
||||
---
|
||||
|
||||
## If Problems Occur
|
||||
|
||||
### Rollback Procedure
|
||||
1. Revert PlaybackController.h:
|
||||
- Remove forward declaration
|
||||
- Remove SetFrameProcessor and IsFrameProcessing methods
|
||||
- Remove m_frameProcessor member
|
||||
|
||||
2. Revert PlaybackController.cpp:
|
||||
- Remove FrameProcessor.h include
|
||||
- Restore original TimingThreadLoop
|
||||
- Remove IsFrameProcessing implementation
|
||||
|
||||
3. Revert VideoPlayerControl2.xaml.cpp:
|
||||
- Remove SetFrameProcessor call
|
||||
|
||||
**Time**: < 5 minutes
|
||||
|
||||
### Alternative: Git Revert
|
||||
```bash
|
||||
git diff HEAD # Review changes
|
||||
git checkout -- <file> # Revert specific file
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Next Steps (Phase 2)
|
||||
|
||||
**If Phase 1 successful**:
|
||||
- Proceed with Fence removal (optional cleanup)
|
||||
- See: `Playback_Timing_Fix_Design.md` Phase 2
|
||||
|
||||
**If Phase 1 issues**:
|
||||
- Debug specific problem
|
||||
- Check FrameProcessor::IsProcessing() behavior
|
||||
- Verify m_frameProcessing atomic flag operations
|
||||
|
||||
---
|
||||
|
||||
## Technical Notes
|
||||
|
||||
### Thread Safety
|
||||
- `IsFrameProcessing()` reads atomic bool `m_frameProcessing` ✅ Thread-safe
|
||||
- `SetFrameProcessor()` called during initialization only ✅ No race condition
|
||||
- Non-owning pointer (FrameProcessor lifetime > PlaybackController) ✅ Safe
|
||||
|
||||
### Performance Impact
|
||||
- Wait loop: 1ms sleep per iteration
|
||||
- Typical wait: 20-25ms (20-25 iterations)
|
||||
- CPU overhead: Negligible (< 1%)
|
||||
|
||||
### Timeout Behavior
|
||||
- Max wait: 100ms (100 iterations)
|
||||
- If timeout occurs: Warning logged, playback continues
|
||||
- Prevents infinite loop if frame processor hangs
|
||||
|
||||
---
|
||||
|
||||
## Logs to Monitor
|
||||
|
||||
### Good Playback
|
||||
```
|
||||
[PlaybackController] Timing thread started
|
||||
[FrameProcessor] ProcessFrame START (decoded: 0, dropped: 0)
|
||||
[FrameProcessor] Decode SUCCESS - frame decoded
|
||||
[FrameProcessor] Render succeeded
|
||||
[FrameProcessor] CLEARING m_frameProcessing flag
|
||||
[PlaybackController] [No timeout warnings]
|
||||
```
|
||||
|
||||
### Bad Playback (Should NOT see)
|
||||
```
|
||||
[PlaybackController] Frame processing timeout after 100ms
|
||||
[FrameProcessor] Frame dropped (#1) - previous frame still processing
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Contact
|
||||
**Implementation**: Claude Code
|
||||
**Review**: User
|
||||
**Date**: 2025-10-07
|
||||
444
vav2/docs/working/Playback_Timing_Fix_Design.md
Normal file
444
vav2/docs/working/Playback_Timing_Fix_Design.md
Normal file
@@ -0,0 +1,444 @@
|
||||
# Playback Timing Fix Design
|
||||
|
||||
**Date**: 2025-10-07
|
||||
**Author**: Claude Code
|
||||
**Status**: Design Phase
|
||||
|
||||
## Problem Statement
|
||||
|
||||
### Symptom
|
||||
Video playback is jerky ("통통 튄다") at 30fps despite NVDEC hardware acceleration working correctly.
|
||||
|
||||
### Root Cause Analysis
|
||||
|
||||
**NOT the problem**:
|
||||
- ❌ CUDA synchronization (cuStreamSynchronize is fine)
|
||||
- ❌ Fence implementation (works correctly but unnecessary)
|
||||
- ❌ GPU decode performance (10-15ms, well within budget)
|
||||
|
||||
**ACTUAL problem**:
|
||||
- ✅ **PlaybackController::TimingThreadLoop()** does NOT wait for frame processing completion
|
||||
- ✅ Timing thread sends "next frame" signal every 33.33ms regardless of previous frame status
|
||||
- ✅ FrameProcessor drops frames when previous frame still processing (m_frameProcessing flag)
|
||||
|
||||
### Evidence
|
||||
|
||||
**PlaybackController.cpp:249-275** - Current broken implementation:
|
||||
```cpp
|
||||
void PlaybackController::TimingThreadLoop()
|
||||
{
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
// 1. Signal frame processing
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback(); // → ProcessFrame()
|
||||
}
|
||||
|
||||
// 2. ❌ IMMEDIATE advance without waiting!
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
|
||||
// 3. Sleep until next frame
|
||||
std::this_thread::sleep_until(nextFrame);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**FrameProcessor.cpp:47-53** - Frame drop mechanism:
|
||||
```cpp
|
||||
bool FrameProcessor::ProcessFrame(...)
|
||||
{
|
||||
// Check if previous frame still processing
|
||||
bool expected = false;
|
||||
if (!m_frameProcessing.compare_exchange_strong(expected, true)) {
|
||||
m_framesDropped++; // ← This causes jerky playback!
|
||||
return false;
|
||||
}
|
||||
// ... decode and render ...
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Solution Design
|
||||
|
||||
### Approach 1: Fix TimingThreadLoop (RECOMMENDED)
|
||||
|
||||
**Minimal change, keeps existing architecture**
|
||||
|
||||
#### Modified PlaybackController.cpp
|
||||
```cpp
|
||||
void PlaybackController::TimingThreadLoop()
|
||||
{
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// 1. Signal frame processing
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback();
|
||||
}
|
||||
|
||||
// 2. ✅ WAIT for frame processing completion (max 100ms timeout)
|
||||
int waitCount = 0;
|
||||
while (IsFrameProcessing() && waitCount < 100) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
waitCount++;
|
||||
}
|
||||
|
||||
if (waitCount >= 100) {
|
||||
LOGF_WARNING("[PlaybackController] Frame processing timeout");
|
||||
}
|
||||
|
||||
// 3. Advance frame counter
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
|
||||
// 4. Sleep for remaining time to maintain 30fps
|
||||
auto frameEnd = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration<double, std::milli>(frameEnd - frameStart).count();
|
||||
|
||||
auto nextFrameTime = start + std::chrono::microseconds(
|
||||
static_cast<long long>(33333.0 * m_currentFrame));
|
||||
std::this_thread::sleep_until(nextFrameTime);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Required Changes
|
||||
|
||||
**PlaybackController.h**:
|
||||
```cpp
|
||||
class PlaybackController
|
||||
{
|
||||
// Add frame processor reference
|
||||
void SetFrameProcessor(FrameProcessor* processor) { m_frameProcessor = processor; }
|
||||
bool IsFrameProcessing() const {
|
||||
return m_frameProcessor && m_frameProcessor->IsProcessing();
|
||||
}
|
||||
|
||||
private:
|
||||
FrameProcessor* m_frameProcessor = nullptr; // Non-owning pointer
|
||||
};
|
||||
```
|
||||
|
||||
**VideoPlayerControl2.xaml.cpp**:
|
||||
```cpp
|
||||
void VideoPlayerControl2::LoadVideo(...)
|
||||
{
|
||||
// ... existing code ...
|
||||
|
||||
// Link PlaybackController to FrameProcessor
|
||||
m_playbackController->SetFrameProcessor(m_frameProcessor.get());
|
||||
}
|
||||
```
|
||||
|
||||
#### Benefits
|
||||
- ✅ Minimal code change (10-15 lines)
|
||||
- ✅ Keeps existing architecture intact
|
||||
- ✅ Easy to test and rollback
|
||||
- ✅ No risk to other components
|
||||
- ✅ Frame drops eliminated
|
||||
|
||||
---
|
||||
|
||||
### Approach 2: Remove Fence + Fix Timing (COMPREHENSIVE)
|
||||
|
||||
**Reduces complexity while fixing the root cause**
|
||||
|
||||
#### Changes
|
||||
|
||||
**1. Remove CUDA-D3D12 Fence** (complexity reduction):
|
||||
|
||||
Delete:
|
||||
- `D3D12SurfaceHandler::SetD3D12Fence()`
|
||||
- `D3D12SurfaceHandler::SignalD3D12Fence()`
|
||||
- `vavcore_get_sync_fence()` API
|
||||
- `VavCoreVideoFrame::sync_fence_value` field
|
||||
|
||||
Replace with:
|
||||
```cpp
|
||||
// NVDECAV1Decoder.cpp
|
||||
VavCoreResult NVDECAV1Decoder::DecodeToSurface(...)
|
||||
{
|
||||
// ... decode logic ...
|
||||
|
||||
// Copy RGBA to D3D12 texture
|
||||
m_d3d12Handler->CopyRGBAFrame(...);
|
||||
|
||||
// ✅ Simple synchronous wait (15ms max)
|
||||
cuStreamSynchronize(m_stream);
|
||||
|
||||
// ❌ Remove fence signaling
|
||||
// m_d3d12Handler->SignalD3D12Fence(...);
|
||||
|
||||
return VAVCORE_SUCCESS;
|
||||
}
|
||||
```
|
||||
|
||||
**2. Fix TimingThreadLoop** (same as Approach 1)
|
||||
|
||||
**3. Keep D3D12 Internal Fence** (required for SwapChain):
|
||||
- ✅ Keep `D3D12VideoRenderer::m_fence`
|
||||
- ✅ Keep `WaitForFrameCompletion()` for triple buffering
|
||||
|
||||
#### Benefits
|
||||
- ✅ Reduced complexity (remove External Fence)
|
||||
- ✅ Fixes root cause (timing)
|
||||
- ✅ Same performance (15ms CPU block < 33.33ms budget)
|
||||
- ✅ Easier debugging (synchronous flow)
|
||||
|
||||
#### Risks
|
||||
- ⚠️ More code changes (3 components)
|
||||
- ⚠️ Requires more thorough testing
|
||||
|
||||
---
|
||||
|
||||
## Decision Matrix
|
||||
|
||||
| Aspect | Approach 1 (Fix Timing Only) | Approach 2 (Remove Fence + Fix) |
|
||||
|--------|------------------------------|----------------------------------|
|
||||
| Code Changes | 10-15 lines | ~50 lines |
|
||||
| Complexity Reduction | None | Medium |
|
||||
| Performance Impact | None | None (15ms < 33.33ms) |
|
||||
| Risk Level | Very Low | Low |
|
||||
| Testing Effort | Minimal | Moderate |
|
||||
| **Recommendation** | ✅ Start here | Follow-up refactor |
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Fix Timing (Immediate)
|
||||
|
||||
**Goal**: Stop jerky playback
|
||||
|
||||
**Tasks**:
|
||||
1. Modify `PlaybackController::TimingThreadLoop()` to wait for frame completion
|
||||
2. Add `SetFrameProcessor()` method to PlaybackController
|
||||
3. Link FrameProcessor in VideoPlayerControl2
|
||||
4. Test with sample video
|
||||
|
||||
**Files Modified**:
|
||||
- `PlaybackController.h` (add 3 lines)
|
||||
- `PlaybackController.cpp` (modify TimingThreadLoop, ~15 lines)
|
||||
- `VideoPlayerControl2.xaml.cpp` (add 1 line)
|
||||
|
||||
**Expected Outcome**: Smooth 30fps playback
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Remove Fence (Follow-up)
|
||||
|
||||
**Goal**: Reduce technical debt
|
||||
|
||||
**Tasks**:
|
||||
1. Remove External Fence from D3D12SurfaceHandler
|
||||
2. Replace with `cuStreamSynchronize()` in NVDECAV1Decoder
|
||||
3. Remove `vavcore_get_sync_fence()` from VavCore API
|
||||
4. Remove fence wait from D3D12VideoRenderer
|
||||
5. Update VavCoreVideoFrame struct
|
||||
|
||||
**Files Modified**:
|
||||
- `D3D12SurfaceHandler.h/.cpp`
|
||||
- `NVDECAV1Decoder.cpp`
|
||||
- `D3D12VideoRenderer.cpp`
|
||||
- `VavCore.h` (C API)
|
||||
- `VideoTypes.h` (internal)
|
||||
|
||||
**Expected Outcome**: Same performance, simpler code
|
||||
|
||||
---
|
||||
|
||||
## Performance Analysis
|
||||
|
||||
### Frame Processing Timeline (30fps = 33.33ms budget)
|
||||
|
||||
**Current (Broken)**:
|
||||
```
|
||||
0ms : Timing thread signals frame N
|
||||
0ms : ProcessFrame(N) starts
|
||||
10ms : CUDA decode completes (async)
|
||||
15ms : UI render starts
|
||||
25ms : Frame N complete
|
||||
33.33ms: Timing thread signals frame N+1 (regardless of N status!)
|
||||
→ If frame N not done, FRAME DROP occurs
|
||||
```
|
||||
|
||||
**After Fix**:
|
||||
```
|
||||
0ms : Timing thread signals frame N
|
||||
0ms : ProcessFrame(N) starts
|
||||
10ms : CUDA decode completes
|
||||
15ms : UI render starts
|
||||
25ms : Frame N complete
|
||||
25ms : Timing thread detects completion, advances counter
|
||||
33.33ms: Sleep completes, signal frame N+1
|
||||
→ NO FRAME DROPS
|
||||
```
|
||||
|
||||
### Budget Analysis
|
||||
- NVDEC decode: 10-15ms
|
||||
- UI render: 5-10ms
|
||||
- **Total**: 15-25ms
|
||||
- **Budget**: 33.33ms (30fps)
|
||||
- **Margin**: 8-18ms spare time
|
||||
|
||||
**Conclusion**: We have plenty of time budget, synchronous processing is perfectly fine.
|
||||
|
||||
---
|
||||
|
||||
## Alternative Considered and Rejected
|
||||
|
||||
### Fence-based Async Processing
|
||||
|
||||
**Why rejected**:
|
||||
1. **Current architecture doesn't benefit**: Timing thread doesn't overlap work
|
||||
2. **33.33ms budget sufficient**: No need for async optimization
|
||||
3. **Adds complexity**: External fence requires CUDA/D3D12 interop code
|
||||
4. **Doesn't fix root cause**: Timing thread still needs to wait
|
||||
|
||||
**When Fence would be beneficial**:
|
||||
- Variable framerate (60fps+)
|
||||
- Tight timing budget (< 16ms)
|
||||
- Truly parallel decode/render pipeline
|
||||
|
||||
**Our case**: Fixed 30fps, 33.33ms budget → synchronous is simpler and sufficient
|
||||
|
||||
---
|
||||
|
||||
## Testing Plan
|
||||
|
||||
### Phase 1 Testing (Timing Fix)
|
||||
|
||||
**Unit Tests**: None required (integration testing sufficient)
|
||||
|
||||
**Integration Tests**:
|
||||
1. **Smooth Playback Test**:
|
||||
- Load 30fps test video
|
||||
- Play for 30 seconds
|
||||
- Verify no frame drops
|
||||
- Verify smooth visual playback
|
||||
|
||||
2. **Frame Timing Test**:
|
||||
- Log actual frame intervals
|
||||
- Verify 33.33ms ± 2ms intervals
|
||||
- Check m_framesDropped counter (should be 0)
|
||||
|
||||
3. **Long Duration Test**:
|
||||
- Play 5-minute video
|
||||
- Monitor frame drops
|
||||
- Check for memory leaks
|
||||
|
||||
**Success Criteria**:
|
||||
- ✅ Zero frame drops over 5 minutes
|
||||
- ✅ Consistent 33.33ms frame intervals
|
||||
- ✅ Smooth visual playback (no jerking)
|
||||
|
||||
---
|
||||
|
||||
### Phase 2 Testing (Fence Removal)
|
||||
|
||||
**Additional Tests**:
|
||||
1. **Performance Regression Test**:
|
||||
- Measure decode time before/after
|
||||
- Verify < 20ms average decode time
|
||||
- Check CPU usage (should be same)
|
||||
|
||||
2. **Multi-format Test**:
|
||||
- Test with different resolutions (720p, 1080p, 4K)
|
||||
- Verify all formats work
|
||||
|
||||
3. **Decoder Compatibility Test**:
|
||||
- Test NVDEC (affected by change)
|
||||
- Test VPL (should be unaffected)
|
||||
- Test dav1d fallback (should be unaffected)
|
||||
|
||||
**Success Criteria**:
|
||||
- ✅ Same performance as before
|
||||
- ✅ No visual regression
|
||||
- ✅ All decoders work correctly
|
||||
|
||||
---
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
### Phase 1 Rollback
|
||||
**If timing fix causes issues**:
|
||||
1. Revert PlaybackController.cpp changes
|
||||
2. Revert PlaybackController.h changes
|
||||
3. Revert VideoPlayerControl2.xaml.cpp link
|
||||
|
||||
**Time**: < 5 minutes
|
||||
|
||||
### Phase 2 Rollback
|
||||
**If Fence removal causes issues**:
|
||||
1. Git revert to Phase 1 completion
|
||||
2. Keep timing fix, restore Fence code
|
||||
|
||||
**Time**: < 10 minutes
|
||||
|
||||
---
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
### Thread Safety Considerations
|
||||
|
||||
**FrameProcessor::IsProcessing()**:
|
||||
- Called from Timing Thread
|
||||
- Reads atomic bool `m_frameProcessing`
|
||||
- ✅ Thread-safe (atomic operation)
|
||||
|
||||
**PlaybackController::SetFrameProcessor()**:
|
||||
- Called from UI Thread during LoadVideo
|
||||
- Non-owning pointer (FrameProcessor lifetime > PlaybackController)
|
||||
- ✅ Safe (no concurrent access during init)
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
**CPU Blocking in cuStreamSynchronize()**:
|
||||
- Blocks for ~15ms
|
||||
- Timing thread sleeps for 33.33ms anyway
|
||||
- ✅ No performance impact (within budget)
|
||||
|
||||
**Wait Loop in TimingThreadLoop()**:
|
||||
- 1ms sleep per iteration
|
||||
- Max 100 iterations (100ms timeout)
|
||||
- Typical: 25ms wait = 25 iterations
|
||||
- ✅ Low CPU overhead
|
||||
|
||||
---
|
||||
|
||||
## Success Metrics
|
||||
|
||||
### Immediate (Phase 1)
|
||||
- [ ] Zero frame drops during playback
|
||||
- [ ] Smooth visual playback (user confirmed)
|
||||
- [ ] Consistent 30fps timing (logs verified)
|
||||
|
||||
### Follow-up (Phase 2)
|
||||
- [ ] Code complexity reduced (fence code removed)
|
||||
- [ ] Same or better performance
|
||||
- [ ] All decoders working (NVDEC, VPL, dav1d)
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
**Related Files**:
|
||||
- `vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/PlaybackController.cpp`
|
||||
- `vav2/platforms/windows/applications/vav2player/Vav2Player/src/Playback/FrameProcessor.cpp`
|
||||
- `vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp`
|
||||
- `vav2/platforms/windows/applications/vav2player/Vav2Player/src/Rendering/D3D12VideoRenderer.cpp`
|
||||
|
||||
**Previous Attempts**:
|
||||
- Fence implementation (2025-10-07): Did not fix jerky playback
|
||||
- Timing analysis: Identified root cause in PlaybackController
|
||||
|
||||
---
|
||||
|
||||
## Approval
|
||||
|
||||
**Approved by**: [User]
|
||||
**Date**: [Pending]
|
||||
**Implementation Start**: [Pending]
|
||||
@@ -49,7 +49,8 @@ namespace winrt::Vav2Player::implementation
|
||||
|
||||
VideoPlayerControl2::~VideoPlayerControl2()
|
||||
{
|
||||
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2", L"Destructor called");
|
||||
// DO NOT LOG in destructor - DispatcherQueue may already be shut down
|
||||
// This causes crashes when LogMessagePage tries to access UI thread
|
||||
|
||||
// Stop playback and cleanup
|
||||
if (m_playbackController) {
|
||||
@@ -319,8 +320,9 @@ namespace winrt::Vav2Player::implementation
|
||||
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2",
|
||||
L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight));
|
||||
|
||||
// Prepare video texture before decoding (via FrameProcessor)
|
||||
// Link FrameProcessor to PlaybackController for timing synchronization
|
||||
if (m_frameProcessor) {
|
||||
m_playbackController->SetFrameProcessor(m_frameProcessor.get());
|
||||
m_frameProcessor->PrepareVideoTexture(videoWidth, videoHeight);
|
||||
}
|
||||
|
||||
|
||||
@@ -38,6 +38,8 @@ void FrameProcessor::PrepareVideoTexture(uint32_t width, uint32_t height)
|
||||
bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
std::function<void(bool success)> onComplete)
|
||||
{
|
||||
auto processStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (!player || !m_renderer || !m_dispatcherQueue) {
|
||||
LOGF_ERROR("[FrameProcessor] Invalid state: missing player/renderer/dispatcherQueue");
|
||||
return false;
|
||||
@@ -47,7 +49,7 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
bool expected = false;
|
||||
if (!m_frameProcessing.compare_exchange_strong(expected, true)) {
|
||||
m_framesDropped++;
|
||||
LOGF_INFO("[FrameProcessor] Frame dropped (#%llu) - previous frame still processing (decoded: %llu)",
|
||||
LOGF_ERROR("[FrameProcessor] FRAME DROPPED (#%llu) - previous frame still processing (decoded: %llu)",
|
||||
m_framesDropped.load(), m_framesDecoded.load());
|
||||
return false;
|
||||
}
|
||||
@@ -67,6 +69,7 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
LOGF_INFO("[FrameProcessor] Next RGBA texture acquired (triple buffering)");
|
||||
|
||||
// Decode frame to D3D12 surface (blocking)
|
||||
auto decodeStart = std::chrono::high_resolution_clock::now();
|
||||
LOGF_INFO("[FrameProcessor] Starting vavcore_decode_to_surface (BLOCKING)...");
|
||||
VavCoreVideoFrame vavFrame = {};
|
||||
VavCoreResult result = vavcore_decode_to_surface(
|
||||
@@ -75,7 +78,9 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
rgbaTexture,
|
||||
&vavFrame
|
||||
);
|
||||
LOGF_INFO("[FrameProcessor] vavcore_decode_to_surface COMPLETED");
|
||||
auto decodeEnd = std::chrono::high_resolution_clock::now();
|
||||
double decodeTime = std::chrono::duration<double, std::milli>(decodeEnd - decodeStart).count();
|
||||
LOGF_INFO("[FrameProcessor] vavcore_decode_to_surface COMPLETED in %.2f ms", decodeTime);
|
||||
|
||||
if (result != VAVCORE_SUCCESS) {
|
||||
if (result == VAVCORE_END_OF_STREAM) {
|
||||
@@ -110,36 +115,52 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
// Enqueue render on UI thread
|
||||
LOGF_INFO("[FrameProcessor] Attempting to enqueue render...");
|
||||
|
||||
bool enqueued = m_dispatcherQueue.TryEnqueue([this, vavFrame, onComplete]() {
|
||||
LOGF_INFO("[FrameProcessor] *** UI THREAD CALLBACK STARTED ***");
|
||||
HRESULT hr = m_renderer->RenderVideoFrame(vavFrame);
|
||||
bool enqueued = m_dispatcherQueue.TryEnqueue([this, vavFrame, onComplete, player, processStart]() {
|
||||
auto uiCallbackStart = std::chrono::high_resolution_clock::now();
|
||||
double queueDelay = std::chrono::duration<double, std::milli>(uiCallbackStart - processStart).count();
|
||||
LOGF_INFO("[FrameProcessor] UI THREAD CALLBACK STARTED (queue delay: %.2f ms)", queueDelay);
|
||||
|
||||
auto renderStart = std::chrono::high_resolution_clock::now();
|
||||
HRESULT hr = m_renderer->RenderVideoFrame(vavFrame, player);
|
||||
auto renderEnd = std::chrono::high_resolution_clock::now();
|
||||
double renderTime = std::chrono::duration<double, std::milli>(renderEnd - renderStart).count();
|
||||
|
||||
bool renderSuccess = SUCCEEDED(hr);
|
||||
|
||||
if (!renderSuccess) {
|
||||
m_renderErrors++;
|
||||
LOGF_ERROR("[FrameProcessor] Render error: HRESULT = 0x%08X", hr);
|
||||
LOGF_ERROR("[FrameProcessor] Render error: HRESULT = 0x%08X (took %.2f ms)", hr, renderTime);
|
||||
} else {
|
||||
LOGF_INFO("[FrameProcessor] Render succeeded");
|
||||
LOGF_INFO("[FrameProcessor] Render succeeded (%.2f ms)", renderTime);
|
||||
}
|
||||
|
||||
// Present to screen
|
||||
if (renderSuccess) {
|
||||
auto presentStart = std::chrono::high_resolution_clock::now();
|
||||
hr = m_renderer->Present();
|
||||
auto presentEnd = std::chrono::high_resolution_clock::now();
|
||||
double presentTime = std::chrono::duration<double, std::milli>(presentEnd - presentStart).count();
|
||||
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[FrameProcessor] Present error: HRESULT = 0x%08X", hr);
|
||||
LOGF_ERROR("[FrameProcessor] Present error: HRESULT = 0x%08X (took %.2f ms)", hr, presentTime);
|
||||
renderSuccess = false;
|
||||
} else {
|
||||
LOGF_INFO("[FrameProcessor] Present succeeded (%.2f ms)", presentTime);
|
||||
}
|
||||
}
|
||||
|
||||
// Mark frame processing complete
|
||||
LOGF_INFO("[FrameProcessor] CLEARING m_frameProcessing flag");
|
||||
auto totalEnd = std::chrono::high_resolution_clock::now();
|
||||
double totalTime = std::chrono::duration<double, std::milli>(totalEnd - processStart).count();
|
||||
|
||||
LOGF_INFO("[FrameProcessor] CLEARING m_frameProcessing flag (TOTAL TIME: %.2f ms)", totalTime);
|
||||
m_frameProcessing.store(false);
|
||||
LOGF_INFO("[FrameProcessor] Flag cleared - ready for next frame");
|
||||
|
||||
if (onComplete) {
|
||||
onComplete(renderSuccess);
|
||||
}
|
||||
LOGF_INFO("[FrameProcessor] *** UI THREAD CALLBACK ENDED ***");
|
||||
LOGF_INFO("[FrameProcessor] UI THREAD CALLBACK ENDED");
|
||||
});
|
||||
|
||||
if (!enqueued) {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "pch.h"
|
||||
#include "PlaybackController.h"
|
||||
#include "FrameProcessor.h"
|
||||
#include "../Logger/SimpleLogger.h"
|
||||
#include <chrono>
|
||||
#include <sstream>
|
||||
@@ -250,13 +251,41 @@ void PlaybackController::TimingThreadLoop()
|
||||
{
|
||||
double baseIntervalMs = 1000.0 / m_frameRate;
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
auto lastFrameTime = start;
|
||||
|
||||
LOGF_INFO("[PlaybackController] Timing thread loop started (target: %.2f fps, %.2f ms per frame)",
|
||||
m_frameRate, baseIntervalMs);
|
||||
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||
double sinceLast = std::chrono::duration<double, std::milli>(frameStart - lastFrameTime).count();
|
||||
|
||||
LOGF_INFO("[PlaybackController] ==== FRAME %llu START (%.2f ms since last) ====",
|
||||
m_currentFrame, sinceLast);
|
||||
|
||||
// Invoke frame-ready callback
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback();
|
||||
}
|
||||
|
||||
// CRITICAL FIX: Wait for frame processing completion (max 100ms timeout)
|
||||
// This prevents frame drops when decode/render takes longer than expected
|
||||
auto waitStart = std::chrono::high_resolution_clock::now();
|
||||
int waitCount = 0;
|
||||
while (IsFrameProcessing() && waitCount < 100) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
waitCount++;
|
||||
}
|
||||
|
||||
auto waitEnd = std::chrono::high_resolution_clock::now();
|
||||
double waitTime = std::chrono::duration<double, std::milli>(waitEnd - waitStart).count();
|
||||
|
||||
if (waitCount >= 100) {
|
||||
LOGF_ERROR("[PlaybackController] Frame processing TIMEOUT after 100ms");
|
||||
} else if (waitTime > 0.5) {
|
||||
LOGF_INFO("[PlaybackController] Waited %.2f ms for frame processing completion", waitTime);
|
||||
}
|
||||
|
||||
// Update current time
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
@@ -268,12 +297,28 @@ void PlaybackController::TimingThreadLoop()
|
||||
// High-precision sleep until next frame
|
||||
auto nextFrame = start + std::chrono::microseconds(
|
||||
static_cast<long long>(targetIntervalMs * 1000 * m_currentFrame));
|
||||
|
||||
auto beforeSleep = std::chrono::high_resolution_clock::now();
|
||||
std::this_thread::sleep_until(nextFrame);
|
||||
auto afterSleep = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double sleepTime = std::chrono::duration<double, std::milli>(afterSleep - beforeSleep).count();
|
||||
double totalFrameTime = std::chrono::duration<double, std::milli>(afterSleep - frameStart).count();
|
||||
|
||||
LOGF_INFO("[PlaybackController] Frame %llu complete (total: %.2f ms, sleep: %.2f ms)",
|
||||
m_currentFrame - 1, totalFrameTime, sleepTime);
|
||||
|
||||
lastFrameTime = frameStart;
|
||||
}
|
||||
|
||||
LOGF_INFO("[PlaybackController] Timing thread loop exited");
|
||||
}
|
||||
|
||||
bool PlaybackController::IsFrameProcessing() const
|
||||
{
|
||||
return m_frameProcessor && m_frameProcessor->IsProcessing();
|
||||
}
|
||||
|
||||
void PlaybackController::SetPlaybackSpeed(double speed)
|
||||
{
|
||||
// Clamp to reasonable range: 0.25x to 4.0x
|
||||
|
||||
@@ -9,6 +9,9 @@
|
||||
|
||||
namespace Vav2Player {
|
||||
|
||||
// Forward declaration
|
||||
class FrameProcessor;
|
||||
|
||||
// Manages VavCore player lifecycle and playback timing
|
||||
// Responsibilities:
|
||||
// - VavCore player creation/destruction
|
||||
@@ -58,6 +61,10 @@ public:
|
||||
// D3D device configuration (for GPU acceleration)
|
||||
void SetD3DDevice(void* d3d_device, VavCoreSurfaceType type);
|
||||
|
||||
// Frame processor link (for timing synchronization)
|
||||
void SetFrameProcessor(FrameProcessor* processor) { m_frameProcessor = processor; }
|
||||
bool IsFrameProcessing() const;
|
||||
|
||||
private:
|
||||
// VavCore player instance
|
||||
VavCorePlayer* m_vavCorePlayer = nullptr;
|
||||
@@ -89,6 +96,9 @@ private:
|
||||
std::unique_ptr<std::thread> m_timingThread;
|
||||
std::function<void()> m_frameReadyCallback;
|
||||
|
||||
// Frame processor reference (non-owning, for synchronization)
|
||||
FrameProcessor* m_frameProcessor = nullptr;
|
||||
|
||||
// Helper methods
|
||||
bool InitializeVavCore();
|
||||
void CleanupVavCore();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "pch.h"
|
||||
#include "D3D12VideoRenderer.h"
|
||||
#include "d3dx12.h"
|
||||
#include "../Logger/SimpleLogger.h"
|
||||
#include <microsoft.ui.xaml.media.dxinterop.h>
|
||||
|
||||
namespace Vav2Player {
|
||||
@@ -99,7 +100,7 @@ void D3D12VideoRenderer::Shutdown() {
|
||||
m_initialized = false;
|
||||
}
|
||||
|
||||
HRESULT D3D12VideoRenderer::RenderVideoFrame(const VavCoreVideoFrame& frame) {
|
||||
HRESULT D3D12VideoRenderer::RenderVideoFrame(const VavCoreVideoFrame& frame, VavCorePlayer* player) {
|
||||
if (!m_initialized) {
|
||||
return E_NOT_VALID_STATE;
|
||||
}
|
||||
@@ -122,6 +123,20 @@ HRESULT D3D12VideoRenderer::RenderVideoFrame(const VavCoreVideoFrame& frame) {
|
||||
return hr;
|
||||
}
|
||||
|
||||
// IMPORTANT: Wait for CUDA decode to complete before D3D12 renders
|
||||
// The frame.surface_data.d3d12.fence_value indicates when CUDA finished writing to the texture
|
||||
if (player && frame.surface_data.d3d12.fence_value > 0) {
|
||||
// Get the shared D3D12 fence from VavCore
|
||||
void* syncFencePtr = vavcore_get_sync_fence(player);
|
||||
if (syncFencePtr) {
|
||||
ID3D12Fence* cudaFence = static_cast<ID3D12Fence*>(syncFencePtr);
|
||||
// Make D3D12 command queue wait for CUDA to finish (ASYNC GPU wait, no CPU blocking!)
|
||||
m_commandQueue->Wait(cudaFence, frame.surface_data.d3d12.fence_value);
|
||||
LOGF_DEBUG("[D3D12VideoRenderer] D3D12 queue waiting for CUDA fence value=%llu (async GPU wait)",
|
||||
frame.surface_data.d3d12.fence_value);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for previous frame to complete
|
||||
WaitForFrameCompletion(m_frameIndex);
|
||||
|
||||
@@ -167,10 +182,6 @@ HRESULT D3D12VideoRenderer::RenderVideoFrame(const VavCoreVideoFrame& frame) {
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
bool D3D12VideoRenderer::TryRenderFrame(const VavCoreVideoFrame& frame) {
|
||||
return SUCCEEDED(RenderVideoFrame(frame));
|
||||
}
|
||||
|
||||
HRESULT D3D12VideoRenderer::Present() {
|
||||
if (!m_swapChain) {
|
||||
return E_NOT_VALID_STATE;
|
||||
|
||||
@@ -35,8 +35,7 @@ public:
|
||||
void Shutdown() override;
|
||||
bool IsInitialized() const override { return m_initialized; }
|
||||
|
||||
HRESULT RenderVideoFrame(const VavCoreVideoFrame& frame) override;
|
||||
bool TryRenderFrame(const VavCoreVideoFrame& frame) override;
|
||||
HRESULT RenderVideoFrame(const VavCoreVideoFrame& frame, VavCorePlayer* player) override;
|
||||
HRESULT Present() override;
|
||||
|
||||
HRESULT Resize(uint32_t width, uint32_t height) override;
|
||||
|
||||
@@ -20,9 +20,8 @@ public:
|
||||
virtual void Shutdown() = 0;
|
||||
virtual bool IsInitialized() const = 0;
|
||||
|
||||
// Video rendering
|
||||
virtual HRESULT RenderVideoFrame(const VavCoreVideoFrame& frame) = 0;
|
||||
virtual bool TryRenderFrame(const VavCoreVideoFrame& frame) = 0; // Returns true if successful
|
||||
// Video rendering (requires player for Fence synchronization)
|
||||
virtual HRESULT RenderVideoFrame(const VavCoreVideoFrame& frame, VavCorePlayer* player) = 0;
|
||||
virtual HRESULT Present() = 0;
|
||||
|
||||
// Size management
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "pch.h"
|
||||
#include "RGBASurfaceBackend.h"
|
||||
#include "../Logger/SimpleLogger.h"
|
||||
#include <stdexcept>
|
||||
|
||||
namespace Vav2Player {
|
||||
@@ -131,7 +132,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
|
||||
ID3D12Resource* backBuffer,
|
||||
ID3D12GraphicsCommandList* commandList)
|
||||
{
|
||||
if (!m_initialized || !m_rgbaTextures[m_currentTextureIndex]) {
|
||||
if (!m_initialized) {
|
||||
return E_NOT_VALID_STATE;
|
||||
}
|
||||
|
||||
@@ -139,11 +140,34 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
|
||||
return E_INVALIDARG;
|
||||
}
|
||||
|
||||
// Get the actual texture that was decoded into (from frame metadata)
|
||||
ID3D12Resource* frameTexture = static_cast<ID3D12Resource*>(frame.surface_data.d3d12.d3d12_resource);
|
||||
if (!frameTexture) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: frameTexture is NULL!");
|
||||
return E_INVALIDARG;
|
||||
}
|
||||
|
||||
// Find which texture index this corresponds to (for correct SRV descriptor)
|
||||
int textureIndex = -1;
|
||||
for (int i = 0; i < BUFFER_COUNT; i++) {
|
||||
if (m_rgbaTextures[i].Get() == frameTexture) {
|
||||
textureIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (textureIndex == -1) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] RenderToBackBuffer: texture=%p NOT FOUND in buffer pool!", frameTexture);
|
||||
return E_INVALIDARG; // Texture not found in our buffer pool
|
||||
}
|
||||
|
||||
LOGF_INFO("[RGBASurfaceBackend] RenderToBackBuffer: using texture index=%d, ptr=%p", textureIndex, frameTexture);
|
||||
|
||||
// Transition RGBA texture to shader resource
|
||||
D3D12_RESOURCE_BARRIER barrierToSRV = {};
|
||||
barrierToSRV.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||
barrierToSRV.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||
barrierToSRV.Transition.pResource = m_rgbaTextures[m_currentTextureIndex].Get();
|
||||
barrierToSRV.Transition.pResource = frameTexture;
|
||||
barrierToSRV.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON;
|
||||
barrierToSRV.Transition.StateAfter = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
|
||||
barrierToSRV.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||
@@ -188,9 +212,9 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
|
||||
ID3D12DescriptorHeap* heaps[] = { m_srvHeap.Get() };
|
||||
commandList->SetDescriptorHeaps(1, heaps);
|
||||
|
||||
// Use descriptor for current texture index
|
||||
// Use descriptor for the frame's texture index
|
||||
UINT descriptorSize = m_device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
|
||||
CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetGPUDescriptorHandleForHeapStart(), m_currentTextureIndex, descriptorSize);
|
||||
CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(m_srvHeap->GetGPUDescriptorHandleForHeapStart(), textureIndex, descriptorSize);
|
||||
commandList->SetGraphicsRootDescriptorTable(0, srvHandle);
|
||||
commandList->SetGraphicsRootConstantBufferView(1, m_constantBuffer->GetGPUVirtualAddress());
|
||||
|
||||
@@ -232,7 +256,7 @@ HRESULT RGBASurfaceBackend::RenderToBackBuffer(
|
||||
D3D12_RESOURCE_BARRIER barrierToCommon = {};
|
||||
barrierToCommon.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||
barrierToCommon.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||
barrierToCommon.Transition.pResource = m_rgbaTextures[m_currentTextureIndex].Get();
|
||||
barrierToCommon.Transition.pResource = frameTexture;
|
||||
barrierToCommon.Transition.StateBefore = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
|
||||
barrierToCommon.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON;
|
||||
barrierToCommon.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||
@@ -517,7 +541,12 @@ HRESULT RGBASurfaceBackend::UpdateConstantBuffer() {
|
||||
|
||||
ID3D12Resource* RGBASurfaceBackend::GetNextVideoTexture() {
|
||||
// Rotate to next buffer index
|
||||
int prevIndex = m_currentTextureIndex;
|
||||
m_currentTextureIndex = (m_currentTextureIndex + 1) % BUFFER_COUNT;
|
||||
|
||||
LOGF_INFO("[RGBASurfaceBackend] GetNextVideoTexture: %d -> %d, texture=%p",
|
||||
prevIndex, m_currentTextureIndex, m_rgbaTextures[m_currentTextureIndex].Get());
|
||||
|
||||
return m_rgbaTextures[m_currentTextureIndex].Get();
|
||||
}
|
||||
|
||||
|
||||
@@ -129,6 +129,7 @@ typedef struct {
|
||||
void* d3d12_resource;
|
||||
void* d3d12_device;
|
||||
uint32_t subresource_index;
|
||||
uint64_t fence_value; // D3D12 fence value for async synchronization
|
||||
} d3d12;
|
||||
|
||||
struct {
|
||||
|
||||
@@ -12,6 +12,8 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
|
||||
, m_cache(std::make_unique<ExternalMemoryCache>(device, cuda_context))
|
||||
, m_surfaceWriteModule(nullptr)
|
||||
, m_surfaceWriteKernel(nullptr)
|
||||
, m_d3d12Fence(nullptr)
|
||||
, m_cudaSemaphore(nullptr)
|
||||
{
|
||||
// Load surface write kernel immediately since CUDA context is already active
|
||||
if (!LoadSurfaceWriteKernel()) {
|
||||
@@ -23,6 +25,12 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
|
||||
|
||||
D3D12SurfaceHandler::~D3D12SurfaceHandler()
|
||||
{
|
||||
// Destroy CUDA external semaphore
|
||||
if (m_cudaSemaphore) {
|
||||
cuDestroyExternalSemaphore(m_cudaSemaphore);
|
||||
m_cudaSemaphore = nullptr;
|
||||
}
|
||||
|
||||
// Unload surface write kernel
|
||||
if (m_surfaceWriteModule) {
|
||||
cuModuleUnload(m_surfaceWriteModule);
|
||||
@@ -181,10 +189,77 @@ bool D3D12SurfaceHandler::CopySeparateNV12Frame(CUdeviceptr src_frame,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::SignalD3D12Fence(uint64_t fence_value)
|
||||
void D3D12SurfaceHandler::SetD3D12Fence(void* fence)
|
||||
{
|
||||
// TODO: Implement fence signaling
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Fence signaling not implemented");
|
||||
if (!fence) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Null fence provided");
|
||||
return;
|
||||
}
|
||||
|
||||
m_d3d12Fence = fence;
|
||||
|
||||
// Import D3D12 Fence as CUDA external semaphore
|
||||
// This requires the fence to have D3D12_FENCE_FLAG_SHARED set during creation
|
||||
ID3D12Fence* d3d12Fence = static_cast<ID3D12Fence*>(fence);
|
||||
|
||||
// Get shared handle from D3D12 Fence
|
||||
HANDLE sharedHandle = nullptr;
|
||||
HRESULT hr = m_device->CreateSharedHandle(
|
||||
d3d12Fence,
|
||||
nullptr,
|
||||
GENERIC_ALL,
|
||||
nullptr,
|
||||
&sharedHandle
|
||||
);
|
||||
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to create shared handle for fence: 0x%08X", hr);
|
||||
return;
|
||||
}
|
||||
|
||||
// Import fence as CUDA external semaphore
|
||||
CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC semDesc = {};
|
||||
semDesc.type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE;
|
||||
semDesc.handle.win32.handle = sharedHandle;
|
||||
semDesc.flags = 0;
|
||||
|
||||
CUresult result = cuImportExternalSemaphore(&m_cudaSemaphore, &semDesc);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
const char* errorName = nullptr;
|
||||
cuGetErrorName(result, &errorName);
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to import D3D12 fence as CUDA semaphore: %s",
|
||||
errorName ? errorName : "unknown");
|
||||
CloseHandle(sharedHandle);
|
||||
return;
|
||||
}
|
||||
|
||||
// NOTE: Do NOT close sharedHandle - it's owned by CUDA after import
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] D3D12 Fence imported as CUDA semaphore: %p", m_cudaSemaphore);
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::SignalD3D12Fence(uint64_t fence_value, cudaStream_t stream)
|
||||
{
|
||||
if (!m_cudaSemaphore) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] CUDA semaphore not initialized - call SetD3D12Fence first");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Signal D3D12 fence from CUDA stream (async GPU operation)
|
||||
CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = {};
|
||||
signalParams.params.fence.value = fence_value;
|
||||
signalParams.flags = 0;
|
||||
|
||||
CUresult result = cuSignalExternalSemaphoresAsync(&m_cudaSemaphore, &signalParams, 1, (CUstream)stream);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
const char* errorName = nullptr;
|
||||
cuGetErrorName(result, &errorName);
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to signal D3D12 fence: %s",
|
||||
errorName ? errorName : "unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Signaled D3D12 fence with value %llu (async)", fence_value);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -266,7 +341,8 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
cudaStream_t stream)
|
||||
cudaStream_t stream,
|
||||
uint64_t fence_value)
|
||||
{
|
||||
if (!dst_texture) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Null destination texture");
|
||||
@@ -285,16 +361,23 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba,
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame via surface: width=%u, height=%u, surface=0x%llX",
|
||||
width, height, (unsigned long long)dst_surface);
|
||||
|
||||
// Use surface write kernel to handle tiled layout automatically
|
||||
// Use surface write kernel to handle tiled layout automatically (ASYNC)
|
||||
if (!CopyRGBAToSurfaceViaKernel(dst_surface, src_rgba, width, height, src_pitch, stream)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to copy RGBA via surface kernel");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture via surface (%ux%u)", width, height);
|
||||
// Signal D3D12 fence from CUDA stream (async GPU operation)
|
||||
// This allows D3D12 to know when CUDA work is complete without CPU blocking
|
||||
if (!SignalD3D12Fence(fence_value, stream)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to signal D3D12 fence");
|
||||
return false;
|
||||
}
|
||||
|
||||
// NOTE: Debug sampling removed because surface objects don't expose linear pointers
|
||||
// To verify output, save BMP files in RedSurfaceNVDECTest
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copy submitted (async) - fence will be signaled at %llu", fence_value);
|
||||
|
||||
// NOTE: Do NOT synchronize stream here - that would block!
|
||||
// D3D12 will wait on the fence value instead
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -359,7 +442,7 @@ bool D3D12SurfaceHandler::CopyRGBAToSurfaceViaKernel(CUsurfObject dst_surface,
|
||||
&src_pitch
|
||||
};
|
||||
|
||||
// Launch kernel using Driver API
|
||||
// Launch kernel using Driver API (ASYNC - no synchronization!)
|
||||
CUresult result = cuLaunchKernel(
|
||||
m_surfaceWriteKernel,
|
||||
grid_x, grid_y, 1, // Grid dimensions
|
||||
@@ -377,16 +460,9 @@ bool D3D12SurfaceHandler::CopyRGBAToSurfaceViaKernel(CUsurfObject dst_surface,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Synchronize to ensure kernel completes
|
||||
result = cuStreamSynchronize((CUstream)stream);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
const char* errorName = nullptr;
|
||||
cuGetErrorName(result, &errorName);
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Kernel synchronization failed: %s", errorName ? errorName : "unknown");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel completed successfully");
|
||||
// DO NOT synchronize stream here! This would block the CPU.
|
||||
// Instead, we'll signal the D3D12 fence which will be waited on by D3D12 command queue
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel launched (async)");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,11 @@ public:
|
||||
D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_context);
|
||||
~D3D12SurfaceHandler();
|
||||
|
||||
// Set D3D12 Fence for CUDA-D3D12 synchronization
|
||||
// Must be called before using async copy operations
|
||||
// fence: ID3D12Fence* created by D3D12 device
|
||||
void SetD3D12Fence(void* fence);
|
||||
|
||||
// Copy NV12 frame from CUDA to D3D12 texture (legacy combined NV12 texture)
|
||||
// Returns true on success
|
||||
bool CopyNV12Frame(CUdeviceptr src_frame,
|
||||
@@ -36,19 +41,21 @@ public:
|
||||
uint32_t width,
|
||||
uint32_t height);
|
||||
|
||||
// Copy RGBA frame from CUDA to D3D12 texture
|
||||
// Copy RGBA frame from CUDA to D3D12 texture (ASYNC with Fence signaling)
|
||||
// RGBA Format: DXGI_FORMAT_R8G8B8A8_UNORM (width x height)
|
||||
// src_rgba: CUDA RGBA buffer (4 bytes per pixel, interleaved, alpha=255)
|
||||
// dst_texture: D3D12 RGBA texture (ROW_MAJOR layout)
|
||||
// fence_value: D3D12 fence value to signal when CUDA work completes
|
||||
// Returns true on success
|
||||
bool CopyRGBAFrame(CUdeviceptr src_rgba,
|
||||
ID3D12Resource* dst_texture,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
cudaStream_t stream);
|
||||
cudaStream_t stream,
|
||||
uint64_t fence_value);
|
||||
|
||||
// Signal D3D12 fence from CUDA stream (not implemented yet)
|
||||
bool SignalD3D12Fence(uint64_t fence_value);
|
||||
// Signal D3D12 fence from CUDA stream (async GPU operation)
|
||||
bool SignalD3D12Fence(uint64_t fence_value, cudaStream_t stream);
|
||||
|
||||
// Release D3D12 resource from external memory cache
|
||||
void ReleaseD3D12Resource(ID3D12Resource* resource);
|
||||
@@ -84,6 +91,10 @@ private:
|
||||
// Surface write kernel (Driver API)
|
||||
CUmodule m_surfaceWriteModule;
|
||||
CUfunction m_surfaceWriteKernel;
|
||||
|
||||
// D3D12 Fence for CUDA-D3D12 synchronization
|
||||
void* m_d3d12Fence; // ID3D12Fence*
|
||||
CUexternalSemaphore m_cudaSemaphore; // CUDA external semaphore for fence
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
|
||||
@@ -720,6 +720,12 @@ bool NVDECAV1Decoder::InitializeCUDA() {
|
||||
m_cuContext
|
||||
);
|
||||
|
||||
// Set D3D12 Fence for async synchronization
|
||||
if (m_d3d12Fence) {
|
||||
m_d3d12Handler->SetD3D12Fence(m_d3d12Fence);
|
||||
LOGF_DEBUG("[InitializeCUDA] D3D12 Fence set on D3D12SurfaceHandler");
|
||||
}
|
||||
|
||||
m_rgbaConverter = std::make_unique<NV12ToRGBAConverter>();
|
||||
|
||||
LOGF_DEBUG("[InitializeCUDA] D3D12SurfaceHandler and NV12ToRGBAConverter instances created");
|
||||
@@ -1477,13 +1483,18 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
return false;
|
||||
}
|
||||
|
||||
// Copy RGBA to D3D12 texture
|
||||
LOGF_DEBUG("[DecodeToSurface] Calling CopyRGBAFrame with m_width=%u, m_height=%u", m_width, m_height);
|
||||
// Copy RGBA to D3D12 texture (ASYNC with fence signaling)
|
||||
// Increment fence value for this frame
|
||||
m_fenceValue++;
|
||||
LOGF_DEBUG("[DecodeToSurface] Calling CopyRGBAFrame with m_width=%u, m_height=%u, fence_value=%llu",
|
||||
m_width, m_height, m_fenceValue);
|
||||
|
||||
copySuccess = m_d3d12Handler->CopyRGBAFrame(
|
||||
rgbaPtr,
|
||||
d3d12Resource,
|
||||
m_width, m_height,
|
||||
m_stream
|
||||
m_stream,
|
||||
m_fenceValue // Signal fence when CUDA work completes
|
||||
);
|
||||
|
||||
output_frame.color_space = ColorSpace::RGB32;
|
||||
@@ -1500,6 +1511,9 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
output_frame.color_space = ColorSpace::YUV420P;
|
||||
}
|
||||
|
||||
// Store fence value in output frame for async synchronization
|
||||
output_frame.sync_fence_value = m_fenceValue;
|
||||
|
||||
// Unmap frame
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
|
||||
@@ -1530,11 +1544,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
return false;
|
||||
}
|
||||
|
||||
// Signal D3D12 fence
|
||||
m_d3d12Handler->SignalD3D12Fence(++m_fenceValue);
|
||||
// CopyRGBAFrame already signaled the fence, so just store the value
|
||||
// No need to signal again - that would cause fence value mismatch!
|
||||
output_frame.sync_fence_value = m_fenceValue;
|
||||
|
||||
LOGF_DEBUG("[DecodeToSurface] D3D12 frame processing complete");
|
||||
LOGF_DEBUG("[DecodeToSurface] D3D12 frame processing complete, fence_value=%llu", m_fenceValue);
|
||||
|
||||
// Fill output frame metadata (color_space already set above)
|
||||
output_frame.width = m_width;
|
||||
|
||||
Reference in New Issue
Block a user