WIP
This commit is contained in:
627
vav2/docs/working/Vav2Player_Stutter_Fix_Design.md
Normal file
627
vav2/docs/working/Vav2Player_Stutter_Fix_Design.md
Normal file
@@ -0,0 +1,627 @@
|
||||
# Vav2Player Stutter Fix Design
|
||||
|
||||
**Date**: 2025-10-08
|
||||
**Status**: In Progress
|
||||
**Priority**: Critical
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Vav2Player exhibits persistent stuttering ("통통 튀는 현상", i.e. bouncy, jumping motion) during 30fps AV1 video playback despite previous B-frame reordering fixes. Analysis reveals three critical synchronization and timing issues in the NVDEC → Triple Buffering → Staging Texture → Renderer pipeline.
|
||||
|
||||
## Root Cause Analysis
|
||||
|
||||
### Problem 1: GPU Copy Race Condition
|
||||
|
||||
**Issue**: Asynchronous GPU copy to staging texture without completion wait
|
||||
|
||||
**Current Flow**:
|
||||
```
|
||||
NVDEC Decode → Backend Texture (m_rgbaTextures[0,1,2])
|
||||
↓
|
||||
GPU CopyResource (async, no wait)
|
||||
↓
|
||||
Staging Texture (m_stagingTexture)
|
||||
↓
|
||||
Renderer reads (60Hz Present)
|
||||
```
|
||||
|
||||
**Race Condition**:
|
||||
- `FrameProcessor.cpp:109` calls `CopyToStagingTexture()` and returns immediately
|
||||
- GPU copy executes asynchronously on command queue
|
||||
- Next frame may overwrite staging texture before GPU copy completes
|
||||
- Renderer (`RGBASurfaceBackend.cpp:200-206`) reads the staging texture while the copy is still in flight
|
||||
|
||||
**Evidence**:
|
||||
```cpp
|
||||
// FrameProcessor.cpp:106-114
|
||||
if (result == VAVCORE_SUCCESS) {
|
||||
auto backend = m_renderer->GetRGBASurfaceBackend();
|
||||
if (backend) {
|
||||
HRESULT hr = backend->CopyToStagingTexture(rgbaTexture);
|
||||
// ❌ NO WAIT HERE - returns immediately
|
||||
// ❌ GPU copy may not be complete
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Symptoms**:
|
||||
- Flickering/tearing during playback
|
||||
- Inconsistent frame presentation
|
||||
- Visual artifacts (partial frame updates)
|
||||
|
||||
---
|
||||
|
||||
### Problem 2: NVDEC Decode Completion Not Verified
|
||||
|
||||
**Issue**: A polling thread exists, but `DecodeToSurface()` does not wait for decode completion before returning
|
||||
|
||||
**Current Implementation**:
|
||||
- `NVDECAV1Decoder.cpp` has `PollingThreadFunc()` for `cuvidGetDecodeStatus()`
|
||||
- `DecodeToSurface()` only waits for FIFO ordering (submission queue)
|
||||
- **No wait for `slot.is_ready` flag** (set by polling thread)
|
||||
|
||||
**Evidence**:
|
||||
```cpp
|
||||
// NVDECAV1Decoder.cpp - DecodeToSurface (approximate line ~1400)
|
||||
// FIFO wait only
|
||||
while (m_returnCounter.load() != submission_id) {
|
||||
std::unique_lock<std::mutex> fifo_lock(m_fifoWaitMutex);
|
||||
m_fifoWaitCV.wait_for(fifo_lock, std::chrono::milliseconds(100), ...);
|
||||
}
|
||||
|
||||
// ❌ MISSING: Wait for slot.is_ready
|
||||
// ❌ MISSING: cuvidGetDecodeStatus() completion check
|
||||
// Function returns immediately after FIFO order satisfied
|
||||
```
|
||||
|
||||
**Impact**:
|
||||
- Backend texture may contain incomplete decoded data
|
||||
- GPU copy operates on partial decode results
|
||||
- Frame quality inconsistency
|
||||
|
||||
---
|
||||
|
||||
### Problem 3: Playback Timing Irregularity
|
||||
|
||||
**Issue**: Sleep-first strategy causes cumulative timing jitter
|
||||
|
||||
**Current Implementation** (`PlaybackController.cpp:354-371`):
|
||||
```cpp
|
||||
// Sleep FIRST (fixed duration)
|
||||
std::this_thread::sleep_for(targetIntervalMs);
|
||||
|
||||
// Then invoke callback (variable duration)
|
||||
m_frameReadyCallback(); // Blocking: 6-20ms depending on frame type
|
||||
```
|
||||
|
||||
**Timing Analysis**:
|
||||
| Frame Type | Sleep | Callback | Total | Target | Error |
|
||||
|------------|-------|----------|-------|--------|-------|
|
||||
| Display-only | 33ms | 6ms | 39ms | 33.33ms | +5.67ms |
|
||||
| Normal decode | 33ms | 20ms | 53ms | 33.33ms | +19.67ms |
|
||||
|
||||
**B-frame Pattern** (every 3rd frame is Display-only):
|
||||
```
|
||||
Frame 0: Normal (53ms total) → 19.67ms late
|
||||
Frame 1: Normal (53ms total) → 19.67ms late
|
||||
Frame 2: Display (39ms total) → 5.67ms late
|
||||
Frame 3: Normal (53ms total) → 19.67ms late
|
||||
...
|
||||
```
|
||||
|
||||
**Compounding Effects**:
|
||||
- Combined with VSync Present(1,0): frames shown for 1 or 2 VSync cycles (16.67ms or 33.33ms)
|
||||
- Irregular display duration causes perceived "jumping" motion
|
||||
- User perception: "통통 튀는 현상" (bouncy/stuttering playback)
|
||||
|
||||
---
|
||||
|
||||
## Solution Design
|
||||
|
||||
### Solution 1: GPU Copy Completion Synchronization
|
||||
|
||||
**Objective**: Ensure GPU copy completes before proceeding to rendering
|
||||
|
||||
**Implementation**:
|
||||
|
||||
#### 1.1 Add GPU Fence to RGBASurfaceBackend
|
||||
|
||||
**File**: `RGBASurfaceBackend.h`
|
||||
|
||||
```cpp
|
||||
class RGBASurfaceBackend : public IVideoBackend {
|
||||
public:
|
||||
// ... existing methods ...
|
||||
|
||||
// New method: Wait for GPU copy to complete
|
||||
HRESULT WaitForCopyCompletion();
|
||||
|
||||
private:
|
||||
// ... existing members ...
|
||||
|
||||
// GPU synchronization for copy operations
|
||||
ComPtr<ID3D12Fence> m_copyFence;
|
||||
UINT64 m_copyFenceValue = 0;
|
||||
HANDLE m_copyFenceEvent = nullptr;
|
||||
};
|
||||
```
|
||||
|
||||
#### 1.2 Create Fence in Initialize
|
||||
|
||||
**File**: `RGBASurfaceBackend.cpp` (in `Initialize()` method)
|
||||
|
||||
```cpp
|
||||
// Create fence for GPU copy synchronization
|
||||
HRESULT hr = m_device->CreateFence(
|
||||
0,
|
||||
D3D12_FENCE_FLAG_NONE,
|
||||
IID_PPV_ARGS(&m_copyFence)
|
||||
);
|
||||
if (FAILED(hr)) {
|
||||
return hr;
|
||||
}
|
||||
|
||||
// Create fence event
|
||||
m_copyFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
|
||||
if (m_copyFenceEvent == nullptr) {
|
||||
return HRESULT_FROM_WIN32(GetLastError());
|
||||
}
|
||||
```
|
||||
|
||||
#### 1.3 Signal Fence After Copy Submission
|
||||
|
||||
**File**: `RGBASurfaceBackend.cpp` (in `CopyToStagingTexture()`)
|
||||
|
||||
```cpp
|
||||
HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture) {
|
||||
// ... existing copy commands ...
|
||||
|
||||
m_copyCommandList->Close();
|
||||
ID3D12CommandList* commandLists[] = { m_copyCommandList.Get() };
|
||||
m_commandQueue->ExecuteCommandLists(1, commandLists);
|
||||
|
||||
// Signal fence after copy submission
|
||||
m_copyFenceValue++;
|
||||
HRESULT hr = m_commandQueue->Signal(m_copyFence.Get(), m_copyFenceValue);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[CopyToStagingTexture] Failed to signal fence: 0x%08X", hr);
|
||||
return hr;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[CopyToStagingTexture] GPU copy submitted (fence value: %llu)",
|
||||
m_copyFenceValue);
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
```
|
||||
|
||||
#### 1.4 Implement Wait Method
|
||||
|
||||
**File**: `RGBASurfaceBackend.cpp` (new method)
|
||||
|
||||
```cpp
|
||||
HRESULT RGBASurfaceBackend::WaitForCopyCompletion() {
|
||||
// Check if copy already completed
|
||||
if (m_copyFence->GetCompletedValue() >= m_copyFenceValue) {
|
||||
return S_OK; // Already complete
|
||||
}
|
||||
|
||||
// Wait for GPU copy to complete
|
||||
HRESULT hr = m_copyFence->SetEventOnCompletion(
|
||||
m_copyFenceValue,
|
||||
m_copyFenceEvent
|
||||
);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[WaitForCopyCompletion] SetEventOnCompletion failed: 0x%08X", hr);
|
||||
return hr;
|
||||
}
|
||||
|
||||
DWORD waitResult = WaitForSingleObject(m_copyFenceEvent, 5000); // 5 second timeout
|
||||
if (waitResult != WAIT_OBJECT_0) {
|
||||
LOGF_ERROR("[WaitForCopyCompletion] Wait failed or timed out: %lu", waitResult);
|
||||
return E_FAIL;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[WaitForCopyCompletion] GPU copy completed (fence value: %llu)",
|
||||
m_copyFenceValue);
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
```
|
||||
|
||||
#### 1.5 Call Wait in FrameProcessor
|
||||
|
||||
**File**: `FrameProcessor.cpp` (in `ProcessFrame()`)
|
||||
|
||||
```cpp
|
||||
// After successful decode, copy to staging texture for safe rendering
|
||||
if (result == VAVCORE_SUCCESS) {
|
||||
auto backend = m_renderer->GetRGBASurfaceBackend();
|
||||
if (backend) {
|
||||
HRESULT hr = backend->CopyToStagingTexture(rgbaTexture);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[FrameProcessor] Failed to copy to staging texture: 0x%08X", hr);
|
||||
} else {
|
||||
// Wait for GPU copy to complete before proceeding
|
||||
hr = backend->WaitForCopyCompletion();
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[FrameProcessor] Failed to wait for copy completion: 0x%08X", hr);
|
||||
} else {
|
||||
LOGF_DEBUG("[FrameProcessor] GPU copy completed, staging texture ready");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### 1.6 Cleanup in Shutdown
|
||||
|
||||
**File**: `RGBASurfaceBackend.cpp` (in `Shutdown()`)
|
||||
|
||||
```cpp
|
||||
// Close fence event handle
|
||||
if (m_copyFenceEvent != nullptr) {
|
||||
CloseHandle(m_copyFenceEvent);
|
||||
m_copyFenceEvent = nullptr;
|
||||
}
|
||||
|
||||
// Release fence
|
||||
m_copyFence.Reset();
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Solution 2: NVDEC Decode Completion Wait
|
||||
|
||||
**Objective**: Ensure NVDEC hardware decoding completes before accessing decoded surface
|
||||
|
||||
**Implementation**:
|
||||
|
||||
#### 2.1 Find Decode Slot in DecodeToSurface
|
||||
|
||||
**File**: `NVDECAV1Decoder.cpp` (in `DecodeToSurface()`, after FIFO wait)
|
||||
|
||||
```cpp
|
||||
// After FIFO ordering wait, find the actual slot used by this submission
|
||||
int slot_idx = -1;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_submissionMutex);
|
||||
|
||||
// Search for slot matching this submission_id
|
||||
for (size_t i = 0; i < RING_BUFFER_SIZE; i++) {
|
||||
if (m_ringBuffer[i].submission_id == submission_id &&
|
||||
m_ringBuffer[i].in_use.load()) {
|
||||
slot_idx = static_cast<int>(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (slot_idx < 0) {
|
||||
LOGF_ERROR("[DecodeToSurface] Failed to find decode slot for submission_id=%llu",
|
||||
submission_id);
|
||||
return VAVCORE_ERROR_DECODE_FAILED;
|
||||
}
|
||||
```
|
||||
|
||||
#### 2.2 Wait for Decode Completion
|
||||
|
||||
**File**: `NVDECAV1Decoder.cpp` (in `DecodeToSurface()`, after finding slot)
|
||||
|
||||
```cpp
|
||||
DecodeSlot& slot = m_ringBuffer[slot_idx];
|
||||
|
||||
// Wait for NVDEC decode to complete (signaled by polling thread)
|
||||
{
|
||||
std::unique_lock<std::mutex> slot_lock(slot.slot_mutex);
|
||||
|
||||
bool decode_ready = slot.frame_ready.wait_for(
|
||||
slot_lock,
|
||||
std::chrono::milliseconds(500), // 500ms timeout
|
||||
[&slot]() {
|
||||
return slot.is_ready.load();
|
||||
}
|
||||
);
|
||||
|
||||
if (!decode_ready) {
|
||||
LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d (submission_id=%llu)",
|
||||
slot_idx, submission_id);
|
||||
|
||||
// Mark slot as failed
|
||||
slot.decoding_failed.store(true);
|
||||
slot.in_use.store(false);
|
||||
|
||||
return VAVCORE_ERROR_DECODE_TIMEOUT;
|
||||
}
|
||||
|
||||
// Check if decoding failed
|
||||
if (slot.decoding_failed.load()) {
|
||||
LOGF_ERROR("[DecodeToSurface] Decode failed for slot %d (submission_id=%llu)",
|
||||
slot_idx, submission_id);
|
||||
|
||||
slot.in_use.store(false);
|
||||
return VAVCORE_ERROR_DECODE_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[DecodeToSurface] Decode completed for slot %d (submission_id=%llu)",
|
||||
slot_idx, submission_id);
|
||||
```
|
||||
|
||||
#### 2.3 Update Polling Thread to Signal Readiness
|
||||
|
||||
**File**: `NVDECAV1Decoder.cpp` (in `PollingThreadFunc()`)
|
||||
|
||||
**Ensure polling thread properly signals `slot.is_ready` when `cuvidGetDecodeStatus()` returns success**
|
||||
|
||||
```cpp
|
||||
void NVDECAV1Decoder::PollingThreadFunc() {
|
||||
while (m_pollingRunning.load()) {
|
||||
// Poll all active slots
|
||||
for (size_t i = 0; i < RING_BUFFER_SIZE; i++) {
|
||||
DecodeSlot& slot = m_ringBuffer[i];
|
||||
|
||||
if (!slot.in_use.load() || slot.is_ready.load()) {
|
||||
continue; // Skip inactive or already ready slots
|
||||
}
|
||||
|
||||
int pic_idx = slot.picture_index;
|
||||
if (pic_idx < 0) {
|
||||
continue; // No picture assigned yet
|
||||
}
|
||||
|
||||
// Check decode status
|
||||
CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx);
|
||||
|
||||
if (result == CUDA_SUCCESS) {
|
||||
// Decode complete
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready.store(true);
|
||||
}
|
||||
slot.frame_ready.notify_all();
|
||||
|
||||
LOGF_DEBUG("[PollingThread] Slot %zu ready (pic_idx=%d)", i, pic_idx);
|
||||
} else if (result == CUDA_ERROR_NOT_READY) {
|
||||
// Still decoding, continue polling
|
||||
} else {
|
||||
// Decode error
|
||||
LOGF_ERROR("[PollingThread] Decode error for slot %zu (pic_idx=%d): %d",
|
||||
i, pic_idx, result);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.decoding_failed.store(true);
|
||||
slot.is_ready.store(true); // Signal to wake up waiter
|
||||
}
|
||||
slot.frame_ready.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
// Poll interval: 1ms for responsiveness
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[PollingThread] Polling thread exiting");
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Solution 3: Playback Timing Strategy Redesign
|
||||
|
||||
**Objective**: Maintain fixed 33.33ms frame intervals regardless of decode time variation
|
||||
|
||||
**Implementation**:
|
||||
|
||||
#### 3.1 Callback-First with Absolute Target Tracking
|
||||
|
||||
**File**: `PlaybackController.cpp` (in `TimingThreadLoop()`)
|
||||
|
||||
```cpp
|
||||
void PlaybackController::TimingThreadLoop()
|
||||
{
|
||||
// Set Windows timer resolution to 1ms for accurate sleep
|
||||
timeBeginPeriod(1);
|
||||
LOGF_INFO("[PlaybackController] Set Windows timer resolution to 1ms");
|
||||
|
||||
double baseIntervalMs = 1000.0 / m_frameRate;
|
||||
auto startTime = std::chrono::high_resolution_clock::now();
|
||||
auto nextFrameTarget = startTime;
|
||||
|
||||
LOGF_INFO("[PlaybackController] Timing thread loop started (target: %.2f fps, %.2f ms per frame)",
|
||||
m_frameRate, baseIntervalMs);
|
||||
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Apply playback speed
|
||||
double speed = m_playbackSpeed.load();
|
||||
double targetIntervalMs = baseIntervalMs / speed;
|
||||
|
||||
// Invoke callback FIRST (blocking decode + render)
|
||||
// This allows decode time to vary (6-20ms) without affecting frame interval
|
||||
auto callbackStart = std::chrono::high_resolution_clock::now();
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback();
|
||||
}
|
||||
auto callbackEnd = std::chrono::high_resolution_clock::now();
|
||||
double callbackTime = std::chrono::duration<double, std::milli>(callbackEnd - callbackStart).count();
|
||||
|
||||
// Calculate next frame target (fixed interval from start time)
|
||||
nextFrameTarget += std::chrono::microseconds(static_cast<long long>(targetIntervalMs * 1000));
|
||||
|
||||
// Sleep until next frame target
|
||||
auto now = std::chrono::high_resolution_clock::now();
|
||||
auto sleepDuration = nextFrameTarget - now;
|
||||
|
||||
if (sleepDuration.count() > 0) {
|
||||
// Sleep for remaining time
|
||||
std::this_thread::sleep_until(nextFrameTarget);
|
||||
|
||||
double sleepTime = std::chrono::duration<double, std::milli>(sleepDuration).count();
|
||||
LOGF_DEBUG("[PlaybackController] Frame %llu timing: callback=%.2fms, sleep=%.2fms",
|
||||
m_currentFrame, callbackTime, sleepTime);
|
||||
} else {
|
||||
// Missed target - log warning
|
||||
double missedBy = std::chrono::duration<double, std::milli>(-sleepDuration).count();
|
||||
LOGF_WARNING("[PlaybackController] Frame %llu MISSED target by %.2fms (callback took %.2fms)",
|
||||
m_currentFrame, missedBy, callbackTime);
|
||||
|
||||
// Reset target to current time to avoid cumulative drift
|
||||
nextFrameTarget = now;
|
||||
}
|
||||
|
||||
// Update current time
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
}
|
||||
|
||||
// Restore Windows timer resolution
|
||||
timeEndPeriod(1);
|
||||
LOGF_INFO("[PlaybackController] Timing thread loop exited, timer resolution restored");
|
||||
}
|
||||
```
|
||||
|
||||
#### 3.2 Timing Characteristics
|
||||
|
||||
**New Timing Behavior**:
|
||||
```
|
||||
Frame 0 (Normal, 20ms decode):
|
||||
[Callback: 20ms] [Sleep: 13.33ms] = 33.33ms total ✓
|
||||
|
||||
Frame 1 (Normal, 20ms decode):
|
||||
[Callback: 20ms] [Sleep: 13.33ms] = 33.33ms total ✓
|
||||
|
||||
Frame 2 (Display-only, 6ms decode):
|
||||
[Callback: 6ms] [Sleep: 27.33ms] = 33.33ms total ✓
|
||||
|
||||
Frame 3 (Normal, 20ms decode):
|
||||
[Callback: 20ms] [Sleep: 13.33ms] = 33.33ms total ✓
|
||||
```
|
||||
|
||||
**Benefits**:
|
||||
- ✅ Fixed 33.33ms frame interval maintained
|
||||
- ✅ Decode time variation absorbed by sleep duration
|
||||
- ✅ No cumulative timing drift
|
||||
- ✅ Consistent VSync alignment (each frame spans 2 VSync cycles at 60Hz = 33.33ms)
|
||||
|
||||
---
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
### Phase 1: Solution 1 (GPU Copy Sync)
|
||||
1. Add fence/event members to `RGBASurfaceBackend.h`
|
||||
2. Create fence in `Initialize()`, cleanup in `Shutdown()`
|
||||
3. Implement `WaitForCopyCompletion()`
|
||||
4. Signal fence in `CopyToStagingTexture()`
|
||||
5. Call wait in `FrameProcessor::ProcessFrame()`
|
||||
6. **Build and test**: Verify staging texture stability
|
||||
|
||||
### Phase 2: Solution 2 (NVDEC Decode Sync)
|
||||
1. Update `DecodeToSurface()` to find slot after FIFO wait
|
||||
2. Add decode completion wait with timeout
|
||||
3. Update `PollingThreadFunc()` to properly signal readiness
|
||||
4. Add error handling for decode failures/timeouts
|
||||
5. **Build and test**: Verify decode completion before surface access
|
||||
|
||||
### Phase 3: Solution 3 (Playback Timing)
|
||||
1. Redesign `TimingThreadLoop()` with callback-first strategy
|
||||
2. Implement absolute frame target tracking
|
||||
3. Add missed frame detection and recovery
|
||||
4. **Build and test**: Verify consistent 33.33ms frame intervals
|
||||
|
||||
### Phase 4: Integration Testing
|
||||
1. Run full playback test with all 3 fixes
|
||||
2. Measure frame timing consistency
|
||||
3. Verify stutter elimination
|
||||
4. Performance profiling (GPU/CPU utilization)
|
||||
|
||||
---
|
||||
|
||||
## Expected Results
|
||||
|
||||
### Before Fix
|
||||
- GPU copy race: Flickering/tearing artifacts
|
||||
- NVDEC not ready: Partial decoded frames
|
||||
- Timing irregular: 39-53ms frame intervals instead of the 33.33ms target (39% variation)
|
||||
- User perception: **Severe stuttering** ("통통 튀는 현상")
|
||||
|
||||
### After Fix
|
||||
- GPU copy complete: Stable staging texture
|
||||
- NVDEC verified: Complete decoded frames
|
||||
- Timing fixed: Consistent 33.33ms intervals (±1ms tolerance)
|
||||
- User perception: **Smooth 30fps playback**
|
||||
|
||||
---
|
||||
|
||||
## Performance Impact
|
||||
|
||||
### CPU Impact
|
||||
- GPU fence wait: ~0.1ms per frame (minimal, GPU-bound)
|
||||
- NVDEC status poll: ~1ms per frame (already running in background)
|
||||
- Timing redesign: No additional CPU overhead
|
||||
|
||||
### GPU Impact
|
||||
- Fence overhead: Negligible (native GPU operation)
|
||||
- No additional GPU work introduced
|
||||
|
||||
### Latency Impact
|
||||
- Added synchronization: +1-2ms per frame
|
||||
- **Trade-off**: Slightly higher latency for stability and smoothness
|
||||
- Still well within 30fps budget (33.33ms)
|
||||
|
||||
---
|
||||
|
||||
## Risks and Mitigations
|
||||
|
||||
### Risk 1: Fence Wait Timeout
|
||||
**Mitigation**: 5-second timeout with error logging; on timeout the error is logged and frame processing continues
|
||||
|
||||
### Risk 2: NVDEC Decode Timeout
|
||||
**Mitigation**: 500ms timeout, mark slot as failed, continue with next frame
|
||||
|
||||
### Risk 3: Callback Takes >33ms
|
||||
**Mitigation**: Detect missed frames, log warning, reset timing target to prevent drift
|
||||
|
||||
---
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests
|
||||
- GPU fence creation/signaling/waiting
|
||||
- NVDEC decode status polling accuracy
|
||||
- Frame timing calculation correctness
|
||||
|
||||
### Integration Tests
|
||||
- Full 30fps playback for 60 seconds
|
||||
- Frame interval histogram (should cluster at 33.33ms ±1ms)
|
||||
- Visual inspection for stutter/artifacts
|
||||
|
||||
### Performance Tests
|
||||
- CPU utilization during playback
|
||||
- GPU utilization during playback
|
||||
- Memory usage over extended playback
|
||||
|
||||
---
|
||||
|
||||
## Success Criteria
|
||||
|
||||
1. **No GPU copy race**: Staging texture content stable across frames
|
||||
2. **NVDEC decode verified**: All frames fully decoded before access
|
||||
3. **Frame timing consistent**: 95% of frames within 33.33ms ±2ms
|
||||
4. **Stutter eliminated**: No visible "jumping" or irregular motion
|
||||
5. **Performance acceptable**: <5% CPU overhead, <2ms added latency
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- Previous fix: `NVDEC_Frame_Reordering_Fix_Design.md`
|
||||
- CUDA Documentation: `cuvidGetDecodeStatus()`
|
||||
- D3D12 Documentation: `ID3D12Fence`, `SetEventOnCompletion()`
|
||||
- Windows Multimedia Timer: `timeBeginPeriod()`
|
||||
|
||||
---
|
||||
|
||||
**Document Status**: Design Complete, Ready for Implementation
|
||||
**Next Step**: Implement Solution 1 (GPU Copy Sync)
|
||||
@@ -109,6 +109,14 @@ bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
|
||||
HRESULT hr = backend->CopyToStagingTexture(rgbaTexture);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[FrameProcessor] Failed to copy to staging texture: 0x%08X", hr);
|
||||
} else {
|
||||
// Wait for GPU copy to complete before proceeding
|
||||
hr = backend->WaitForCopyCompletion();
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[FrameProcessor] Failed to wait for copy completion: 0x%08X", hr);
|
||||
} else {
|
||||
LOGF_DEBUG("[FrameProcessor] GPU copy completed, staging texture ready");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,35 +334,23 @@ void PlaybackController::TimingThreadLoop()
|
||||
LOGF_INFO("[PlaybackController] Set Windows timer resolution to 1ms");
|
||||
|
||||
double baseIntervalMs = 1000.0 / m_frameRate;
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
auto lastFrameTime = start;
|
||||
auto startTime = std::chrono::high_resolution_clock::now();
|
||||
auto nextFrameTarget = startTime;
|
||||
|
||||
LOGF_INFO("[PlaybackController] Timing thread loop started (target: %.2f fps, %.2f ms per frame)",
|
||||
m_frameRate, baseIntervalMs);
|
||||
|
||||
while (!m_shouldStopTiming && m_isPlaying) {
|
||||
auto frameStart = std::chrono::high_resolution_clock::now();
|
||||
double sinceLast = std::chrono::duration<double, std::milli>(frameStart - lastFrameTime).count();
|
||||
|
||||
LOGF_INFO("[PlaybackController] ==== FRAME %llu START (%.2f ms since last) ====",
|
||||
m_currentFrame, sinceLast);
|
||||
|
||||
// Apply playback speed: smaller interval = faster playback
|
||||
// Apply playback speed
|
||||
double speed = m_playbackSpeed.load();
|
||||
double targetIntervalMs = baseIntervalMs / speed;
|
||||
|
||||
// Sleep FIRST to maintain precise frame timing
|
||||
// This ensures callback blocking (decode time) doesn't affect interval
|
||||
if (targetIntervalMs > 0.0) {
|
||||
auto sleepDuration = std::chrono::microseconds(static_cast<long long>(targetIntervalMs * 1000));
|
||||
std::this_thread::sleep_for(sleepDuration);
|
||||
}
|
||||
LOGF_INFO("[PlaybackController] ==== FRAME %llu START ====", m_currentFrame);
|
||||
|
||||
auto afterSleep = std::chrono::high_resolution_clock::now();
|
||||
double sleepTime = std::chrono::duration<double, std::milli>(afterSleep - frameStart).count();
|
||||
|
||||
// Invoke frame-ready callback AFTER sleep
|
||||
// Callback may block (10-15ms decode), but this doesn't affect next frame timing
|
||||
// Invoke callback FIRST (blocking decode + render)
|
||||
// This allows decode time to vary (6-20ms) without affecting frame interval
|
||||
auto callbackStart = std::chrono::high_resolution_clock::now();
|
||||
if (m_frameReadyCallback) {
|
||||
m_frameReadyCallback();
|
||||
@@ -370,16 +358,35 @@ void PlaybackController::TimingThreadLoop()
|
||||
auto callbackEnd = std::chrono::high_resolution_clock::now();
|
||||
double callbackTime = std::chrono::duration<double, std::milli>(callbackEnd - callbackStart).count();
|
||||
|
||||
// Calculate next frame target (fixed interval from start time)
|
||||
nextFrameTarget += std::chrono::microseconds(static_cast<long long>(targetIntervalMs * 1000));
|
||||
|
||||
// Sleep until next frame target
|
||||
auto now = std::chrono::high_resolution_clock::now();
|
||||
auto sleepDuration = nextFrameTarget - now;
|
||||
|
||||
if (sleepDuration.count() > 0) {
|
||||
// Sleep for remaining time
|
||||
std::this_thread::sleep_until(nextFrameTarget);
|
||||
|
||||
double sleepTime = std::chrono::duration<double, std::milli>(sleepDuration).count();
|
||||
double totalTime = std::chrono::duration<double, std::milli>(nextFrameTarget - frameStart).count();
|
||||
|
||||
LOGF_INFO("[PlaybackController] Frame %llu complete: callback=%.2fms, sleep=%.2fms, total=%.2fms",
|
||||
m_currentFrame, callbackTime, sleepTime, totalTime);
|
||||
} else {
|
||||
// Missed target - log warning
|
||||
double missedBy = std::chrono::duration<double, std::milli>(-sleepDuration).count();
|
||||
LOGF_WARNING("[PlaybackController] Frame %llu MISSED target by %.2fms (callback took %.2fms)",
|
||||
m_currentFrame, missedBy, callbackTime);
|
||||
|
||||
// Reset target to current time to avoid cumulative drift
|
||||
nextFrameTarget = now;
|
||||
}
|
||||
|
||||
// Update current time
|
||||
m_currentFrame++;
|
||||
m_currentTime = m_currentFrame / m_frameRate;
|
||||
|
||||
double totalFrameTime = std::chrono::duration<double, std::milli>(callbackEnd - frameStart).count();
|
||||
|
||||
LOGF_INFO("[PlaybackController] Frame %llu complete (total: %.2f ms, sleep: %.2f ms, callback: %.2f ms)",
|
||||
m_currentFrame - 1, totalFrameTime, sleepTime, callbackTime);
|
||||
|
||||
lastFrameTime = frameStart;
|
||||
}
|
||||
|
||||
// Restore Windows timer resolution
|
||||
|
||||
@@ -61,6 +61,15 @@ void RGBASurfaceBackend::Shutdown() {
|
||||
m_copyCommandAllocator.Reset();
|
||||
m_stagingTexture.Reset();
|
||||
|
||||
// Close fence event handle
|
||||
if (m_copyFenceEvent != nullptr) {
|
||||
CloseHandle(m_copyFenceEvent);
|
||||
m_copyFenceEvent = nullptr;
|
||||
}
|
||||
|
||||
// Release fence
|
||||
m_copyFence.Reset();
|
||||
|
||||
// Clear references (not owned)
|
||||
m_device = nullptr;
|
||||
m_commandQueue = nullptr;
|
||||
@@ -138,6 +147,37 @@ HRESULT RGBASurfaceBackend::CreateVideoTexture(uint32_t width, uint32_t height)
|
||||
return hr;
|
||||
}
|
||||
|
||||
// Create fence for GPU copy synchronization
|
||||
hr = m_device->CreateFence(
|
||||
0,
|
||||
D3D12_FENCE_FLAG_NONE,
|
||||
IID_PPV_ARGS(&m_copyFence)
|
||||
);
|
||||
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] Failed to create copy fence: 0x%08X", hr);
|
||||
m_stagingTexture.Reset();
|
||||
for (int i = 0; i < BUFFER_COUNT; i++) {
|
||||
m_rgbaTextures[i].Reset();
|
||||
}
|
||||
return hr;
|
||||
}
|
||||
|
||||
// Create fence event for CPU wait
|
||||
m_copyFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
|
||||
if (m_copyFenceEvent == nullptr) {
|
||||
hr = HRESULT_FROM_WIN32(GetLastError());
|
||||
LOGF_ERROR("[RGBASurfaceBackend] Failed to create fence event: 0x%08X", hr);
|
||||
m_copyFence.Reset();
|
||||
m_stagingTexture.Reset();
|
||||
for (int i = 0; i < BUFFER_COUNT; i++) {
|
||||
m_rgbaTextures[i].Reset();
|
||||
}
|
||||
return hr;
|
||||
}
|
||||
|
||||
LOGF_INFO("[RGBASurfaceBackend] Copy fence and event created successfully");
|
||||
|
||||
// Create command allocator and list for texture copy operations
|
||||
hr = m_device->CreateCommandAllocator(
|
||||
D3D12_COMMAND_LIST_TYPE_DIRECT,
|
||||
@@ -594,6 +634,16 @@ HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture)
|
||||
return hr;
|
||||
}
|
||||
|
||||
// Transition staging texture from PIXEL_SHADER_RESOURCE to COPY_DEST
|
||||
// (First frame: starts in COPY_DEST, subsequent frames: comes from PIXEL_SHADER_RESOURCE)
|
||||
D3D12_RESOURCE_BARRIER stagingToCopyDest = {};
|
||||
stagingToCopyDest.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||
stagingToCopyDest.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
|
||||
stagingToCopyDest.Transition.pResource = m_stagingTexture.Get();
|
||||
stagingToCopyDest.Transition.StateBefore = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
|
||||
stagingToCopyDest.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
|
||||
stagingToCopyDest.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||
|
||||
// Transition source texture to COPY_SOURCE
|
||||
D3D12_RESOURCE_BARRIER sourceBarrier = {};
|
||||
sourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
|
||||
@@ -603,7 +653,8 @@ HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture)
|
||||
sourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
|
||||
sourceBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
|
||||
|
||||
m_copyCommandList->ResourceBarrier(1, &sourceBarrier);
|
||||
D3D12_RESOURCE_BARRIER barriers[] = { stagingToCopyDest, sourceBarrier };
|
||||
m_copyCommandList->ResourceBarrier(2, barriers);
|
||||
|
||||
// Copy texture
|
||||
m_copyCommandList->CopyResource(m_stagingTexture.Get(), sourceTexture);
|
||||
@@ -635,10 +686,48 @@ HRESULT RGBASurfaceBackend::CopyToStagingTexture(ID3D12Resource* sourceTexture)
|
||||
ID3D12CommandList* commandLists[] = { m_copyCommandList.Get() };
|
||||
m_commandQueue->ExecuteCommandLists(1, commandLists);
|
||||
|
||||
// Note: We don't wait for completion here - the next Present() will sync via fence
|
||||
// This allows the copy to happen asynchronously while decoder continues
|
||||
// Signal fence after copy submission
|
||||
m_copyFenceValue++;
|
||||
hr = m_commandQueue->Signal(m_copyFence.Get(), m_copyFenceValue);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] Failed to signal copy fence: 0x%08X", hr);
|
||||
return hr;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[RGBASurfaceBackend] Copied decoder texture to staging texture (async)");
|
||||
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy submitted (fence value: %llu)", m_copyFenceValue);
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
HRESULT RGBASurfaceBackend::WaitForCopyCompletion() {
|
||||
if (!m_copyFence || m_copyFenceEvent == nullptr) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] Copy fence or event not initialized");
|
||||
return E_NOT_VALID_STATE;
|
||||
}
|
||||
|
||||
// Check if copy already completed
|
||||
if (m_copyFence->GetCompletedValue() >= m_copyFenceValue) {
|
||||
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy already complete (fence value: %llu)", m_copyFenceValue);
|
||||
return S_OK; // Already complete
|
||||
}
|
||||
|
||||
// Wait for GPU copy to complete
|
||||
HRESULT hr = m_copyFence->SetEventOnCompletion(
|
||||
m_copyFenceValue,
|
||||
m_copyFenceEvent
|
||||
);
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] SetEventOnCompletion failed: 0x%08X", hr);
|
||||
return hr;
|
||||
}
|
||||
|
||||
DWORD waitResult = WaitForSingleObject(m_copyFenceEvent, 5000); // 5 second timeout
|
||||
if (waitResult != WAIT_OBJECT_0) {
|
||||
LOGF_ERROR("[RGBASurfaceBackend] Wait failed or timed out: %lu", waitResult);
|
||||
return E_FAIL;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[RGBASurfaceBackend] GPU copy completed (fence value: %llu)", m_copyFenceValue);
|
||||
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
@@ -52,6 +52,9 @@ public:
|
||||
// Copy decoder texture to stable staging texture
|
||||
HRESULT CopyToStagingTexture(ID3D12Resource* sourceTexture);
|
||||
|
||||
// Wait for GPU copy to complete
|
||||
HRESULT WaitForCopyCompletion();
|
||||
|
||||
// Get stable staging texture for rendering (never overwritten by decoder)
|
||||
ID3D12Resource* GetStagingTexture() const { return m_stagingTexture.Get(); }
|
||||
|
||||
@@ -87,6 +90,11 @@ private:
|
||||
ComPtr<ID3D12CommandAllocator> m_copyCommandAllocator;
|
||||
ComPtr<ID3D12GraphicsCommandList> m_copyCommandList;
|
||||
|
||||
// GPU synchronization for copy operations
|
||||
ComPtr<ID3D12Fence> m_copyFence;
|
||||
UINT64 m_copyFenceValue = 0;
|
||||
HANDLE m_copyFenceEvent = nullptr;
|
||||
|
||||
// Graphics pipeline for simple RGBA texture sampling
|
||||
ComPtr<ID3D12RootSignature> m_rootSignature;
|
||||
ComPtr<ID3D12PipelineState> m_pipelineState;
|
||||
|
||||
Reference in New Issue
Block a user