WIP
This commit is contained in:
23
todo16.txt
23
todo16.txt
@@ -371,3 +371,26 @@ ulNumDecodeSurfaces 갯수만큼 cuvidCreateDecoder() 인스턴스를 생성해
|
||||
|
||||
The core NVDEC integration is working - frames are being decoded successfully by the hardware. The remaining work is properly synchronizing the async decode pipeline with
|
||||
the multi-frame packet structure.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
● 이제 simple_test.webm으로 테스트하겠습니다:
|
||||
|
||||
● Bash("D:/Project/video-av1/vav2/platforms/windows/tests/red-surface-nvdec/bin/Debug/RedSurfaceNVDECTest.exe" "D:/Project/video-av1/sample/simple_test.webm") timeout: 1m 0s
|
||||
⎿ Error: Command timed out after 1m 0s [NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
|
||||
[NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
|
||||
|
||||
[DllMain] VavCore.dll loaded - Process Attach
|
||||
[DllMain] Safe to initialize: Ready for vavcore_initialize() call
|
||||
[RedSurfaceNVDECTest] Starting test...
|
||||
[RedSurfaceNVDECTest] Video: D:/Project/video-av1/sample/simple_test.webm
|
||||
|
||||
[Step 1] Initializing VavCore...
|
||||
[DllMain] Performing safe decoder registration...
|
||||
… +46 lines (ctrl+o to see all)
|
||||
|
||||
● ChromaFormat 3 (YUV444)가 문제입니다. 이건 Option C 문제가 아니라 기존 문제입니다. test_720p_stripe.webm으로 테스트하겠습니다:
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# NVDEC RingBuffer-based Asynchronous Decoding Design
|
||||
|
||||
**Date**: 2025-10-04
|
||||
**Status**: 🔄 **In Progress**
|
||||
**Target**: NVDECAV1Decoder DecodeToSurface() optimization
|
||||
**Date**: 2025-10-05 (Updated)
|
||||
**Status**: ✅ **Design Finalized - Ready for Implementation**
|
||||
**Target**: NVDECAV1Decoder DecodeToSurface() optimization with multi-frame packet support
|
||||
|
||||
---
|
||||
|
||||
@@ -78,6 +78,32 @@ Thread B: pop() → picture_index=4 ❌ (gets packet1 result!)
|
||||
|
||||
---
|
||||
|
||||
#### Issue 3: Multi-Frame Packet Handling ⚠️ **Critical Discovery**
|
||||
|
||||
**Scenario**: Single WebM packet contains multiple AV1 frames
|
||||
|
||||
```cpp
|
||||
// DecodeToSurface called ONCE
|
||||
DecodeToSurface(packet_175bytes, surface1, frame1);
|
||||
|
||||
// NVDEC parser extracts MULTIPLE frames from single packet:
|
||||
→ HandlePictureDecode(CurrPicIdx=0, IntraPicFlag=1) // I-frame
|
||||
→ HandlePictureDecode(CurrPicIdx=1, IntraPicFlag=0) // P-frame
|
||||
→ HandlePictureDecode(CurrPicIdx=2, IntraPicFlag=0) // P-frame
|
||||
→ ... (up to 8 frames in one packet)
|
||||
|
||||
// Problem: Which picture_index should be returned?
|
||||
// Current design assumes: 1 packet = 1 frame ❌
|
||||
```
|
||||
|
||||
**Impact**:
|
||||
- Slot allocation assumes 1 packet → 1 slot → 1 picture_index
|
||||
- Reality: 1 packet → 1 slot → **N picture_indices**
|
||||
- Must track multiple picture_indices per slot
|
||||
- Must decide which frame to return (first? last? all?)
|
||||
|
||||
---
|
||||
|
||||
#### Issue 3: ulNumOutputSurfaces Underutilization
|
||||
|
||||
**NVDEC Configuration**:
|
||||
@@ -172,13 +198,14 @@ struct DecodeSlot {
|
||||
void* target_surface; // Destination D3D12 resource
|
||||
VavCoreSurfaceType surface_type; // Surface type
|
||||
|
||||
// NVDEC information (from HandlePictureDisplay callback)
|
||||
int picture_index; // NVDEC frame index for cuvidMapVideoFrame
|
||||
// NVDEC information (from HandlePictureDecode callback)
|
||||
// ⚠️ Multi-frame support: One packet can decode to multiple frames
|
||||
std::vector<int> picture_indices; // All NVDEC frame indices from this packet
|
||||
|
||||
// Synchronization primitives
|
||||
std::condition_variable frame_ready; // Signaled when decode complete
|
||||
std::condition_variable frame_ready; // Signaled when ALL frames are decoded
|
||||
std::mutex slot_mutex; // Protects this slot's state
|
||||
bool is_ready; // Decode completed flag
|
||||
bool is_ready; // All frames decoded flag
|
||||
};
|
||||
```
|
||||
|
||||
@@ -191,9 +218,9 @@ private:
|
||||
|
||||
DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
|
||||
|
||||
// Producer-consumer indices
|
||||
std::atomic<size_t> m_submitIndex{0}; // Next slot to allocate (producer)
|
||||
std::atomic<size_t> m_returnIndex{0}; // Next slot to return (consumer)
|
||||
// 🎯 Option C: Unified slot allocation counter (no mapping needed!)
|
||||
std::atomic<uint64_t> m_slotIdCounter{0}; // Monotonically increasing slot ID
|
||||
std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
|
||||
|
||||
// Polling thread
|
||||
std::thread m_pollingThread;
|
||||
@@ -207,14 +234,15 @@ private:
|
||||
|
||||
### Component 1: Slot Allocation (Producer)
|
||||
|
||||
**Purpose**: Assign RingBuffer slot to each DecodeToSurface call
|
||||
**Purpose**: Assign RingBuffer slot to each DecodeToSurface call using Option C design
|
||||
|
||||
```cpp
|
||||
// In DecodeToSurface()
|
||||
|
||||
// 1. Allocate next available slot
|
||||
size_t my_slot_idx = m_submitIndex.fetch_add(1) % RING_BUFFER_SIZE;
|
||||
DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
|
||||
// 🎯 Option C: Allocate unique ID (serves as both slot ID and submission order)
|
||||
uint64_t my_id = m_slotIdCounter.fetch_add(1);
|
||||
size_t slot_idx = my_id % RING_BUFFER_SIZE;
|
||||
DecodeSlot& my_slot = m_ringBuffer[slot_idx];
|
||||
|
||||
// 2. Check for overflow
|
||||
{
|
||||
@@ -230,27 +258,29 @@ DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
|
||||
my_slot.in_use = true;
|
||||
my_slot.target_surface = target_surface;
|
||||
my_slot.surface_type = target_type;
|
||||
my_slot.picture_index = -1; // Set by HandlePictureDisplay
|
||||
my_slot.picture_indices.clear(); // Multi-frame support: clear previous frames
|
||||
my_slot.is_ready = false;
|
||||
}
|
||||
```
|
||||
|
||||
**Atomic Counter Behavior**:
|
||||
**🎯 Option C: No Mapping Needed - Direct Calculation**:
|
||||
```
|
||||
Thread 1: m_submitIndex.fetch_add(1) → 0 % 8 = slot[0]
|
||||
Thread 2: m_submitIndex.fetch_add(1) → 1 % 8 = slot[1]
|
||||
Thread 3: m_submitIndex.fetch_add(1) → 2 % 8 = slot[2]
|
||||
Thread 1: m_slotIdCounter.fetch_add(1) → ID=0, slot_idx = 0 % 8 = slot[0]
|
||||
Thread 2: m_slotIdCounter.fetch_add(1) → ID=1, slot_idx = 1 % 8 = slot[1]
|
||||
Thread 3: m_slotIdCounter.fetch_add(1) → ID=2, slot_idx = 2 % 8 = slot[2]
|
||||
...
|
||||
Thread 9: m_submitIndex.fetch_add(1) → 8 % 8 = slot[0] (wrap around)
|
||||
Thread 9: m_slotIdCounter.fetch_add(1) → ID=8, slot_idx = 8 % 8 = slot[0] (wrap around)
|
||||
```
|
||||
|
||||
**Key Advantage**: ID → slot_idx calculation is **deterministic**, no map storage needed!
|
||||
|
||||
**Overflow Protection**: If `slot[0].in_use == true` when Thread 9 arrives → error
|
||||
|
||||
---
|
||||
|
||||
### Component 2: Packet Submission
|
||||
|
||||
**Purpose**: Submit packet to NVDEC with slot index tracking
|
||||
**Purpose**: Submit packet to NVDEC with slot ID tracking via timestamp
|
||||
|
||||
```cpp
|
||||
// 4. Submit packet to NVDEC parser
|
||||
@@ -258,7 +288,7 @@ CUVIDSOURCEDATAPACKET packet = {};
|
||||
packet.payload = packet_data;
|
||||
packet.payload_size = packet_size;
|
||||
packet.flags = CUVID_PKT_ENDOFPICTURE;
|
||||
packet.timestamp = my_slot_idx; // ✅ Embed slot index in timestamp
|
||||
packet.timestamp = static_cast<int64_t>(my_id); // 🎯 Pass full slot_id (NOT modulo!)
|
||||
|
||||
CUresult result = cuvidParseVideoData(m_parser, &packet);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
@@ -267,91 +297,99 @@ if (result != CUDA_SUCCESS) {
|
||||
}
|
||||
```
|
||||
|
||||
**Timestamp Flow**:
|
||||
**🎯 Option C Timestamp Flow** (multi-frame packet support):
|
||||
```
|
||||
cuvidParseVideoData(packet, timestamp=2)
|
||||
cuvidParseVideoData(packet, timestamp=my_id=17)
|
||||
↓
|
||||
HandleVideoSequence() (first time only)
|
||||
↓
|
||||
HandlePictureDecode(timestamp=2)
|
||||
HandlePictureDecode(timestamp=17, CurrPicIdx=0) → slot_idx = 17 % 8 = 1
|
||||
→ m_ringBuffer[1].picture_indices.push_back(0)
|
||||
↓
|
||||
GPU decodes packet...
|
||||
HandlePictureDecode(timestamp=17, CurrPicIdx=1) → slot_idx = 17 % 8 = 1
|
||||
→ m_ringBuffer[1].picture_indices.push_back(1) // Same packet, multiple frames!
|
||||
↓
|
||||
HandlePictureDisplay(timestamp=2, picture_index=5)
|
||||
HandlePictureDecode(timestamp=17, CurrPicIdx=2) → slot_idx = 17 % 8 = 1
|
||||
→ m_ringBuffer[1].picture_indices.push_back(2)
|
||||
↓
|
||||
m_ringBuffer[2].picture_index = 5 // ✅ Slot 2 now linked to picture_index 5
|
||||
PollingThread checks ALL picture_indices for slot[1]
|
||||
↓
|
||||
When all complete: slot[1].is_ready = true, notify thread
|
||||
```
|
||||
|
||||
**Key Point**: Timestamp carries **full slot_id**, HandlePictureDecode calculates slot_idx directly
|
||||
|
||||
---
|
||||
|
||||
### Component 3: Polling Thread (Background Status Checker)
|
||||
|
||||
**Purpose**: Continuously poll `m_returnIndex` slot for decode completion
|
||||
**Purpose**: Continuously poll `m_returnIdCounter` slot for decode completion (multi-frame support)
|
||||
|
||||
```cpp
|
||||
void NVDECAV1Decoder::PollingThreadFunc() {
|
||||
while (m_pollingRunning) {
|
||||
// 1. Get current return slot (oldest pending decode)
|
||||
size_t current_return_idx = m_returnIndex.load();
|
||||
DecodeSlot& slot = m_ringBuffer[current_return_idx];
|
||||
// 1. Get current return ID and calculate slot index
|
||||
uint64_t current_return_id = m_returnIdCounter.load();
|
||||
size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
|
||||
DecodeSlot& slot = m_ringBuffer[slot_idx];
|
||||
|
||||
// 2. Check if slot is in use and not yet ready
|
||||
if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) {
|
||||
if (slot.in_use && !slot.is_ready) {
|
||||
|
||||
// 3. Query NVDEC for decode status
|
||||
CUVIDGETDECODESTATUS decodeStatus = {};
|
||||
CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
|
||||
// 3. Get copy of picture indices (multi-frame support)
|
||||
std::vector<int> picture_indices_copy;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
picture_indices_copy = slot.picture_indices;
|
||||
}
|
||||
|
||||
if (result == CUDA_SUCCESS) {
|
||||
if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
|
||||
// ✅ Decode complete!
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true;
|
||||
}
|
||||
// 4. Check if ALL frames are decoded
|
||||
bool all_complete = true;
|
||||
for (int pic_idx : picture_indices_copy) {
|
||||
CUVIDGETDECODESTATUS decodeStatus = {};
|
||||
CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
|
||||
|
||||
// Wake up waiting DecodeToSurface thread
|
||||
slot.frame_ready.notify_one();
|
||||
|
||||
OutputDebugStringA("[Polling] Slot ready\n");
|
||||
if (result != CUDA_SUCCESS ||
|
||||
decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
|
||||
all_complete = false;
|
||||
break;
|
||||
}
|
||||
else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error) {
|
||||
// Decode error - mark as ready to unblock
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true; // Error also counts as "ready"
|
||||
}
|
||||
slot.frame_ready.notify_one();
|
||||
}
|
||||
|
||||
OutputDebugStringA("[Polling] Decode error\n");
|
||||
// 5. If all frames complete, signal ready
|
||||
if (all_complete && !picture_indices_copy.empty()) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true;
|
||||
}
|
||||
// cuvidDecodeStatus_InProgress → keep polling
|
||||
slot.frame_ready.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Sleep to avoid busy-wait
|
||||
// 6. Sleep to avoid busy-wait
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(100));
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Key Points**:
|
||||
- ✅ Only polls `m_returnIndex` slot (not all 8 slots) → efficient
|
||||
- ✅ Only polls `m_returnIdCounter` slot (not all 8 slots) → efficient
|
||||
- ✅ **Multi-frame support**: Checks ALL picture_indices for completion
|
||||
- ✅ Uses `cuvidGetDecodeStatus()` non-blocking query
|
||||
- ✅ 100us sleep → ~10,000 checks/second (low CPU usage)
|
||||
- ✅ Handles decode errors gracefully
|
||||
- ✅ Thread-safe picture_indices copy to avoid lock contention
|
||||
|
||||
---
|
||||
|
||||
### Component 4: Sequential Return Wait
|
||||
### Component 4: Sequential Return Wait (FIFO Guarantee)
|
||||
|
||||
**Purpose**: Enforce FIFO order even when decodes complete out-of-order
|
||||
**Purpose**: Enforce FIFO order even when decodes complete out-of-order using Option C
|
||||
|
||||
```cpp
|
||||
// In DecodeToSurface() - PHASE 2
|
||||
|
||||
// 5. Wait for my turn (sequential return order)
|
||||
while (m_returnIndex.load() != my_slot_idx) {
|
||||
// 5. Wait for my turn (FIFO order enforcement)
|
||||
while (m_returnIdCounter.load() != my_id) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
@@ -366,44 +404,47 @@ while (m_returnIndex.load() != my_slot_idx) {
|
||||
// Timeout - decode took too long
|
||||
LogError("Decode timeout");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock
|
||||
m_returnIdCounter.fetch_add(1); // Skip this slot to avoid deadlock
|
||||
return false;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Timeline Example**:
|
||||
**🎯 Option C Timeline Example** (using slot ID, not slot index):
|
||||
```
|
||||
Thread 1 (slot 0): Wait for returnIndex==0 ✅ (immediate)
|
||||
Wait for is_ready...
|
||||
Thread 1 (ID=17, slot 1): Wait for returnIdCounter==17 ✅ (immediate)
|
||||
Wait for is_ready...
|
||||
|
||||
Thread 2 (slot 1): Wait for returnIndex==1 ⏸️ (blocked)
|
||||
Thread 2 (ID=18, slot 2): Wait for returnIdCounter==18 ⏸️ (blocked)
|
||||
|
||||
Thread 3 (slot 2): Wait for returnIndex==2 ⏸️ (blocked)
|
||||
Thread 3 (ID=19, slot 3): Wait for returnIdCounter==19 ⏸️ (blocked)
|
||||
|
||||
GPU: packet2 completes first @ t=3ms
|
||||
GPU: ID=18 completes first @ t=3ms
|
||||
→ slot[2].is_ready = true
|
||||
→ Thread 2 still blocked (returnIdCounter=17)
|
||||
|
||||
GPU: ID=17 completes @ t=15ms
|
||||
→ slot[1].is_ready = true
|
||||
→ Thread 2 still blocked (returnIndex=0)
|
||||
|
||||
GPU: packet1 completes @ t=15ms
|
||||
→ slot[0].is_ready = true
|
||||
→ Thread 1 wakes up ✅
|
||||
→ Thread 1 processes → returnIndex = 1
|
||||
→ Thread 1 processes → returnIdCounter = 18
|
||||
→ Thread 2 now unblocked ✅
|
||||
```
|
||||
|
||||
**Key Point**: Wait on **slot_id** (my_id), not slot_idx, for correct FIFO ordering
|
||||
|
||||
---
|
||||
|
||||
### Component 5: Frame Retrieval & Cleanup
|
||||
### Component 5: Frame Retrieval & Cleanup (Multi-Frame Support)
|
||||
|
||||
**Purpose**: Map decoded frame, copy to surface, release slot
|
||||
|
||||
```cpp
|
||||
// In DecodeToSurface() - PHASE 3
|
||||
|
||||
int frameIdx = my_slot.picture_index;
|
||||
// 7. Get first frame from multi-frame packet
|
||||
int frameIdx = my_slot.picture_indices[0]; // Return first frame only
|
||||
|
||||
// 7. Map decoded frame from NVDEC
|
||||
// 8. Map decoded frame from NVDEC
|
||||
CUVIDPROCPARAMS procParams = {};
|
||||
procParams.progressive_frame = 1;
|
||||
|
||||
@@ -413,148 +454,178 @@ unsigned int srcPitch = 0;
|
||||
CUresult result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1);
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
// 8. Copy to D3D12 surface
|
||||
// 9. Copy to D3D12 surface
|
||||
ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
|
||||
bool copySuccess = m_d3d12Handler->CopyNV12Frame(
|
||||
srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height
|
||||
);
|
||||
|
||||
// 9. Unmap frame
|
||||
// 10. Unmap frame
|
||||
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
|
||||
|
||||
// 10. Release slot
|
||||
// 11. Release slot
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
|
||||
my_slot.in_use = false;
|
||||
}
|
||||
|
||||
// 11. Advance return index
|
||||
m_returnIndex.fetch_add(1);
|
||||
// 12. Advance return ID counter (FIFO order)
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
|
||||
return copySuccess;
|
||||
```
|
||||
|
||||
**Multi-Frame Decision**: Return **first frame only** from multi-frame packet (indices[0])
|
||||
|
||||
---
|
||||
|
||||
### Component 6: NVDEC Callback Integration
|
||||
### Component 6: NVDEC Callback Integration (Option C)
|
||||
|
||||
**Purpose**: Link NVDEC picture_index to RingBuffer slot
|
||||
**Purpose**: Link NVDEC picture_index to RingBuffer slot using direct slot_id calculation
|
||||
|
||||
```cpp
|
||||
int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDISPINFO* disp_info) {
|
||||
int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) {
|
||||
auto* decoder = static_cast<NVDECAV1Decoder*>(user_data);
|
||||
|
||||
// Extract slot index from timestamp
|
||||
size_t slot_idx = static_cast<size_t>(disp_info->timestamp) % RING_BUFFER_SIZE;
|
||||
// 🎯 Option C: Direct slot_id → slot_idx calculation (no map lookup!)
|
||||
uint64_t slot_id = static_cast<uint64_t>(pic_params->nTimeStamp);
|
||||
size_t slot_idx = slot_id % RING_BUFFER_SIZE;
|
||||
|
||||
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
|
||||
|
||||
// Store NVDEC picture index
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.picture_index = disp_info->picture_index;
|
||||
// Submit frame to NVDEC decoder
|
||||
CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
decoder->LogCUDAError(result, "cuvidDecodePicture failed");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Polling thread will check cuvidGetDecodeStatus() for this picture_index
|
||||
// Store picture_index for polling (multi-frame support)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.picture_indices.push_back(pic_params->CurrPicIdx);
|
||||
}
|
||||
|
||||
// Polling thread will check cuvidGetDecodeStatus() for ALL picture_indices
|
||||
|
||||
return 1;
|
||||
}
|
||||
```
|
||||
|
||||
**🎯 Option C Key Advantages**:
|
||||
- ✅ **No mapping overhead**: Direct modulo calculation, no unordered_map lookup
|
||||
- ✅ **No mutex contention**: No global map mutex needed
|
||||
- ✅ **Multi-frame support**: Automatically handles multiple frames per packet
|
||||
- ✅ **Deterministic**: Same slot_id always maps to same slot_idx
|
||||
|
||||
---
|
||||
|
||||
## 📐 Implementation Plan
|
||||
## 📐 Implementation Plan (Updated for Option C)
|
||||
|
||||
### Phase 1: Data Structure Setup ✅
|
||||
|
||||
**Files to Modify**:
|
||||
- `NVDECAV1Decoder.h` - Add RingBuffer members
|
||||
- `NVDECAV1Decoder.h` - Add RingBuffer members with Option C design
|
||||
- `NVDECAV1Decoder.cpp` - Initialize RingBuffer in constructor
|
||||
|
||||
**Tasks**:
|
||||
- [x] Define `DecodeSlot` structure
|
||||
- [x] Define `DecodeSlot` structure with `std::vector<int> picture_indices`
|
||||
- [x] Add `m_ringBuffer[8]` array
|
||||
- [x] Add `m_submitIndex`, `m_returnIndex` atomic counters
|
||||
- [x] **🎯 Option C**: Add `m_slotIdCounter`, `m_returnIdCounter` atomic counters (NOT submitIndex/returnIndex)
|
||||
- [x] Add `m_pollingThread`, `m_pollingRunning` members
|
||||
|
||||
**Estimated Time**: 30 minutes
|
||||
|
||||
---
|
||||
|
||||
### Phase 2: Polling Thread Implementation
|
||||
### Phase 2: Polling Thread Implementation (Multi-Frame Support)
|
||||
|
||||
**Files to Modify**:
|
||||
- `NVDECAV1Decoder.cpp` - Implement `PollingThreadFunc()`
|
||||
|
||||
**Tasks**:
|
||||
- [ ] Implement polling loop with `cuvidGetDecodeStatus()`
|
||||
- [ ] Implement polling loop with `cuvidGetDecodeStatus()` checking ALL picture_indices
|
||||
- [ ] Poll slot at `m_returnIdCounter % RING_BUFFER_SIZE` (NOT all slots)
|
||||
- [ ] Add thread start in `Initialize()`
|
||||
- [ ] Add thread stop in `Cleanup()`
|
||||
- [ ] Add debug logging for polling events
|
||||
- [ ] Add debug logging for multi-frame packet events
|
||||
|
||||
**Testing**:
|
||||
- Verify thread starts/stops correctly
|
||||
- Verify multi-frame packet handling (all frames checked)
|
||||
- Verify `cuvidGetDecodeStatus()` calls work
|
||||
|
||||
**Estimated Time**: 1 hour
|
||||
|
||||
---
|
||||
|
||||
### Phase 3: DecodeToSurface Refactoring
|
||||
### Phase 3: DecodeToSurface Refactoring (Option C Implementation)
|
||||
|
||||
**Files to Modify**:
|
||||
- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()`
|
||||
- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with Option C design
|
||||
|
||||
**Tasks**:
|
||||
- [ ] Phase 1: Slot allocation logic
|
||||
- [ ] Phase 2: Sequential return wait logic
|
||||
- [ ] Phase 3: Frame retrieval & cleanup logic
|
||||
- [ ] **🎯 Phase 1**: Slot allocation using `my_id = m_slotIdCounter.fetch_add(1)`
|
||||
- [ ] **🎯 Phase 2**: Packet submission with `packet.timestamp = my_id` (full slot_id, NOT modulo!)
|
||||
- [ ] **🎯 Phase 3**: FIFO wait using `while(m_returnIdCounter != my_id)` (NOT slot_idx!)
|
||||
- [ ] **🎯 Phase 4**: Frame retrieval from `picture_indices[0]` (first frame only)
|
||||
- [ ] **🎯 Phase 5**: Cleanup and `m_returnIdCounter.fetch_add(1)`
|
||||
- [ ] Error handling for all failure paths
|
||||
|
||||
**Testing**:
|
||||
- Single-threaded decode test
|
||||
- Multi-threaded decode test (2-3 threads)
|
||||
- Verify packet-surface mapping correctness
|
||||
- Verify packet-surface mapping correctness with multi-frame packets
|
||||
- Verify FIFO ordering with out-of-order completion
|
||||
|
||||
**Estimated Time**: 2 hours
|
||||
|
||||
---
|
||||
|
||||
### Phase 4: HandlePictureDisplay Update
|
||||
### Phase 4: HandlePictureDecode Update (Option C)
|
||||
|
||||
**Files to Modify**:
|
||||
- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDisplay()`
|
||||
- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDecode()` callback
|
||||
|
||||
**Tasks**:
|
||||
- [ ] Extract slot_idx from timestamp
|
||||
- [ ] Store picture_index in correct slot
|
||||
- [ ] Add debug logging
|
||||
- [ ] **🎯 Option C**: Extract `slot_id` from `pic_params->nTimeStamp`
|
||||
- [ ] **🎯 Option C**: Calculate `slot_idx = slot_id % RING_BUFFER_SIZE` (NO map lookup!)
|
||||
- [ ] **🎯 Multi-frame**: Use `slot.picture_indices.push_back(CurrPicIdx)` (NOT single index!)
|
||||
- [ ] Add debug logging for multi-frame packets
|
||||
|
||||
**Testing**:
|
||||
- Verify timestamp → slot_idx mapping
|
||||
- Verify picture_index stored correctly
|
||||
- Verify timestamp → slot_idx direct calculation works
|
||||
- Verify picture_indices vector correctly stores multiple frames
|
||||
- Test with video that has multi-frame packets (test_720p_stripe.webm)
|
||||
|
||||
**Estimated Time**: 30 minutes
|
||||
|
||||
---
|
||||
|
||||
### Phase 5: Integration Testing
|
||||
### Phase 5: Integration Testing (Option C Validation)
|
||||
|
||||
**Test Scenarios**:
|
||||
1. **Single packet decode** - Verify basic functionality
|
||||
2. **Sequential 3 packets** - Verify FIFO order
|
||||
3. **Out-of-order completion** - Verify correct mapping (I-frame after P-frame)
|
||||
4. **RingBuffer overflow** - Verify error handling (9+ concurrent calls)
|
||||
5. **Decode errors** - Verify graceful failure
|
||||
6. **Performance benchmark** - Measure latency reduction
|
||||
1. **Single packet decode** - Verify Option C basic functionality
|
||||
2. **Multi-frame packet** - Verify vector-based picture_indices handling (test_720p_stripe.webm)
|
||||
3. **Sequential 3 packets** - Verify FIFO order using m_returnIdCounter
|
||||
4. **Out-of-order completion** - Verify slot_id → slot_idx mapping (I-frame after P-frame)
|
||||
5. **RingBuffer overflow** - Verify error handling (9+ concurrent calls)
|
||||
6. **Decode errors** - Verify graceful failure with multi-frame packets
|
||||
7. **Performance benchmark** - Measure latency reduction vs old queue-based approach
|
||||
|
||||
**Test Files**:
|
||||
- Simple test video (simple_test.webm)
|
||||
- Complex GOP structure video (test_720p_stripe.webm)
|
||||
- Simple test video (simple_test.webm) - basic validation
|
||||
- **Multi-frame packet video (test_720p_stripe.webm)** - critical multi-frame test ⚠️
|
||||
|
||||
**Validation Criteria**:
|
||||
- ✅ No slot ID → slot_idx mapping errors
|
||||
- ✅ All frames from multi-frame packets detected and polled
|
||||
- ✅ FIFO order maintained even with out-of-order GPU completion
|
||||
- ✅ No memory corruption or race conditions
|
||||
|
||||
**Estimated Time**: 2 hours
|
||||
|
||||
@@ -563,15 +634,63 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
|
||||
### Phase 6: Documentation & Cleanup
|
||||
|
||||
**Tasks**:
|
||||
- [ ] Update NVDEC design documentation
|
||||
- [ ] Add inline code comments
|
||||
- [ ] Remove old queue-based code
|
||||
- [ ] Move design doc to `docs/completed/`
|
||||
- [x] Update NVDEC design documentation with Option C and multi-frame support
|
||||
- [ ] Add inline code comments explaining Option C design choices
|
||||
- [ ] Remove old queue-based code and any ID→Index mapping attempts
|
||||
- [ ] Move design doc to `docs/completed/` after successful implementation
|
||||
- [ ] Document multi-frame packet behavior and first-frame-only decision
|
||||
|
||||
**Estimated Time**: 1 hour
|
||||
|
||||
---
|
||||
|
||||
## 🎯 Option C Design Summary
|
||||
|
||||
### **Core Principle**: Eliminate mapping overhead through deterministic calculation
|
||||
|
||||
**Key Components**:
|
||||
1. **Single Counter for Dual Purpose**: `m_slotIdCounter` serves as both unique ID and submission order
|
||||
2. **Direct Slot Calculation**: `slot_idx = slot_id % RING_BUFFER_SIZE` (no map needed)
|
||||
3. **FIFO via ID Comparison**: `while(m_returnIdCounter != my_id)` ensures ordering
|
||||
4. **Multi-Frame Vector**: `std::vector<int> picture_indices` handles packets with multiple frames
|
||||
|
||||
**Data Flow**:
|
||||
```
|
||||
DecodeToSurface:
|
||||
my_id = m_slotIdCounter++ (e.g., 17)
|
||||
slot_idx = 17 % 8 = 1
|
||||
packet.timestamp = 17
|
||||
|
||||
HandlePictureDecode:
|
||||
slot_id = pic_params->nTimeStamp (17)
|
||||
slot_idx = 17 % 8 = 1
|
||||
m_ringBuffer[1].picture_indices.push_back(CurrPicIdx)
|
||||
|
||||
PollingThread:
|
||||
return_id = m_returnIdCounter (17)
|
||||
slot_idx = 17 % 8 = 1
|
||||
Check all m_ringBuffer[1].picture_indices[]
|
||||
|
||||
DecodeToSurface (wait):
|
||||
while(m_returnIdCounter != 17) { wait }
|
||||
Process frame from picture_indices[0]
|
||||
m_returnIdCounter++ (18)
|
||||
```
|
||||
|
||||
**Eliminated Complexity**:
|
||||
- ❌ No `std::unordered_map<uint64_t, size_t>` mapping
|
||||
- ❌ No global map mutex
|
||||
- ❌ No map insert/erase operations
|
||||
- ❌ No lookup failures or stale entries
|
||||
|
||||
**Multi-Frame Packet Handling**:
|
||||
- One packet → multiple HandlePictureDecode calls
|
||||
- All frames stored in `picture_indices` vector
|
||||
- PollingThread checks ALL frames complete
|
||||
- Return first frame only (`picture_indices[0]`)
|
||||
|
||||
---
|
||||
|
||||
## 📊 Performance Analysis
|
||||
|
||||
### Expected Improvements
|
||||
|
||||
@@ -879,8 +879,8 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
|
||||
}
|
||||
|
||||
char debug_buf[512];
|
||||
sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d, ref_pic_flag=%d)\n",
|
||||
decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag, pic_params->ref_pic_flag);
|
||||
sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d)\n",
|
||||
decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
|
||||
@@ -893,21 +893,7 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Store picture_index in ring buffer slot for polling thread
|
||||
// The slot index was embedded in the timestamp by DecodeToSurface
|
||||
// Note: pic_params doesn't have timestamp, so we use CurrPicIdx as picture_index
|
||||
// and let the polling thread find the right slot
|
||||
int picture_index = pic_params->CurrPicIdx;
|
||||
size_t submit_idx = decoder->m_submitIndex.load() - 1; // Last submitted slot
|
||||
DecodeSlot& slot = decoder->m_ringBuffer[submit_idx % decoder->RING_BUFFER_SIZE];
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.picture_index = picture_index;
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[HandlePictureDecode] Success - stored picture_index=%d in slot %zu\n",
|
||||
picture_index, submit_idx % decoder->RING_BUFFER_SIZE);
|
||||
sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture succeeded (CurrPicIdx=%d)\n", pic_params->CurrPicIdx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
|
||||
@@ -921,34 +907,45 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Option C: Extract slot_id from timestamp and calculate slot_idx directly (NO map lookup!)
|
||||
uint64_t slot_id = static_cast<uint64_t>(disp_info->timestamp);
|
||||
size_t slot_idx = slot_id % decoder->RING_BUFFER_SIZE;
|
||||
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
|
||||
|
||||
char debug_buf[256];
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Frame decoded: picture_index=%d, timestamp=%lld\n",
|
||||
disp_info->picture_index, disp_info->timestamp);
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: slot_id=%llu, slot_idx=%zu, picture_index=%d\n",
|
||||
slot_id, slot_idx, disp_info->picture_index);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
|
||||
// ===== Phase 4: RingBuffer Integration =====
|
||||
// Extract slot index from timestamp (set in DecodeToSurface)
|
||||
size_t slot_idx = static_cast<size_t>(disp_info->timestamp);
|
||||
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx % decoder->RING_BUFFER_SIZE];
|
||||
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Updating RingBuffer slot %zu with picture_index=%d\n",
|
||||
slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// IMPORTANT: pfnDisplayPicture is called AFTER GPU decoding completes
|
||||
// So we can directly mark the frame as ready without polling
|
||||
// Option C Multi-Frame Support: Store picture_index in vector (one packet can have multiple frames)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.picture_index = disp_info->picture_index;
|
||||
slot.is_ready = true; // Frame is already decoded and ready
|
||||
slot.picture_indices.push_back(disp_info->picture_index);
|
||||
}
|
||||
|
||||
// Signal waiting thread that frame is ready
|
||||
slot.frame_ready.notify_one();
|
||||
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Slot %zu marked ready (picture_index=%d)\\n",
|
||||
slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index);
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Pushed picture_index=%d to slot %zu (total frames: %zu)\n",
|
||||
disp_info->picture_index, slot_idx, slot.picture_indices.size());
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
|
||||
// IMPORTANT: HandlePictureDisplay is called AFTER decoding completes
|
||||
// So we can mark the slot as ready immediately (no polling needed for this callback)
|
||||
bool should_notify = false;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
if (!slot.is_ready) {
|
||||
slot.is_ready = true;
|
||||
should_notify = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_notify) {
|
||||
slot.frame_ready.notify_one();
|
||||
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Slot %zu marked ready and notified\n", slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
@@ -1151,12 +1148,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
try {
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// ===== Phase 3: RingBuffer Slot Allocation =====
|
||||
// 1. Allocate next available slot (producer)
|
||||
size_t my_slot_idx = m_submitIndex.fetch_add(1);
|
||||
DecodeSlot& my_slot = m_ringBuffer[my_slot_idx % RING_BUFFER_SIZE];
|
||||
// ===== Phase 1: Option C Slot Allocation =====
|
||||
// 1. Allocate unique slot ID (serves as both ID and submission order)
|
||||
uint64_t my_id = m_slotIdCounter.fetch_add(1);
|
||||
size_t slot_idx = my_id % RING_BUFFER_SIZE;
|
||||
DecodeSlot& my_slot = m_ringBuffer[slot_idx];
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Allocating slot %zu (submitIndex=%zu)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot_idx);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Allocated slot_id=%llu, slot_idx=%zu\n", my_id, slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// 2. Check for overflow (RingBuffer full)
|
||||
@@ -1165,17 +1163,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
|
||||
if (my_slot.in_use) {
|
||||
// RingBuffer overflow - too many concurrent decodes
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", my_slot_idx % RING_BUFFER_SIZE);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
LogError("RingBuffer overflow - max 8 concurrent decodes");
|
||||
return false;
|
||||
}
|
||||
|
||||
// 3. Initialize slot
|
||||
// 3. Initialize slot (multi-frame support)
|
||||
my_slot.in_use = true;
|
||||
my_slot.target_surface = target_surface;
|
||||
my_slot.surface_type = target_type;
|
||||
my_slot.picture_index = -1; // Will be set by HandlePictureDisplay
|
||||
my_slot.picture_indices.clear(); // Clear previous frames
|
||||
my_slot.is_ready = false;
|
||||
}
|
||||
|
||||
@@ -1191,14 +1189,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
printf("%s", debug_buf);
|
||||
}
|
||||
|
||||
// 5. Submit packet to NVDEC parser with slot index in timestamp
|
||||
// 5. Option C: Submit packet with full slot_id in timestamp (NOT modulo!)
|
||||
CUVIDSOURCEDATAPACKET packet = {};
|
||||
packet.payload = final_packet_data;
|
||||
packet.payload_size = static_cast<unsigned long>(final_packet_size);
|
||||
packet.flags = CUVID_PKT_ENDOFPICTURE;
|
||||
packet.timestamp = static_cast<int64_t>(my_slot_idx); // Embed slot index in timestamp
|
||||
packet.timestamp = static_cast<int64_t>(my_id); // Pass full slot_id for HandlePictureDecode
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot %zu)...\n", packet.timestamp, my_slot_idx % RING_BUFFER_SIZE);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot_id=%llu, slot_idx=%zu)...\n", packet.timestamp, my_id, slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
CUresult result = cuvidParseVideoData(m_parser, &packet);
|
||||
@@ -1217,49 +1215,60 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
}
|
||||
OutputDebugStringA("[DecodeToSurface] cuvidParseVideoData succeeded\n");
|
||||
|
||||
// ===== Component 4: Sequential Return Wait =====
|
||||
// 5. Wait for my turn (enforce FIFO order)
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for slot %zu to be return head (current returnIndex=%zu)...\n",
|
||||
my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load());
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
while (m_returnIndex.load() != my_slot_idx) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu is now return head\n", my_slot_idx % RING_BUFFER_SIZE);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// 6. Wait for decode to complete (polling thread signals this)
|
||||
// ===== Component 4: Wait for decode to complete FIRST =====
|
||||
// 6. Wait for decode to complete (HandlePictureDisplay signals this)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot %zu, is_ready=%d)...\n",
|
||||
my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot_idx=%zu, is_ready=%d)...\n",
|
||||
slot_idx, my_slot.is_ready);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
|
||||
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(5000),
|
||||
[&my_slot]() { return my_slot.is_ready; })) {
|
||||
// Timeout - decode took too long
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot %zu (is_ready=%d)\n",
|
||||
my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot_idx=%zu (is_ready=%d)\n",
|
||||
slot_idx, my_slot.is_ready);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
LogError("Decode timeout");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock
|
||||
m_returnIdCounter.fetch_add(1); // Option C: Advance to unblock next thread
|
||||
return false;
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot %zu)\n", my_slot_idx % RING_BUFFER_SIZE);
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot_idx=%zu)\n", slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu decode complete (picture_index=%d)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot.picture_index);
|
||||
// ===== Component 5: Option C Sequential Return Wait (FIFO Guarantee) =====
|
||||
// 5. Wait for my turn using slot_id (NOT slot_idx!) AFTER decoding completes
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Waiting for FIFO turn slot_id=%llu (current returnIdCounter=%llu)...\n",
|
||||
my_id, m_returnIdCounter.load());
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// ===== Component 5: Frame Retrieval & Cleanup =====
|
||||
int frameIdx = my_slot.picture_index;
|
||||
while (m_returnIdCounter.load() != my_id) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1));
|
||||
}
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Option C: slot_id=%llu is now return head\n", my_id);
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// ===== Component 5: Option C Frame Retrieval (Multi-Frame Support) =====
|
||||
// Get first frame from multi-frame packet (picture_indices[0])
|
||||
if (my_slot.picture_indices.empty()) {
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: No picture_indices in slot %zu\n", slot_idx);
|
||||
OutputDebugStringA(debug_buf);
|
||||
LogError("No frames decoded");
|
||||
my_slot.in_use = false;
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
int frameIdx = my_slot.picture_indices[0]; // Return first frame only
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu decode complete, using first frame (picture_index=%d, total_frames=%zu)\n",
|
||||
slot_idx, frameIdx, my_slot.picture_indices.size());
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// For CUDA device surface, we need to map the decoded frame
|
||||
if (target_type == VAVCORE_SURFACE_CUDA_DEVICE) {
|
||||
@@ -1275,7 +1284,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LogCUDAError(result, "cuvidMapVideoFrame");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1);
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1301,7 +1310,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
OutputDebugStringA("[DecodeToSurface] ERROR: target_surface or m_d3d12Device is null\n");
|
||||
LogError("D3D12 resource or device not available");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1);
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1340,7 +1349,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
OutputDebugStringA(debug_buf);
|
||||
LogCUDAError(result, "cuvidMapVideoFrame");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1);
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1367,7 +1376,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
if (!copySuccess) {
|
||||
OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n");
|
||||
my_slot.in_use = false;
|
||||
m_returnIndex.fetch_add(1);
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1391,10 +1400,10 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
|
||||
my_slot.in_use = false;
|
||||
}
|
||||
|
||||
// 11. Advance return index
|
||||
m_returnIndex.fetch_add(1);
|
||||
// 11. Option C: Advance return ID counter (FIFO order)
|
||||
m_returnIdCounter.fetch_add(1);
|
||||
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu released, returnIndex advanced to %zu\n", my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load());
|
||||
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu released, returnIdCounter advanced to %llu\n", slot_idx, m_returnIdCounter.load());
|
||||
OutputDebugStringA(debug_buf);
|
||||
|
||||
// Update statistics
|
||||
@@ -1455,63 +1464,53 @@ void NVDECAV1Decoder::PollingThreadFunc() {
|
||||
|
||||
int poll_count = 0;
|
||||
while (m_pollingRunning) {
|
||||
// Get current return slot (oldest pending decode)
|
||||
size_t current_return_idx = m_returnIndex.load();
|
||||
size_t slot_idx = current_return_idx % RING_BUFFER_SIZE;
|
||||
// Option C: Get current return ID and calculate slot index
|
||||
uint64_t current_return_id = m_returnIdCounter.load();
|
||||
size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
|
||||
DecodeSlot& slot = m_ringBuffer[slot_idx];
|
||||
|
||||
// Debug logging every 100 iterations
|
||||
if (poll_count % 100 == 0) {
|
||||
char debug_buf[256];
|
||||
sprintf_s(debug_buf, "[PollingThread] Poll #%d: returnIdx=%zu, slot=%zu, in_use=%d, is_ready=%d, picture_index=%d\n",
|
||||
poll_count, current_return_idx, slot_idx, slot.in_use, slot.is_ready, slot.picture_index);
|
||||
sprintf_s(debug_buf, "[PollingThread] Option C: Poll #%d: returnId=%llu, slot=%zu, in_use=%d, is_ready=%d, frames=%zu\n",
|
||||
poll_count, current_return_id, slot_idx, slot.in_use, slot.is_ready, slot.picture_indices.size());
|
||||
OutputDebugStringA(debug_buf);
|
||||
}
|
||||
poll_count++;
|
||||
|
||||
// Check if slot is in use and not yet ready
|
||||
if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) {
|
||||
// Query NVDEC for decode status
|
||||
CUVIDGETDECODESTATUS decodeStatus = {};
|
||||
CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
|
||||
|
||||
if (result == CUDA_SUCCESS) {
|
||||
if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
|
||||
// Decode complete!
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true;
|
||||
}
|
||||
slot.frame_ready.notify_one();
|
||||
|
||||
char debug_buf[128];
|
||||
sprintf_s(debug_buf, "[PollingThread] Slot %zu decode complete (picture_index=%d)\n",
|
||||
slot_idx, slot.picture_index);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
}
|
||||
else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error ||
|
||||
decodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed) {
|
||||
// Decode error - mark as ready to unblock
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true;
|
||||
}
|
||||
slot.frame_ready.notify_one();
|
||||
|
||||
char debug_buf[128];
|
||||
sprintf_s(debug_buf, "[PollingThread] Slot %zu decode error (status=%d)\n",
|
||||
slot_idx, decodeStatus.decodeStatus);
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
}
|
||||
// else: decodeStatus.decodeStatus == cuvidDecodeStatus_InProgress, continue polling
|
||||
if (slot.in_use && !slot.is_ready) {
|
||||
// Option C Multi-Frame Support: Get copy of picture indices
|
||||
std::vector<int> picture_indices_copy;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
picture_indices_copy = slot.picture_indices;
|
||||
}
|
||||
else {
|
||||
// cuvidGetDecodeStatus failed
|
||||
char debug_buf[128];
|
||||
sprintf_s(debug_buf, "[PollingThread] cuvidGetDecodeStatus failed for slot %zu (error=%d)\n",
|
||||
slot_idx, result);
|
||||
|
||||
// Check if ALL frames are decoded
|
||||
bool all_complete = true;
|
||||
for (int pic_idx : picture_indices_copy) {
|
||||
CUVIDGETDECODESTATUS decodeStatus = {};
|
||||
CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
|
||||
|
||||
if (result != CUDA_SUCCESS ||
|
||||
decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
|
||||
all_complete = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If all frames complete, signal ready
|
||||
if (all_complete && !picture_indices_copy.empty()) {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(slot.slot_mutex);
|
||||
slot.is_ready = true;
|
||||
}
|
||||
slot.frame_ready.notify_one();
|
||||
|
||||
char debug_buf[256];
|
||||
sprintf_s(debug_buf, "[PollingThread] Option C: Slot %zu ALL frames complete (%zu frames)\n",
|
||||
slot_idx, picture_indices_copy.size());
|
||||
OutputDebugStringA(debug_buf);
|
||||
printf("%s", debug_buf);
|
||||
}
|
||||
|
||||
@@ -152,20 +152,21 @@ private:
|
||||
void* target_surface = nullptr; // Destination D3D12 resource
|
||||
VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
|
||||
|
||||
// NVDEC information (from HandlePictureDisplay callback)
|
||||
int picture_index = -1; // NVDEC frame index for cuvidMapVideoFrame
|
||||
// NVDEC information (from HandlePictureDecode callback)
|
||||
// Multi-frame support: One packet can decode to multiple frames
|
||||
std::vector<int> picture_indices; // All NVDEC frame indices from this packet
|
||||
|
||||
// Synchronization primitives
|
||||
std::condition_variable frame_ready; // Signaled when decode complete
|
||||
std::condition_variable frame_ready; // Signaled when ALL frames are decoded
|
||||
std::mutex slot_mutex; // Protects this slot's state
|
||||
bool is_ready = false; // Decode completed flag
|
||||
bool is_ready = false; // All frames decoded flag
|
||||
};
|
||||
|
||||
DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
|
||||
|
||||
// Producer-consumer indices
|
||||
std::atomic<size_t> m_submitIndex{0}; // Next slot to allocate (producer)
|
||||
std::atomic<size_t> m_returnIndex{0}; // Next slot to return (consumer)
|
||||
// Option C: Unified slot allocation counters (no mapping needed!)
|
||||
std::atomic<uint64_t> m_slotIdCounter{0}; // Monotonically increasing slot ID (submission order)
|
||||
std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
|
||||
|
||||
// Polling thread for cuvidGetDecodeStatus
|
||||
std::thread m_pollingThread;
|
||||
|
||||
Reference in New Issue
Block a user