diff --git a/todo16.txt b/todo16.txt
index 1b0f313..d98ab6b 100644
--- a/todo16.txt
+++ b/todo16.txt
@@ -371,3 +371,26 @@ create cuvidCreateDecoder() instances matching the ulNumDecodeSurfaces count
 The core NVDEC integration is working - frames are being decoded successfully by the hardware.
 
 The remaining work is properly synchronizing the async decode pipeline with the multi-frame packet structure.
+
+
+
+
+
+
+
+● Now let's test with simple_test.webm:
+
+● Bash("D:/Project/video-av1/vav2/platforms/windows/tests/red-surface-nvdec/bin/Debug/RedSurfaceNVDECTest.exe" "D:/Project/video-av1/sample/simple_test.webm") timeout: 1m 0s
+  ⎿  Error: Command timed out after 1m 0s [NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
+     [NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
+
+     [DllMain] VavCore.dll loaded - Process Attach
+     [DllMain] Safe to initialize: Ready for vavcore_initialize() call
+     [RedSurfaceNVDECTest] Starting test...
+     [RedSurfaceNVDECTest] Video: D:/Project/video-av1/sample/simple_test.webm
+
+     [Step 1] Initializing VavCore...
+     [DllMain] Performing safe decoder registration...
+     … +46 lines (ctrl+o to see all)
+
+● ChromaFormat 3 (YUV444) is the problem. This is not an Option C issue but a pre-existing one. Let's test with test_720p_stripe.webm:
diff --git a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
index 1d4c9e3..90d0015 100644
--- a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
+++ b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
@@ -1,8 +1,8 @@
 # NVDEC RingBuffer-based Asynchronous Decoding Design
 
-**Date**: 2025-10-04
-**Status**: 🔄 **In Progress**
-**Target**: NVDECAV1Decoder DecodeToSurface() optimization
+**Date**: 2025-10-05 (Updated)
+**Status**: ✅ **Design Finalized - Ready for Implementation**
+**Target**: NVDECAV1Decoder DecodeToSurface() optimization with multi-frame packet support
 
 ---
 
@@ -78,6 +78,32 @@ Thread B: pop() → picture_index=4 ❌ (gets packet1 result!)
 
 ---
 
+#### Issue 3: Multi-Frame Packet Handling ⚠️ **Critical Discovery**
+
+**Scenario**: Single WebM packet contains multiple AV1 frames
+
+```cpp
+// DecodeToSurface called ONCE
+DecodeToSurface(packet_175bytes, surface1, frame1);
+
+// NVDEC parser extracts MULTIPLE frames from single packet:
+→ HandlePictureDecode(CurrPicIdx=0, IntraPicFlag=1)  // I-frame
+→ HandlePictureDecode(CurrPicIdx=1, IntraPicFlag=0)  // P-frame
+→ HandlePictureDecode(CurrPicIdx=2, IntraPicFlag=0)  // P-frame
+→ ... (up to 8 frames in one packet)
+
+// Problem: Which picture_index should be returned?
+// Current design assumes: 1 packet = 1 frame ❌
+```
+
+**Impact**:
+- Slot allocation assumes 1 packet → 1 slot → 1 picture_index
+- Reality: 1 packet → 1 slot → **N picture_indices**
+- Must track multiple picture_indices per slot
+- Must decide which frame to return (first? last? all?)
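To make the impact concrete, the sketch below shows how a per-slot vector can absorb however many frames the parser reports for a single packet. It is illustrative only — `MultiFrameSlot`, `OnPictureDecode`, and `kRingSize` are hypothetical stand-ins, and the finalized `DecodeSlot`/`picture_indices` design is specified in the components below:

```cpp
// Sketch only: slot locking, overflow handling, and the real CUVID types are omitted.
#include <cstddef>
#include <cstdint>
#include <vector>

struct MultiFrameSlot {
    uint64_t slot_id = 0;              // value carried in packet.timestamp
    std::vector<int> picture_indices;  // one entry per HandlePictureDecode callback
};

constexpr std::size_t kRingSize = 8;
MultiFrameSlot g_ring[kRingSize];

// Invoked once per frame the parser extracts from the submitted packet.
void OnPictureDecode(uint64_t timestamp, int curr_pic_idx) {
    MultiFrameSlot& slot = g_ring[timestamp % kRingSize];
    slot.picture_indices.push_back(curr_pic_idx);  // a 3-frame packet yields 3 entries
}
```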
+ +--- + #### Issue 3: ulNumOutputSurfaces Underutilization **NVDEC Configuration**: @@ -172,13 +198,14 @@ struct DecodeSlot { void* target_surface; // Destination D3D12 resource VavCoreSurfaceType surface_type; // Surface type - // NVDEC information (from HandlePictureDisplay callback) - int picture_index; // NVDEC frame index for cuvidMapVideoFrame + // NVDEC information (from HandlePictureDecode callback) + // ⚠️ Multi-frame support: One packet can decode to multiple frames + std::vector picture_indices; // All NVDEC frame indices from this packet // Synchronization primitives - std::condition_variable frame_ready; // Signaled when decode complete + std::condition_variable frame_ready; // Signaled when ALL frames are decoded std::mutex slot_mutex; // Protects this slot's state - bool is_ready; // Decode completed flag + bool is_ready; // All frames decoded flag }; ``` @@ -191,9 +218,9 @@ private: DecodeSlot m_ringBuffer[RING_BUFFER_SIZE]; - // Producer-consumer indices - std::atomic m_submitIndex{0}; // Next slot to allocate (producer) - std::atomic m_returnIndex{0}; // Next slot to return (consumer) + // 🎯 Option C: Unified slot allocation counter (no mapping needed!) + std::atomic m_slotIdCounter{0}; // Monotonically increasing slot ID + std::atomic m_returnIdCounter{0}; // Return order enforcement (FIFO) // Polling thread std::thread m_pollingThread; @@ -207,14 +234,15 @@ private: ### Component 1: Slot Allocation (Producer) -**Purpose**: Assign RingBuffer slot to each DecodeToSurface call +**Purpose**: Assign RingBuffer slot to each DecodeToSurface call using Option C design ```cpp // In DecodeToSurface() -// 1. Allocate next available slot -size_t my_slot_idx = m_submitIndex.fetch_add(1) % RING_BUFFER_SIZE; -DecodeSlot& my_slot = m_ringBuffer[my_slot_idx]; +// 🎯 Option C: Allocate unique ID (serves as both slot ID and submission order) +uint64_t my_id = m_slotIdCounter.fetch_add(1); +size_t slot_idx = my_id % RING_BUFFER_SIZE; +DecodeSlot& my_slot = m_ringBuffer[slot_idx]; // 2. Check for overflow { @@ -230,27 +258,29 @@ DecodeSlot& my_slot = m_ringBuffer[my_slot_idx]; my_slot.in_use = true; my_slot.target_surface = target_surface; my_slot.surface_type = target_type; - my_slot.picture_index = -1; // Set by HandlePictureDisplay + my_slot.picture_indices.clear(); // Multi-frame support: clear previous frames my_slot.is_ready = false; } ``` -**Atomic Counter Behavior**: +**🎯 Option C: No Mapping Needed - Direct Calculation**: ``` -Thread 1: m_submitIndex.fetch_add(1) → 0 % 8 = slot[0] -Thread 2: m_submitIndex.fetch_add(1) → 1 % 8 = slot[1] -Thread 3: m_submitIndex.fetch_add(1) → 2 % 8 = slot[2] +Thread 1: m_slotIdCounter.fetch_add(1) → ID=0, slot_idx = 0 % 8 = slot[0] +Thread 2: m_slotIdCounter.fetch_add(1) → ID=1, slot_idx = 1 % 8 = slot[1] +Thread 3: m_slotIdCounter.fetch_add(1) → ID=2, slot_idx = 2 % 8 = slot[2] ... -Thread 9: m_submitIndex.fetch_add(1) → 8 % 8 = slot[0] (wrap around) +Thread 9: m_slotIdCounter.fetch_add(1) → ID=8, slot_idx = 8 % 8 = slot[0] (wrap around) ``` +**Key Advantage**: ID → slot_idx calculation is **deterministic**, no map storage needed! + **Overflow Protection**: If `slot[0].in_use == true` when Thread 9 arrives → error --- ### Component 2: Packet Submission -**Purpose**: Submit packet to NVDEC with slot index tracking +**Purpose**: Submit packet to NVDEC with slot ID tracking via timestamp ```cpp // 4. 
Submit packet to NVDEC parser @@ -258,7 +288,7 @@ CUVIDSOURCEDATAPACKET packet = {}; packet.payload = packet_data; packet.payload_size = packet_size; packet.flags = CUVID_PKT_ENDOFPICTURE; -packet.timestamp = my_slot_idx; // ✅ Embed slot index in timestamp +packet.timestamp = static_cast(my_id); // 🎯 Pass full slot_id (NOT modulo!) CUresult result = cuvidParseVideoData(m_parser, &packet); if (result != CUDA_SUCCESS) { @@ -267,91 +297,99 @@ if (result != CUDA_SUCCESS) { } ``` -**Timestamp Flow**: +**🎯 Option C Timestamp Flow** (multi-frame packet support): ``` -cuvidParseVideoData(packet, timestamp=2) +cuvidParseVideoData(packet, timestamp=my_id=17) ↓ HandleVideoSequence() (first time only) ↓ -HandlePictureDecode(timestamp=2) +HandlePictureDecode(timestamp=17, CurrPicIdx=0) → slot_idx = 17 % 8 = 1 + → m_ringBuffer[1].picture_indices.push_back(0) ↓ -GPU decodes packet... +HandlePictureDecode(timestamp=17, CurrPicIdx=1) → slot_idx = 17 % 8 = 1 + → m_ringBuffer[1].picture_indices.push_back(1) // Same packet, multiple frames! ↓ -HandlePictureDisplay(timestamp=2, picture_index=5) +HandlePictureDecode(timestamp=17, CurrPicIdx=2) → slot_idx = 17 % 8 = 1 + → m_ringBuffer[1].picture_indices.push_back(2) ↓ -m_ringBuffer[2].picture_index = 5 // ✅ Slot 2 now linked to picture_index 5 +PollingThread checks ALL picture_indices for slot[1] + ↓ +When all complete: slot[1].is_ready = true, notify thread ``` +**Key Point**: Timestamp carries **full slot_id**, HandlePictureDecode calculates slot_idx directly + --- ### Component 3: Polling Thread (Background Status Checker) -**Purpose**: Continuously poll `m_returnIndex` slot for decode completion +**Purpose**: Continuously poll `m_returnIdCounter` slot for decode completion (multi-frame support) ```cpp void NVDECAV1Decoder::PollingThreadFunc() { while (m_pollingRunning) { - // 1. Get current return slot (oldest pending decode) - size_t current_return_idx = m_returnIndex.load(); - DecodeSlot& slot = m_ringBuffer[current_return_idx]; + // 1. Get current return ID and calculate slot index + uint64_t current_return_id = m_returnIdCounter.load(); + size_t slot_idx = current_return_id % RING_BUFFER_SIZE; + DecodeSlot& slot = m_ringBuffer[slot_idx]; // 2. Check if slot is in use and not yet ready - if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) { + if (slot.in_use && !slot.is_ready) { - // 3. Query NVDEC for decode status - CUVIDGETDECODESTATUS decodeStatus = {}; - CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus); + // 3. Get copy of picture indices (multi-frame support) + std::vector picture_indices_copy; + { + std::lock_guard lock(slot.slot_mutex); + picture_indices_copy = slot.picture_indices; + } - if (result == CUDA_SUCCESS) { - if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) { - // ✅ Decode complete! - { - std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; - } + // 4. 
Check if ALL frames are decoded + bool all_complete = true; + for (int pic_idx : picture_indices_copy) { + CUVIDGETDECODESTATUS decodeStatus = {}; + CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus); - // Wake up waiting DecodeToSurface thread - slot.frame_ready.notify_one(); - - OutputDebugStringA("[Polling] Slot ready\n"); + if (result != CUDA_SUCCESS || + decodeStatus.decodeStatus != cuvidDecodeStatus_Success) { + all_complete = false; + break; } - else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error) { - // Decode error - mark as ready to unblock - { - std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; // Error also counts as "ready" - } - slot.frame_ready.notify_one(); + } - OutputDebugStringA("[Polling] Decode error\n"); + // 5. If all frames complete, signal ready + if (all_complete && !picture_indices_copy.empty()) { + { + std::lock_guard lock(slot.slot_mutex); + slot.is_ready = true; } - // cuvidDecodeStatus_InProgress → keep polling + slot.frame_ready.notify_one(); } } - // 4. Sleep to avoid busy-wait + // 6. Sleep to avoid busy-wait std::this_thread::sleep_for(std::chrono::microseconds(100)); } } ``` **Key Points**: -- ✅ Only polls `m_returnIndex` slot (not all 8 slots) → efficient +- ✅ Only polls `m_returnIdCounter` slot (not all 8 slots) → efficient +- ✅ **Multi-frame support**: Checks ALL picture_indices for completion - ✅ Uses `cuvidGetDecodeStatus()` non-blocking query - ✅ 100us sleep → ~10,000 checks/second (low CPU usage) -- ✅ Handles decode errors gracefully +- ✅ Thread-safe picture_indices copy to avoid lock contention --- -### Component 4: Sequential Return Wait +### Component 4: Sequential Return Wait (FIFO Guarantee) -**Purpose**: Enforce FIFO order even when decodes complete out-of-order +**Purpose**: Enforce FIFO order even when decodes complete out-of-order using Option C ```cpp // In DecodeToSurface() - PHASE 2 -// 5. Wait for my turn (sequential return order) -while (m_returnIndex.load() != my_slot_idx) { +// 5. Wait for my turn (FIFO order enforcement) +while (m_returnIdCounter.load() != my_id) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } @@ -366,44 +404,47 @@ while (m_returnIndex.load() != my_slot_idx) { // Timeout - decode took too long LogError("Decode timeout"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock + m_returnIdCounter.fetch_add(1); // Skip this slot to avoid deadlock return false; } } ``` -**Timeline Example**: +**🎯 Option C Timeline Example** (using slot ID, not slot index): ``` -Thread 1 (slot 0): Wait for returnIndex==0 ✅ (immediate) - Wait for is_ready... +Thread 1 (ID=17, slot 1): Wait for returnIdCounter==17 ✅ (immediate) + Wait for is_ready... 
-Thread 2 (slot 1): Wait for returnIndex==1 ⏸️ (blocked) +Thread 2 (ID=18, slot 2): Wait for returnIdCounter==18 ⏸️ (blocked) -Thread 3 (slot 2): Wait for returnIndex==2 ⏸️ (blocked) +Thread 3 (ID=19, slot 3): Wait for returnIdCounter==19 ⏸️ (blocked) -GPU: packet2 completes first @ t=3ms +GPU: ID=18 completes first @ t=3ms + → slot[2].is_ready = true + → Thread 2 still blocked (returnIdCounter=17) + +GPU: ID=17 completes @ t=15ms → slot[1].is_ready = true - → Thread 2 still blocked (returnIndex=0) - -GPU: packet1 completes @ t=15ms - → slot[0].is_ready = true → Thread 1 wakes up ✅ - → Thread 1 processes → returnIndex = 1 + → Thread 1 processes → returnIdCounter = 18 → Thread 2 now unblocked ✅ ``` +**Key Point**: Wait on **slot_id** (my_id), not slot_idx, for correct FIFO ordering + --- -### Component 5: Frame Retrieval & Cleanup +### Component 5: Frame Retrieval & Cleanup (Multi-Frame Support) **Purpose**: Map decoded frame, copy to surface, release slot ```cpp // In DecodeToSurface() - PHASE 3 -int frameIdx = my_slot.picture_index; +// 7. Get first frame from multi-frame packet +int frameIdx = my_slot.picture_indices[0]; // Return first frame only -// 7. Map decoded frame from NVDEC +// 8. Map decoded frame from NVDEC CUVIDPROCPARAMS procParams = {}; procParams.progressive_frame = 1; @@ -413,148 +454,178 @@ unsigned int srcPitch = 0; CUresult result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams); if (result != CUDA_SUCCESS) { my_slot.in_use = false; - m_returnIndex.fetch_add(1); + m_returnIdCounter.fetch_add(1); return false; } -// 8. Copy to D3D12 surface +// 9. Copy to D3D12 surface ID3D12Resource* d3d12Resource = static_cast(target_surface); bool copySuccess = m_d3d12Handler->CopyNV12Frame( srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height ); -// 9. Unmap frame +// 10. Unmap frame cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); -// 10. Release slot +// 11. Release slot { std::lock_guard lock(my_slot.slot_mutex); my_slot.in_use = false; } -// 11. Advance return index -m_returnIndex.fetch_add(1); +// 12. Advance return ID counter (FIFO order) +m_returnIdCounter.fetch_add(1); return copySuccess; ``` +**Multi-Frame Decision**: Return **first frame only** from multi-frame packet (indices[0]) + --- -### Component 6: NVDEC Callback Integration +### Component 6: NVDEC Callback Integration (Option C) -**Purpose**: Link NVDEC picture_index to RingBuffer slot +**Purpose**: Link NVDEC picture_index to RingBuffer slot using direct slot_id calculation ```cpp -int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDISPINFO* disp_info) { +int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) { auto* decoder = static_cast(user_data); - // Extract slot index from timestamp - size_t slot_idx = static_cast(disp_info->timestamp) % RING_BUFFER_SIZE; + // 🎯 Option C: Direct slot_id → slot_idx calculation (no map lookup!) 
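+    // Worked example (hypothetical numbers): slot_id 17 → slot_idx = 17 % 8 = 1.
+    // IDs 17 and 25 both map to slot 1, but the in_use overflow check in
+    // DecodeToSurface limits in-flight packets to RING_BUFFER_SIZE, so a slot
+    // is never owned by two IDs at once and this lookup stays unambiguous.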
+ uint64_t slot_id = static_cast(pic_params->nTimeStamp); + size_t slot_idx = slot_id % RING_BUFFER_SIZE; DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; - // Store NVDEC picture index - { - std::lock_guard lock(slot.slot_mutex); - slot.picture_index = disp_info->picture_index; + // Submit frame to NVDEC decoder + CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params); + if (result != CUDA_SUCCESS) { + decoder->LogCUDAError(result, "cuvidDecodePicture failed"); + return 0; } - // Polling thread will check cuvidGetDecodeStatus() for this picture_index + // Store picture_index for polling (multi-frame support) + { + std::lock_guard lock(slot.slot_mutex); + slot.picture_indices.push_back(pic_params->CurrPicIdx); + } + + // Polling thread will check cuvidGetDecodeStatus() for ALL picture_indices return 1; } ``` +**🎯 Option C Key Advantages**: +- ✅ **No mapping overhead**: Direct modulo calculation, no unordered_map lookup +- ✅ **No mutex contention**: No global map mutex needed +- ✅ **Multi-frame support**: Automatically handles multiple frames per packet +- ✅ **Deterministic**: Same slot_id always maps to same slot_idx + --- -## 📐 Implementation Plan +## 📐 Implementation Plan (Updated for Option C) ### Phase 1: Data Structure Setup ✅ **Files to Modify**: -- `NVDECAV1Decoder.h` - Add RingBuffer members +- `NVDECAV1Decoder.h` - Add RingBuffer members with Option C design - `NVDECAV1Decoder.cpp` - Initialize RingBuffer in constructor **Tasks**: -- [x] Define `DecodeSlot` structure +- [x] Define `DecodeSlot` structure with `std::vector picture_indices` - [x] Add `m_ringBuffer[8]` array -- [x] Add `m_submitIndex`, `m_returnIndex` atomic counters +- [x] **🎯 Option C**: Add `m_slotIdCounter`, `m_returnIdCounter` atomic counters (NOT submitIndex/returnIndex) - [x] Add `m_pollingThread`, `m_pollingRunning` members **Estimated Time**: 30 minutes --- -### Phase 2: Polling Thread Implementation +### Phase 2: Polling Thread Implementation (Multi-Frame Support) **Files to Modify**: - `NVDECAV1Decoder.cpp` - Implement `PollingThreadFunc()` **Tasks**: -- [ ] Implement polling loop with `cuvidGetDecodeStatus()` +- [ ] Implement polling loop with `cuvidGetDecodeStatus()` checking ALL picture_indices +- [ ] Poll slot at `m_returnIdCounter % RING_BUFFER_SIZE` (NOT all slots) - [ ] Add thread start in `Initialize()` - [ ] Add thread stop in `Cleanup()` -- [ ] Add debug logging for polling events +- [ ] Add debug logging for multi-frame packet events **Testing**: - Verify thread starts/stops correctly +- Verify multi-frame packet handling (all frames checked) - Verify `cuvidGetDecodeStatus()` calls work **Estimated Time**: 1 hour --- -### Phase 3: DecodeToSurface Refactoring +### Phase 3: DecodeToSurface Refactoring (Option C Implementation) **Files to Modify**: -- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` +- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with Option C design **Tasks**: -- [ ] Phase 1: Slot allocation logic -- [ ] Phase 2: Sequential return wait logic -- [ ] Phase 3: Frame retrieval & cleanup logic +- [ ] **🎯 Phase 1**: Slot allocation using `my_id = m_slotIdCounter.fetch_add(1)` +- [ ] **🎯 Phase 2**: Packet submission with `packet.timestamp = my_id` (full slot_id, NOT modulo!) +- [ ] **🎯 Phase 3**: FIFO wait using `while(m_returnIdCounter != my_id)` (NOT slot_idx!) 
+- [ ] **🎯 Phase 4**: Frame retrieval from `picture_indices[0]` (first frame only) +- [ ] **🎯 Phase 5**: Cleanup and `m_returnIdCounter.fetch_add(1)` - [ ] Error handling for all failure paths **Testing**: - Single-threaded decode test - Multi-threaded decode test (2-3 threads) -- Verify packet-surface mapping correctness +- Verify packet-surface mapping correctness with multi-frame packets +- Verify FIFO ordering with out-of-order completion **Estimated Time**: 2 hours --- -### Phase 4: HandlePictureDisplay Update +### Phase 4: HandlePictureDecode Update (Option C) **Files to Modify**: -- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDisplay()` +- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDecode()` callback **Tasks**: -- [ ] Extract slot_idx from timestamp -- [ ] Store picture_index in correct slot -- [ ] Add debug logging +- [ ] **🎯 Option C**: Extract `slot_id` from `pic_params->nTimeStamp` +- [ ] **🎯 Option C**: Calculate `slot_idx = slot_id % RING_BUFFER_SIZE` (NO map lookup!) +- [ ] **🎯 Multi-frame**: Use `slot.picture_indices.push_back(CurrPicIdx)` (NOT single index!) +- [ ] Add debug logging for multi-frame packets **Testing**: -- Verify timestamp → slot_idx mapping -- Verify picture_index stored correctly +- Verify timestamp → slot_idx direct calculation works +- Verify picture_indices vector correctly stores multiple frames +- Test with video that has multi-frame packets (test_720p_stripe.webm) **Estimated Time**: 30 minutes --- -### Phase 5: Integration Testing +### Phase 5: Integration Testing (Option C Validation) **Test Scenarios**: -1. **Single packet decode** - Verify basic functionality -2. **Sequential 3 packets** - Verify FIFO order -3. **Out-of-order completion** - Verify correct mapping (I-frame after P-frame) -4. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) -5. **Decode errors** - Verify graceful failure -6. **Performance benchmark** - Measure latency reduction +1. **Single packet decode** - Verify Option C basic functionality +2. **Multi-frame packet** - Verify vector-based picture_indices handling (test_720p_stripe.webm) +3. **Sequential 3 packets** - Verify FIFO order using m_returnIdCounter +4. **Out-of-order completion** - Verify slot_id → slot_idx mapping (I-frame after P-frame) +5. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) +6. **Decode errors** - Verify graceful failure with multi-frame packets +7. 
**Performance benchmark** - Measure latency reduction vs old queue-based approach **Test Files**: -- Simple test video (simple_test.webm) -- Complex GOP structure video (test_720p_stripe.webm) +- Simple test video (simple_test.webm) - basic validation +- **Multi-frame packet video (test_720p_stripe.webm)** - critical multi-frame test ⚠️ + +**Validation Criteria**: +- ✅ No slot ID → slot_idx mapping errors +- ✅ All frames from multi-frame packets detected and polled +- ✅ FIFO order maintained even with out-of-order GPU completion +- ✅ No memory corruption or race conditions **Estimated Time**: 2 hours @@ -563,15 +634,63 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI ### Phase 6: Documentation & Cleanup **Tasks**: -- [ ] Update NVDEC design documentation -- [ ] Add inline code comments -- [ ] Remove old queue-based code -- [ ] Move design doc to `docs/completed/` +- [x] Update NVDEC design documentation with Option C and multi-frame support +- [ ] Add inline code comments explaining Option C design choices +- [ ] Remove old queue-based code and any ID→Index mapping attempts +- [ ] Move design doc to `docs/completed/` after successful implementation +- [ ] Document multi-frame packet behavior and first-frame-only decision **Estimated Time**: 1 hour --- +## 🎯 Option C Design Summary + +### **Core Principle**: Eliminate mapping overhead through deterministic calculation + +**Key Components**: +1. **Single Counter for Dual Purpose**: `m_slotIdCounter` serves as both unique ID and submission order +2. **Direct Slot Calculation**: `slot_idx = slot_id % RING_BUFFER_SIZE` (no map needed) +3. **FIFO via ID Comparison**: `while(m_returnIdCounter != my_id)` ensures ordering +4. **Multi-Frame Vector**: `std::vector picture_indices` handles packets with multiple frames + +**Data Flow**: +``` +DecodeToSurface: + my_id = m_slotIdCounter++ (e.g., 17) + slot_idx = 17 % 8 = 1 + packet.timestamp = 17 + +HandlePictureDecode: + slot_id = pic_params->nTimeStamp (17) + slot_idx = 17 % 8 = 1 + m_ringBuffer[1].picture_indices.push_back(CurrPicIdx) + +PollingThread: + return_id = m_returnIdCounter (17) + slot_idx = 17 % 8 = 1 + Check all m_ringBuffer[1].picture_indices[] + +DecodeToSurface (wait): + while(m_returnIdCounter != 17) { wait } + Process frame from picture_indices[0] + m_returnIdCounter++ (18) +``` + +**Eliminated Complexity**: +- ❌ No `std::unordered_map` mapping +- ❌ No global map mutex +- ❌ No map insert/erase operations +- ❌ No lookup failures or stale entries + +**Multi-Frame Packet Handling**: +- One packet → multiple HandlePictureDecode calls +- All frames stored in `picture_indices` vector +- PollingThread checks ALL frames complete +- Return first frame only (`picture_indices[0]`) + +--- + ## 📊 Performance Analysis ### Expected Improvements diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index 58edffb..61ac2f6 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -879,8 +879,8 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS } char debug_buf[512]; - sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d, ref_pic_flag=%d)\n", - decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag, pic_params->ref_pic_flag); + sprintf_s(debug_buf, "[HandlePictureDecode] Calling 
cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d)\n", + decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag); OutputDebugStringA(debug_buf); printf("%s", debug_buf); @@ -893,21 +893,7 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS return 0; } - // Store picture_index in ring buffer slot for polling thread - // The slot index was embedded in the timestamp by DecodeToSurface - // Note: pic_params doesn't have timestamp, so we use CurrPicIdx as picture_index - // and let the polling thread find the right slot - int picture_index = pic_params->CurrPicIdx; - size_t submit_idx = decoder->m_submitIndex.load() - 1; // Last submitted slot - DecodeSlot& slot = decoder->m_ringBuffer[submit_idx % decoder->RING_BUFFER_SIZE]; - - { - std::lock_guard lock(slot.slot_mutex); - slot.picture_index = picture_index; - } - - sprintf_s(debug_buf, "[HandlePictureDecode] Success - stored picture_index=%d in slot %zu\n", - picture_index, submit_idx % decoder->RING_BUFFER_SIZE); + sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture succeeded (CurrPicIdx=%d)\n", pic_params->CurrPicIdx); OutputDebugStringA(debug_buf); printf("%s", debug_buf); @@ -921,34 +907,45 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI return 0; } + // Option C: Extract slot_id from timestamp and calculate slot_idx directly (NO map lookup!) + uint64_t slot_id = static_cast(disp_info->timestamp); + size_t slot_idx = slot_id % decoder->RING_BUFFER_SIZE; + DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; + char debug_buf[256]; - sprintf_s(debug_buf, "[HandlePictureDisplay] Frame decoded: picture_index=%d, timestamp=%lld\n", - disp_info->picture_index, disp_info->timestamp); + sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: slot_id=%llu, slot_idx=%zu, picture_index=%d\n", + slot_id, slot_idx, disp_info->picture_index); OutputDebugStringA(debug_buf); + printf("%s", debug_buf); - // ===== Phase 4: RingBuffer Integration ===== - // Extract slot index from timestamp (set in DecodeToSurface) - size_t slot_idx = static_cast(disp_info->timestamp); - DecodeSlot& slot = decoder->m_ringBuffer[slot_idx % decoder->RING_BUFFER_SIZE]; - - sprintf_s(debug_buf, "[HandlePictureDisplay] Updating RingBuffer slot %zu with picture_index=%d\n", - slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index); - OutputDebugStringA(debug_buf); - - // IMPORTANT: pfnDisplayPicture is called AFTER GPU decoding completes - // So we can directly mark the frame as ready without polling + // Option C Multi-Frame Support: Store picture_index in vector (one packet can have multiple frames) { std::lock_guard lock(slot.slot_mutex); - slot.picture_index = disp_info->picture_index; - slot.is_ready = true; // Frame is already decoded and ready + slot.picture_indices.push_back(disp_info->picture_index); } - // Signal waiting thread that frame is ready - slot.frame_ready.notify_one(); - - sprintf_s(debug_buf, "[HandlePictureDisplay] Slot %zu marked ready (picture_index=%d)\\n", - slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index); + sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Pushed picture_index=%d to slot %zu (total frames: %zu)\n", + disp_info->picture_index, slot_idx, slot.picture_indices.size()); OutputDebugStringA(debug_buf); + printf("%s", debug_buf); + + // IMPORTANT: HandlePictureDisplay is called AFTER decoding completes + // So we can mark the slot as ready immediately (no polling needed for this callback) + bool should_notify = 
false; + { + std::lock_guard lock(slot.slot_mutex); + if (!slot.is_ready) { + slot.is_ready = true; + should_notify = true; + } + } + + if (should_notify) { + slot.frame_ready.notify_one(); + sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Slot %zu marked ready and notified\n", slot_idx); + OutputDebugStringA(debug_buf); + printf("%s", debug_buf); + } return 1; } @@ -1151,12 +1148,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ try { auto start_time = std::chrono::high_resolution_clock::now(); - // ===== Phase 3: RingBuffer Slot Allocation ===== - // 1. Allocate next available slot (producer) - size_t my_slot_idx = m_submitIndex.fetch_add(1); - DecodeSlot& my_slot = m_ringBuffer[my_slot_idx % RING_BUFFER_SIZE]; + // ===== Phase 1: Option C Slot Allocation ===== + // 1. Allocate unique slot ID (serves as both ID and submission order) + uint64_t my_id = m_slotIdCounter.fetch_add(1); + size_t slot_idx = my_id % RING_BUFFER_SIZE; + DecodeSlot& my_slot = m_ringBuffer[slot_idx]; - sprintf_s(debug_buf, "[DecodeToSurface] Allocating slot %zu (submitIndex=%zu)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot_idx); + sprintf_s(debug_buf, "[DecodeToSurface] Option C: Allocated slot_id=%llu, slot_idx=%zu\n", my_id, slot_idx); OutputDebugStringA(debug_buf); // 2. Check for overflow (RingBuffer full) @@ -1165,17 +1163,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ if (my_slot.in_use) { // RingBuffer overflow - too many concurrent decodes - sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", my_slot_idx % RING_BUFFER_SIZE); + sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", slot_idx); OutputDebugStringA(debug_buf); LogError("RingBuffer overflow - max 8 concurrent decodes"); return false; } - // 3. Initialize slot + // 3. Initialize slot (multi-frame support) my_slot.in_use = true; my_slot.target_surface = target_surface; my_slot.surface_type = target_type; - my_slot.picture_index = -1; // Will be set by HandlePictureDisplay + my_slot.picture_indices.clear(); // Clear previous frames my_slot.is_ready = false; } @@ -1191,14 +1189,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ printf("%s", debug_buf); } - // 5. Submit packet to NVDEC parser with slot index in timestamp + // 5. Option C: Submit packet with full slot_id in timestamp (NOT modulo!) CUVIDSOURCEDATAPACKET packet = {}; packet.payload = final_packet_data; packet.payload_size = static_cast(final_packet_size); packet.flags = CUVID_PKT_ENDOFPICTURE; - packet.timestamp = static_cast(my_slot_idx); // Embed slot index in timestamp + packet.timestamp = static_cast(my_id); // Pass full slot_id for HandlePictureDecode - sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot %zu)...\n", packet.timestamp, my_slot_idx % RING_BUFFER_SIZE); + sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot_id=%llu, slot_idx=%zu)...\n", packet.timestamp, my_id, slot_idx); OutputDebugStringA(debug_buf); CUresult result = cuvidParseVideoData(m_parser, &packet); @@ -1217,49 +1215,60 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ } OutputDebugStringA("[DecodeToSurface] cuvidParseVideoData succeeded\n"); - // ===== Component 4: Sequential Return Wait ===== - // 5. 
Wait for my turn (enforce FIFO order) - sprintf_s(debug_buf, "[DecodeToSurface] Waiting for slot %zu to be return head (current returnIndex=%zu)...\n", - my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load()); - OutputDebugStringA(debug_buf); - - while (m_returnIndex.load() != my_slot_idx) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - - sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu is now return head\n", my_slot_idx % RING_BUFFER_SIZE); - OutputDebugStringA(debug_buf); - - // 6. Wait for decode to complete (polling thread signals this) + // ===== Component 4: Wait for decode to complete FIRST ===== + // 6. Wait for decode to complete (HandlePictureDisplay signals this) { std::unique_lock lock(my_slot.slot_mutex); - sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot %zu, is_ready=%d)...\n", - my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready); + sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot_idx=%zu, is_ready=%d)...\n", + slot_idx, my_slot.is_ready); OutputDebugStringA(debug_buf); - if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500), + if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(5000), [&my_slot]() { return my_slot.is_ready; })) { // Timeout - decode took too long - sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot %zu (is_ready=%d)\n", - my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready); + sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot_idx=%zu (is_ready=%d)\n", + slot_idx, my_slot.is_ready); OutputDebugStringA(debug_buf); printf("%s", debug_buf); LogError("Decode timeout"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock + m_returnIdCounter.fetch_add(1); // Option C: Advance to unblock next thread return false; } - sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot %zu)\n", my_slot_idx % RING_BUFFER_SIZE); + sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot_idx=%zu)\n", slot_idx); OutputDebugStringA(debug_buf); } - sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu decode complete (picture_index=%d)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot.picture_index); + // ===== Component 5: Option C Sequential Return Wait (FIFO Guarantee) ===== + // 5. Wait for my turn using slot_id (NOT slot_idx!) 
AFTER decoding completes + sprintf_s(debug_buf, "[DecodeToSurface] Option C: Waiting for FIFO turn slot_id=%llu (current returnIdCounter=%llu)...\n", + my_id, m_returnIdCounter.load()); OutputDebugStringA(debug_buf); - // ===== Component 5: Frame Retrieval & Cleanup ===== - int frameIdx = my_slot.picture_index; + while (m_returnIdCounter.load() != my_id) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + + sprintf_s(debug_buf, "[DecodeToSurface] Option C: slot_id=%llu is now return head\n", my_id); + OutputDebugStringA(debug_buf); + + // ===== Component 5: Option C Frame Retrieval (Multi-Frame Support) ===== + // Get first frame from multi-frame packet (picture_indices[0]) + if (my_slot.picture_indices.empty()) { + sprintf_s(debug_buf, "[DecodeToSurface] ERROR: No picture_indices in slot %zu\n", slot_idx); + OutputDebugStringA(debug_buf); + LogError("No frames decoded"); + my_slot.in_use = false; + m_returnIdCounter.fetch_add(1); + return false; + } + + int frameIdx = my_slot.picture_indices[0]; // Return first frame only + sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu decode complete, using first frame (picture_index=%d, total_frames=%zu)\n", + slot_idx, frameIdx, my_slot.picture_indices.size()); + OutputDebugStringA(debug_buf); // For CUDA device surface, we need to map the decoded frame if (target_type == VAVCORE_SURFACE_CUDA_DEVICE) { @@ -1275,7 +1284,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ if (result != CUDA_SUCCESS) { LogCUDAError(result, "cuvidMapVideoFrame"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); + m_returnIdCounter.fetch_add(1); return false; } @@ -1301,7 +1310,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ OutputDebugStringA("[DecodeToSurface] ERROR: target_surface or m_d3d12Device is null\n"); LogError("D3D12 resource or device not available"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); + m_returnIdCounter.fetch_add(1); return false; } @@ -1340,7 +1349,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ OutputDebugStringA(debug_buf); LogCUDAError(result, "cuvidMapVideoFrame"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); + m_returnIdCounter.fetch_add(1); return false; } @@ -1367,7 +1376,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ if (!copySuccess) { OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n"); my_slot.in_use = false; - m_returnIndex.fetch_add(1); + m_returnIdCounter.fetch_add(1); return false; } @@ -1391,10 +1400,10 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ my_slot.in_use = false; } - // 11. Advance return index - m_returnIndex.fetch_add(1); + // 11. 
Option C: Advance return ID counter (FIFO order) + m_returnIdCounter.fetch_add(1); - sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu released, returnIndex advanced to %zu\n", my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load()); + sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu released, returnIdCounter advanced to %llu\n", slot_idx, m_returnIdCounter.load()); OutputDebugStringA(debug_buf); // Update statistics @@ -1455,63 +1464,53 @@ void NVDECAV1Decoder::PollingThreadFunc() { int poll_count = 0; while (m_pollingRunning) { - // Get current return slot (oldest pending decode) - size_t current_return_idx = m_returnIndex.load(); - size_t slot_idx = current_return_idx % RING_BUFFER_SIZE; + // Option C: Get current return ID and calculate slot index + uint64_t current_return_id = m_returnIdCounter.load(); + size_t slot_idx = current_return_id % RING_BUFFER_SIZE; DecodeSlot& slot = m_ringBuffer[slot_idx]; // Debug logging every 100 iterations if (poll_count % 100 == 0) { char debug_buf[256]; - sprintf_s(debug_buf, "[PollingThread] Poll #%d: returnIdx=%zu, slot=%zu, in_use=%d, is_ready=%d, picture_index=%d\n", - poll_count, current_return_idx, slot_idx, slot.in_use, slot.is_ready, slot.picture_index); + sprintf_s(debug_buf, "[PollingThread] Option C: Poll #%d: returnId=%llu, slot=%zu, in_use=%d, is_ready=%d, frames=%zu\n", + poll_count, current_return_id, slot_idx, slot.in_use, slot.is_ready, slot.picture_indices.size()); OutputDebugStringA(debug_buf); } poll_count++; // Check if slot is in use and not yet ready - if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) { - // Query NVDEC for decode status - CUVIDGETDECODESTATUS decodeStatus = {}; - CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus); - - if (result == CUDA_SUCCESS) { - if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) { - // Decode complete! 
- { - std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; - } - slot.frame_ready.notify_one(); - - char debug_buf[128]; - sprintf_s(debug_buf, "[PollingThread] Slot %zu decode complete (picture_index=%d)\n", - slot_idx, slot.picture_index); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - } - else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error || - decodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed) { - // Decode error - mark as ready to unblock - { - std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; - } - slot.frame_ready.notify_one(); - - char debug_buf[128]; - sprintf_s(debug_buf, "[PollingThread] Slot %zu decode error (status=%d)\n", - slot_idx, decodeStatus.decodeStatus); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - } - // else: decodeStatus.decodeStatus == cuvidDecodeStatus_InProgress, continue polling + if (slot.in_use && !slot.is_ready) { + // Option C Multi-Frame Support: Get copy of picture indices + std::vector picture_indices_copy; + { + std::lock_guard lock(slot.slot_mutex); + picture_indices_copy = slot.picture_indices; } - else { - // cuvidGetDecodeStatus failed - char debug_buf[128]; - sprintf_s(debug_buf, "[PollingThread] cuvidGetDecodeStatus failed for slot %zu (error=%d)\n", - slot_idx, result); + + // Check if ALL frames are decoded + bool all_complete = true; + for (int pic_idx : picture_indices_copy) { + CUVIDGETDECODESTATUS decodeStatus = {}; + CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus); + + if (result != CUDA_SUCCESS || + decodeStatus.decodeStatus != cuvidDecodeStatus_Success) { + all_complete = false; + break; + } + } + + // If all frames complete, signal ready + if (all_complete && !picture_indices_copy.empty()) { + { + std::lock_guard lock(slot.slot_mutex); + slot.is_ready = true; + } + slot.frame_ready.notify_one(); + + char debug_buf[256]; + sprintf_s(debug_buf, "[PollingThread] Option C: Slot %zu ALL frames complete (%zu frames)\n", + slot_idx, picture_indices_copy.size()); OutputDebugStringA(debug_buf); printf("%s", debug_buf); } diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h index 6de79af..52ec795 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h @@ -152,20 +152,21 @@ private: void* target_surface = nullptr; // Destination D3D12 resource VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU; - // NVDEC information (from HandlePictureDisplay callback) - int picture_index = -1; // NVDEC frame index for cuvidMapVideoFrame + // NVDEC information (from HandlePictureDecode callback) + // Multi-frame support: One packet can decode to multiple frames + std::vector picture_indices; // All NVDEC frame indices from this packet // Synchronization primitives - std::condition_variable frame_ready; // Signaled when decode complete + std::condition_variable frame_ready; // Signaled when ALL frames are decoded std::mutex slot_mutex; // Protects this slot's state - bool is_ready = false; // Decode completed flag + bool is_ready = false; // All frames decoded flag }; DecodeSlot m_ringBuffer[RING_BUFFER_SIZE]; - // Producer-consumer indices - std::atomic m_submitIndex{0}; // Next slot to allocate (producer) - std::atomic m_returnIndex{0}; // Next slot to return (consumer) + // Option C: Unified slot allocation counters (no mapping needed!) 
+    std::atomic<uint64_t> m_slotIdCounter{0};    // Monotonically increasing slot ID (submission order)
+    std::atomic<uint64_t> m_returnIdCounter{0};  // Return order enforcement (FIFO)
 
     // Polling thread for cuvidGetDecodeStatus
     std::thread m_pollingThread;
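
As a closing reference, the Option C control flow spread across the design document and the decoder changes above can be condensed into a few dozen lines. This is a sketch under simplifying assumptions, not the production code: `SlotModel`, `DecodeOnePacketModel`, and the `g_*` globals are illustrative stand-ins for `DecodeSlot`, `DecodeToSurface()`, and the `NVDECAV1Decoder` members, and all NVDEC submission/callback plumbing is elided.

```cpp
// Minimal model of the Option C flow: a monotonically increasing slot ID doubles
// as the FIFO ticket, and slot_idx is always id % RING_BUFFER_SIZE, so no
// ID→slot map is ever needed.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>

constexpr std::size_t kRingBufferSize = 8;

std::atomic<uint64_t> g_slotIdCounter{0};   // allocation order (producer side)
std::atomic<uint64_t> g_returnIdCounter{0}; // return order (consumer side, FIFO)

struct SlotModel {
    std::atomic<bool> in_use{false};
    std::vector<int> picture_indices;       // filled by the decode callbacks
};
SlotModel g_ring[kRingBufferSize];

// Returns the picture index of the packet's first frame, or -1 on overflow.
int DecodeOnePacketModel() {
    const uint64_t my_id = g_slotIdCounter.fetch_add(1);
    SlotModel& slot = g_ring[my_id % kRingBufferSize];
    if (slot.in_use.exchange(true)) {
        return -1;                           // more than 8 packets in flight: overflow
    }

    // ... submit the packet with timestamp = my_id; the decode callbacks
    //     push one entry into slot.picture_indices per decoded frame ...

    while (g_returnIdCounter.load() != my_id) {           // FIFO: wait for my turn
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    const int first_frame =
        slot.picture_indices.empty() ? -1 : slot.picture_indices[0];
    slot.picture_indices.clear();
    slot.in_use.store(false);
    g_returnIdCounter.fetch_add(1);          // unblock the next caller in order
    return first_frame;
}
```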