This commit is contained in:
2025-10-05 11:12:57 +09:00
parent b67aba9a5f
commit 7c2973f17f
4 changed files with 401 additions and 259 deletions

View File

@@ -371,3 +371,26 @@ Create as many cuvidCreateDecoder() instances as ulNumDecodeSurfaces
The core NVDEC integration is working - frames are being decoded successfully by the hardware. The remaining work is properly synchronizing the async decode pipeline with
the multi-frame packet structure.
● Now let's test with simple_test.webm:
● Bash("D:/Project/video-av1/vav2/platforms/windows/tests/red-surface-nvdec/bin/Debug/RedSurfaceNVDECTest.exe" "D:/Project/video-av1/sample/simple_test.webm") timeout: 1m 0s
 Error: Command timed out after 1m 0s [NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
[NVDECAV1Decoder] CUDA ERROR in cuvidParseVideoData: unknown error (code: 999)
[DllMain] VavCore.dll loaded - Process Attach
[DllMain] Safe to initialize: Ready for vavcore_initialize() call
[RedSurfaceNVDECTest] Starting test...
[RedSurfaceNVDECTest] Video: D:/Project/video-av1/sample/simple_test.webm
[Step 1] Initializing VavCore...
[DllMain] Performing safe decoder registration...
… +46 lines
● ChromaFormat 3 (YUV444) is the problem. This is a pre-existing issue, not an Option C issue. Let's test with test_720p_stripe.webm:

View File

@@ -1,8 +1,8 @@
# NVDEC RingBuffer-based Asynchronous Decoding Design
**Date**: 2025-10-04
**Status**: 🔄 **In Progress**
**Target**: NVDECAV1Decoder DecodeToSurface() optimization
**Date**: 2025-10-05 (Updated)
**Status**: **Design Finalized - Ready for Implementation**
**Target**: NVDECAV1Decoder DecodeToSurface() optimization with multi-frame packet support
---
@@ -78,6 +78,32 @@ Thread B: pop() → picture_index=4 ❌ (gets packet1 result!)
---
#### Issue 3: Multi-Frame Packet Handling ⚠️ **Critical Discovery**
**Scenario**: Single WebM packet contains multiple AV1 frames
```cpp
// DecodeToSurface called ONCE
DecodeToSurface(packet_175bytes, surface1, frame1);
// NVDEC parser extracts MULTIPLE frames from single packet:
HandlePictureDecode(CurrPicIdx=0, IntraPicFlag=1) // I-frame
HandlePictureDecode(CurrPicIdx=1, IntraPicFlag=0) // P-frame
HandlePictureDecode(CurrPicIdx=2, IntraPicFlag=0) // P-frame
... (up to 8 frames in one packet)
// Problem: Which picture_index should be returned?
// Current design assumes: 1 packet = 1 frame ❌
```
**Impact**:
- Slot allocation assumes 1 packet → 1 slot → 1 picture_index
- Reality: 1 packet → 1 slot → **N picture_indices**
- Must track multiple picture_indices per slot
- Must decide which frame to return (first? last? all?)
---
#### Issue 4: ulNumOutputSurfaces Underutilization
**NVDEC Configuration**:
@@ -172,13 +198,14 @@ struct DecodeSlot {
void* target_surface; // Destination D3D12 resource
VavCoreSurfaceType surface_type; // Surface type
// NVDEC information (from HandlePictureDisplay callback)
int picture_index; // NVDEC frame index for cuvidMapVideoFrame
// NVDEC information (from HandlePictureDecode callback)
// ⚠️ Multi-frame support: One packet can decode to multiple frames
std::vector<int> picture_indices; // All NVDEC frame indices from this packet
// Synchronization primitives
std::condition_variable frame_ready; // Signaled when decode complete
std::condition_variable frame_ready; // Signaled when ALL frames are decoded
std::mutex slot_mutex; // Protects this slot's state
bool is_ready; // Decode completed flag
bool is_ready; // All frames decoded flag
};
```
@@ -191,9 +218,9 @@ private:
DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
// Producer-consumer indices
std::atomic<size_t> m_submitIndex{0}; // Next slot to allocate (producer)
std::atomic<size_t> m_returnIndex{0}; // Next slot to return (consumer)
// 🎯 Option C: Unified slot allocation counter (no mapping needed!)
std::atomic<uint64_t> m_slotIdCounter{0}; // Monotonically increasing slot ID
std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
// Polling thread
std::thread m_pollingThread;
@@ -207,14 +234,15 @@ private:
### Component 1: Slot Allocation (Producer)
**Purpose**: Assign RingBuffer slot to each DecodeToSurface call
**Purpose**: Assign RingBuffer slot to each DecodeToSurface call using Option C design
```cpp
// In DecodeToSurface()
// 1. Allocate next available slot
size_t my_slot_idx = m_submitIndex.fetch_add(1) % RING_BUFFER_SIZE;
DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
// 🎯 Option C: Allocate unique ID (serves as both slot ID and submission order)
uint64_t my_id = m_slotIdCounter.fetch_add(1);
size_t slot_idx = my_id % RING_BUFFER_SIZE;
DecodeSlot& my_slot = m_ringBuffer[slot_idx];
// 2. Check for overflow
{
@@ -230,27 +258,29 @@ DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
my_slot.in_use = true;
my_slot.target_surface = target_surface;
my_slot.surface_type = target_type;
my_slot.picture_index = -1; // Set by HandlePictureDisplay
my_slot.picture_indices.clear(); // Multi-frame support: clear previous frames
my_slot.is_ready = false;
}
```
**Atomic Counter Behavior**:
**🎯 Option C: No Mapping Needed - Direct Calculation**:
```
Thread 1: m_submitIndex.fetch_add(1) → 0 % 8 = slot[0]
Thread 2: m_submitIndex.fetch_add(1) → 1 % 8 = slot[1]
Thread 3: m_submitIndex.fetch_add(1) → 2 % 8 = slot[2]
Thread 1: m_slotIdCounter.fetch_add(1) → ID=0, slot_idx = 0 % 8 = slot[0]
Thread 2: m_slotIdCounter.fetch_add(1) → ID=1, slot_idx = 1 % 8 = slot[1]
Thread 3: m_slotIdCounter.fetch_add(1) → ID=2, slot_idx = 2 % 8 = slot[2]
...
Thread 9: m_submitIndex.fetch_add(1) → 8 % 8 = slot[0] (wrap around)
Thread 9: m_slotIdCounter.fetch_add(1) → ID=8, slot_idx = 8 % 8 = slot[0] (wrap around)
```
**Key Advantage**: ID → slot_idx calculation is **deterministic**, no map storage needed!
**Overflow Protection**: If `slot[0].in_use == true` when Thread 9 arrives → error
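Below is a standalone sketch of this allocation rule (simplified: a plain `bool` array stands in for the slots' `in_use` flags, and the per-slot mutexes are omitted):
```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t RING_BUFFER_SIZE = 8;

std::atomic<uint64_t> g_slotIdCounter{0};
bool g_inUse[RING_BUFFER_SIZE] = {};

// Option C allocation: the ID itself encodes the slot index,
// so no ID -> index map is ever stored.
bool allocate(uint64_t& out_id, size_t& out_idx) {
    out_id  = g_slotIdCounter.fetch_add(1);  // unique, monotonically increasing
    out_idx = out_id % RING_BUFFER_SIZE;     // deterministic: same ID -> same slot
    if (g_inUse[out_idx]) return false;      // overflow: a 9th in-flight decode
    g_inUse[out_idx] = true;
    return true;
}

int main() {
    uint64_t id = 0;
    size_t   idx = 0;
    for (int i = 0; i < 9; ++i) {
        bool ok = allocate(id, idx);
        std::printf("ID=%llu -> slot[%zu] %s\n",
                    static_cast<unsigned long long>(id), idx,
                    ok ? "ok" : "OVERFLOW");
    }
    // IDs 0..7 land in slots 0..7; ID=8 wraps to the still-busy slot[0].
    return 0;
}
```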
---
### Component 2: Packet Submission
**Purpose**: Submit packet to NVDEC with slot index tracking
**Purpose**: Submit packet to NVDEC with slot ID tracking via timestamp
```cpp
// 4. Submit packet to NVDEC parser
@@ -258,7 +288,7 @@ CUVIDSOURCEDATAPACKET packet = {};
packet.payload = packet_data;
packet.payload_size = packet_size;
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = my_slot_idx; // ✅ Embed slot index in timestamp
packet.timestamp = static_cast<int64_t>(my_id); // 🎯 Pass full slot_id (NOT modulo!)
CUresult result = cuvidParseVideoData(m_parser, &packet);
if (result != CUDA_SUCCESS) {
@@ -267,91 +297,99 @@ if (result != CUDA_SUCCESS) {
}
```
**Timestamp Flow**:
**🎯 Option C Timestamp Flow** (multi-frame packet support):
```
cuvidParseVideoData(packet, timestamp=2)
cuvidParseVideoData(packet, timestamp=my_id=17)
HandleVideoSequence() (first time only)
HandlePictureDecode(timestamp=2)
HandlePictureDecode(timestamp=17, CurrPicIdx=0) → slot_idx = 17 % 8 = 1
→ m_ringBuffer[1].picture_indices.push_back(0)
GPU decodes packet...
HandlePictureDecode(timestamp=17, CurrPicIdx=1) → slot_idx = 17 % 8 = 1
→ m_ringBuffer[1].picture_indices.push_back(1) // Same packet, multiple frames!
HandlePictureDisplay(timestamp=2, picture_index=5)
HandlePictureDecode(timestamp=17, CurrPicIdx=2) → slot_idx = 17 % 8 = 1
→ m_ringBuffer[1].picture_indices.push_back(2)
m_ringBuffer[2].picture_index = 5 // ✅ Slot 2 now linked to picture_index 5
PollingThread checks ALL picture_indices for slot[1]
When all complete: slot[1].is_ready = true, notify thread
```
**Key Point**: Timestamp carries **full slot_id**, HandlePictureDecode calculates slot_idx directly
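One detail worth pinning down: the timestamp is a plain 64-bit pass-through, and `CUVIDPICPARAMS` itself carries no timestamp field; the implementation in this commit recovers the ID from `CUVIDPARSERDISPINFO::timestamp` in the display callback. A minimal sketch of the round trip:
```cpp
#include <cstddef>
#include <cstdint>

constexpr size_t RING_BUFFER_SIZE = 8;

// Submission side: whatever DecodeToSurface stores in
// CUVIDSOURCEDATAPACKET::timestamp comes back verbatim in the callback.
int64_t embed_slot_id(uint64_t my_id) {
    return static_cast<int64_t>(my_id);  // packet.timestamp = embed_slot_id(my_id);
}

// Callback side: recover the full ID, then derive the slot deterministically.
size_t recover_slot_idx(int64_t callback_timestamp) {
    uint64_t slot_id = static_cast<uint64_t>(callback_timestamp);
    return slot_id % RING_BUFFER_SIZE;   // no map lookup
}
```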
---
### Component 3: Polling Thread (Background Status Checker)
**Purpose**: Continuously poll `m_returnIndex` slot for decode completion
**Purpose**: Continuously poll `m_returnIdCounter` slot for decode completion (multi-frame support)
```cpp
void NVDECAV1Decoder::PollingThreadFunc() {
while (m_pollingRunning) {
// 1. Get current return slot (oldest pending decode)
size_t current_return_idx = m_returnIndex.load();
DecodeSlot& slot = m_ringBuffer[current_return_idx];
// 1. Get current return ID and calculate slot index
uint64_t current_return_id = m_returnIdCounter.load();
size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
DecodeSlot& slot = m_ringBuffer[slot_idx];
// 2. Check if slot is in use and not yet ready
if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) {
if (slot.in_use && !slot.is_ready) {
// 3. Query NVDEC for decode status
CUVIDGETDECODESTATUS decodeStatus = {};
CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
// 3. Get copy of picture indices (multi-frame support)
std::vector<int> picture_indices_copy;
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
picture_indices_copy = slot.picture_indices;
}
if (result == CUDA_SUCCESS) {
if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
// ✅ Decode complete!
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true;
}
// 4. Check if ALL frames are decoded
bool all_complete = true;
for (int pic_idx : picture_indices_copy) {
CUVIDGETDECODESTATUS decodeStatus = {};
CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
// Wake up waiting DecodeToSurface thread
slot.frame_ready.notify_one();
OutputDebugStringA("[Polling] Slot ready\n");
if (result != CUDA_SUCCESS ||
decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
all_complete = false;
break;
}
else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error) {
// Decode error - mark as ready to unblock
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true; // Error also counts as "ready"
}
slot.frame_ready.notify_one();
}
OutputDebugStringA("[Polling] Decode error\n");
// 5. If all frames complete, signal ready
if (all_complete && !picture_indices_copy.empty()) {
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true;
}
// cuvidDecodeStatus_InProgress → keep polling
slot.frame_ready.notify_one();
}
}
// 4. Sleep to avoid busy-wait
// 6. Sleep to avoid busy-wait
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
}
```
**Key Points**:
- ✅ Only polls `m_returnIndex` slot (not all 8 slots) → efficient
- ✅ Only polls `m_returnIdCounter` slot (not all 8 slots) → efficient
-**Multi-frame support**: Checks ALL picture_indices for completion
- ✅ Uses `cuvidGetDecodeStatus()` non-blocking query
- ✅ 100us sleep → ~10,000 checks/second (low CPU usage)
-Handles decode errors gracefully
-Thread-safe picture_indices copy to avoid lock contention
---
### Component 4: Sequential Return Wait
### Component 4: Sequential Return Wait (FIFO Guarantee)
**Purpose**: Enforce FIFO order even when decodes complete out-of-order
**Purpose**: Enforce FIFO order even when decodes complete out-of-order using Option C
```cpp
// In DecodeToSurface() - PHASE 2
// 5. Wait for my turn (sequential return order)
while (m_returnIndex.load() != my_slot_idx) {
// 5. Wait for my turn (FIFO order enforcement)
while (m_returnIdCounter.load() != my_id) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
@@ -366,44 +404,47 @@ while (m_returnIndex.load() != my_slot_idx) {
// Timeout - decode took too long
LogError("Decode timeout");
my_slot.in_use = false;
m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock
m_returnIdCounter.fetch_add(1); // Skip this slot to avoid deadlock
return false;
}
}
```
**Timeline Example**:
**🎯 Option C Timeline Example** (using slot ID, not slot index):
```
Thread 1 (slot 0): Wait for returnIndex==0 ✅ (immediate)
Wait for is_ready...
Thread 1 (ID=17, slot 1): Wait for returnIdCounter==17 ✅ (immediate)
Wait for is_ready...
Thread 2 (slot 1): Wait for returnIndex==1 ⏸️ (blocked)
Thread 2 (ID=18, slot 2): Wait for returnIdCounter==18 ⏸️ (blocked)
Thread 3 (slot 2): Wait for returnIndex==2 ⏸️ (blocked)
Thread 3 (ID=19, slot 3): Wait for returnIdCounter==19 ⏸️ (blocked)
GPU: packet2 completes first @ t=3ms
GPU: ID=18 completes first @ t=3ms
→ slot[2].is_ready = true
→ Thread 2 still blocked (returnIdCounter=17)
GPU: ID=17 completes @ t=15ms
→ slot[1].is_ready = true
→ Thread 2 still blocked (returnIndex=0)
GPU: packet1 completes @ t=15ms
→ slot[0].is_ready = true
→ Thread 1 wakes up ✅
→ Thread 1 processes → returnIndex = 1
→ Thread 1 processes → returnIdCounter = 18
→ Thread 2 now unblocked ✅
```
**Key Point**: Wait on **slot_id** (my_id), not slot_idx, for correct FIFO ordering
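A minimal sketch (hypothetical IDs) of the aliasing hazard this rule avoids: after one full lap of the ring, two live IDs share a slot index, so comparing against `slot_idx` cannot tell the two waiters apart:
```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t RING_BUFFER_SIZE = 8;

int main() {
    uint64_t id_a = 2;   // submitted early
    uint64_t id_b = 10;  // submitted one full ring-lap later
    std::printf("ID=2  -> slot[%zu]\n", static_cast<size_t>(id_a % RING_BUFFER_SIZE));
    std::printf("ID=10 -> slot[%zu]\n", static_cast<size_t>(id_b % RING_BUFFER_SIZE));
    // Both print slot[2]: a wait on "returnIndex == slot_idx" could wake either
    // thread. A wait on "returnIdCounter == my_id" wakes exactly one thread,
    // in submission order, because IDs never repeat.
    return 0;
}
```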
---
### Component 5: Frame Retrieval & Cleanup
### Component 5: Frame Retrieval & Cleanup (Multi-Frame Support)
**Purpose**: Map decoded frame, copy to surface, release slot
```cpp
// In DecodeToSurface() - PHASE 3
int frameIdx = my_slot.picture_index;
// 7. Get first frame from multi-frame packet
int frameIdx = my_slot.picture_indices[0]; // Return first frame only
// 7. Map decoded frame from NVDEC
// 8. Map decoded frame from NVDEC
CUVIDPROCPARAMS procParams = {};
procParams.progressive_frame = 1;
@@ -413,148 +454,178 @@ unsigned int srcPitch = 0;
CUresult result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
if (result != CUDA_SUCCESS) {
my_slot.in_use = false;
m_returnIndex.fetch_add(1);
m_returnIdCounter.fetch_add(1);
return false;
}
// 8. Copy to D3D12 surface
// 9. Copy to D3D12 surface
ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
bool copySuccess = m_d3d12Handler->CopyNV12Frame(
srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height
);
// 9. Unmap frame
// 10. Unmap frame
cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
// 10. Release slot
// 11. Release slot
{
std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
my_slot.in_use = false;
}
// 11. Advance return index
m_returnIndex.fetch_add(1);
// 12. Advance return ID counter (FIFO order)
m_returnIdCounter.fetch_add(1);
return copySuccess;
```
**Multi-Frame Decision**: Return **first frame only** from multi-frame packet (indices[0])
---
### Component 6: NVDEC Callback Integration
### Component 6: NVDEC Callback Integration (Option C)
**Purpose**: Link NVDEC picture_index to RingBuffer slot
**Purpose**: Link NVDEC picture_index to RingBuffer slot using direct slot_id calculation
```cpp
int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDISPINFO* disp_info) {
int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) {
auto* decoder = static_cast<NVDECAV1Decoder*>(user_data);
// Extract slot index from timestamp
size_t slot_idx = static_cast<size_t>(disp_info->timestamp) % RING_BUFFER_SIZE;
// 🎯 Option C: Direct slot_id → slot_idx calculation (no map lookup!)
uint64_t slot_id = static_cast<uint64_t>(pic_params->nTimeStamp);
size_t slot_idx = slot_id % RING_BUFFER_SIZE;
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
// Store NVDEC picture index
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.picture_index = disp_info->picture_index;
// Submit frame to NVDEC decoder
CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params);
if (result != CUDA_SUCCESS) {
decoder->LogCUDAError(result, "cuvidDecodePicture failed");
return 0;
}
// Polling thread will check cuvidGetDecodeStatus() for this picture_index
// Store picture_index for polling (multi-frame support)
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.picture_indices.push_back(pic_params->CurrPicIdx);
}
// Polling thread will check cuvidGetDecodeStatus() for ALL picture_indices
return 1;
}
```
**🎯 Option C Key Advantages**:
- ✅ **No mapping overhead**: Direct modulo calculation, no unordered_map lookup (see the contrast sketch after this list)
-**No mutex contention**: No global map mutex needed
-**Multi-frame support**: Automatically handles multiple frames per packet
-**Deterministic**: Same slot_id always maps to same slot_idx
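For contrast, a sketch of the map-based lookup these points rule out (hypothetical code, not from this codebase) next to the Option C one-liner:
```cpp
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <unordered_map>

constexpr size_t RING_BUFFER_SIZE = 8;

// Hypothetical map-based alternative: every callback pays a global lock plus
// a hash lookup, and entries must be inserted on submit and erased on return
// or they go stale.
std::mutex g_mapMutex;
std::unordered_map<uint64_t, size_t> g_idToSlot;

size_t lookup_slot_with_map(uint64_t slot_id) {
    std::lock_guard<std::mutex> lock(g_mapMutex);
    return g_idToSlot.at(slot_id);  // throws if the entry is missing or stale
}

// Option C: the same answer, lock-free and allocation-free.
size_t lookup_slot_option_c(uint64_t slot_id) {
    return slot_id % RING_BUFFER_SIZE;
}
```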
---
## 📐 Implementation Plan
## 📐 Implementation Plan (Updated for Option C)
### Phase 1: Data Structure Setup ✅
**Files to Modify**:
- `NVDECAV1Decoder.h` - Add RingBuffer members
- `NVDECAV1Decoder.h` - Add RingBuffer members with Option C design
- `NVDECAV1Decoder.cpp` - Initialize RingBuffer in constructor
**Tasks**:
- [x] Define `DecodeSlot` structure
- [x] Define `DecodeSlot` structure with `std::vector<int> picture_indices`
- [x] Add `m_ringBuffer[8]` array
- [x] Add `m_submitIndex`, `m_returnIndex` atomic counters
- [x] **🎯 Option C**: Add `m_slotIdCounter`, `m_returnIdCounter` atomic counters (NOT submitIndex/returnIndex)
- [x] Add `m_pollingThread`, `m_pollingRunning` members
**Estimated Time**: 30 minutes
---
### Phase 2: Polling Thread Implementation
### Phase 2: Polling Thread Implementation (Multi-Frame Support)
**Files to Modify**:
- `NVDECAV1Decoder.cpp` - Implement `PollingThreadFunc()`
**Tasks**:
- [ ] Implement polling loop with `cuvidGetDecodeStatus()`
- [ ] Implement polling loop with `cuvidGetDecodeStatus()` checking ALL picture_indices
- [ ] Poll slot at `m_returnIdCounter % RING_BUFFER_SIZE` (NOT all slots)
- [ ] Add thread start in `Initialize()`
- [ ] Add thread stop in `Cleanup()`
- [ ] Add debug logging for polling events
- [ ] Add debug logging for multi-frame packet events
**Testing**:
- Verify thread starts/stops correctly
- Verify multi-frame packet handling (all frames checked)
- Verify `cuvidGetDecodeStatus()` calls work
**Estimated Time**: 1 hour
---
### Phase 3: DecodeToSurface Refactoring
### Phase 3: DecodeToSurface Refactoring (Option C Implementation)
**Files to Modify**:
- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()`
- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with Option C design
**Tasks**:
- [ ] Phase 1: Slot allocation logic
- [ ] Phase 2: Sequential return wait logic
- [ ] Phase 3: Frame retrieval & cleanup logic
- [ ] **🎯 Phase 1**: Slot allocation using `my_id = m_slotIdCounter.fetch_add(1)`
- [ ] **🎯 Phase 2**: Packet submission with `packet.timestamp = my_id` (full slot_id, NOT modulo!)
- [ ] **🎯 Phase 3**: FIFO wait using `while(m_returnIdCounter != my_id)` (NOT slot_idx!)
- [ ] **🎯 Phase 4**: Frame retrieval from `picture_indices[0]` (first frame only)
- [ ] **🎯 Phase 5**: Cleanup and `m_returnIdCounter.fetch_add(1)`
- [ ] Error handling for all failure paths
**Testing**:
- Single-threaded decode test
- Multi-threaded decode test (2-3 threads)
- Verify packet-surface mapping correctness
- Verify packet-surface mapping correctness with multi-frame packets
- Verify FIFO ordering with out-of-order completion
**Estimated Time**: 2 hours
---
### Phase 4: HandlePictureDisplay Update
### Phase 4: HandlePictureDecode Update (Option C)
**Files to Modify**:
- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDisplay()`
- `NVDECAV1Decoder.cpp` - Modify `HandlePictureDecode()` callback
**Tasks**:
- [ ] Extract slot_idx from timestamp
- [ ] Store picture_index in correct slot
- [ ] Add debug logging
- [ ] **🎯 Option C**: Extract `slot_id` from `pic_params->nTimeStamp`
- [ ] **🎯 Option C**: Calculate `slot_idx = slot_id % RING_BUFFER_SIZE` (NO map lookup!)
- [ ] **🎯 Multi-frame**: Use `slot.picture_indices.push_back(CurrPicIdx)` (NOT single index!)
- [ ] Add debug logging for multi-frame packets
**Testing**:
- Verify timestamp → slot_idx mapping
- Verify picture_index stored correctly
- Verify timestamp → slot_idx direct calculation works
- Verify picture_indices vector correctly stores multiple frames
- Test with video that has multi-frame packets (test_720p_stripe.webm)
**Estimated Time**: 30 minutes
---
### Phase 5: Integration Testing
### Phase 5: Integration Testing (Option C Validation)
**Test Scenarios**:
1. **Single packet decode** - Verify basic functionality
2. **Sequential 3 packets** - Verify FIFO order
3. **Out-of-order completion** - Verify correct mapping (I-frame after P-frame)
4. **RingBuffer overflow** - Verify error handling (9+ concurrent calls)
5. **Decode errors** - Verify graceful failure
6. **Performance benchmark** - Measure latency reduction
1. **Single packet decode** - Verify Option C basic functionality
2. **Multi-frame packet** - Verify vector-based picture_indices handling (test_720p_stripe.webm)
3. **Sequential 3 packets** - Verify FIFO order using m_returnIdCounter
4. **Out-of-order completion** - Verify slot_id → slot_idx mapping (I-frame after P-frame)
5. **RingBuffer overflow** - Verify error handling (9+ concurrent calls)
6. **Decode errors** - Verify graceful failure with multi-frame packets
7. **Performance benchmark** - Measure latency reduction vs old queue-based approach
**Test Files**:
- Simple test video (simple_test.webm)
- Complex GOP structure video (test_720p_stripe.webm)
- Simple test video (simple_test.webm) - basic validation
- **Multi-frame packet video (test_720p_stripe.webm)** - critical multi-frame test ⚠️
**Validation Criteria**:
- ✅ No slot ID → slot_idx mapping errors
- ✅ All frames from multi-frame packets detected and polled
- ✅ FIFO order maintained even with out-of-order GPU completion
- ✅ No memory corruption or race conditions
**Estimated Time**: 2 hours
@@ -563,15 +634,63 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
### Phase 6: Documentation & Cleanup
**Tasks**:
- [ ] Update NVDEC design documentation
- [ ] Add inline code comments
- [ ] Remove old queue-based code
- [ ] Move design doc to `docs/completed/`
- [x] Update NVDEC design documentation with Option C and multi-frame support
- [ ] Add inline code comments explaining Option C design choices
- [ ] Remove old queue-based code and any ID→Index mapping attempts
- [ ] Move design doc to `docs/completed/` after successful implementation
- [ ] Document multi-frame packet behavior and first-frame-only decision
**Estimated Time**: 1 hour
---
## 🎯 Option C Design Summary
### **Core Principle**: Eliminate mapping overhead through deterministic calculation
**Key Components**:
1. **Single Counter for Dual Purpose**: `m_slotIdCounter` serves as both unique ID and submission order
2. **Direct Slot Calculation**: `slot_idx = slot_id % RING_BUFFER_SIZE` (no map needed)
3. **FIFO via ID Comparison**: `while(m_returnIdCounter != my_id)` ensures ordering
4. **Multi-Frame Vector**: `std::vector<int> picture_indices` handles packets with multiple frames
**Data Flow**:
```
DecodeToSurface:
my_id = m_slotIdCounter++ (e.g., 17)
slot_idx = 17 % 8 = 1
packet.timestamp = 17
HandlePictureDecode:
slot_id = pic_params->nTimeStamp (17)
slot_idx = 17 % 8 = 1
m_ringBuffer[1].picture_indices.push_back(CurrPicIdx)
PollingThread:
return_id = m_returnIdCounter (17)
slot_idx = 17 % 8 = 1
Check all m_ringBuffer[1].picture_indices[]
DecodeToSurface (wait):
while(m_returnIdCounter != 17) { wait }
Process frame from picture_indices[0]
m_returnIdCounter++ (18)
```
**Eliminated Complexity**:
- ❌ No `std::unordered_map<uint64_t, size_t>` mapping
- ❌ No global map mutex
- ❌ No map insert/erase operations
- ❌ No lookup failures or stale entries
**Multi-Frame Packet Handling**:
- One packet → multiple HandlePictureDecode calls
- All frames stored in `picture_indices` vector
- PollingThread checks ALL frames complete (restated as a predicate in the sketch below)
- Return first frame only (`picture_indices[0]`)
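The completion rule restated as a compact predicate (a sketch; `frame_decoded` is a hypothetical stand-in for the `cuvidGetDecodeStatus` check from Component 3):
```cpp
#include <algorithm>
#include <vector>

// Hypothetical stand-in for "cuvidGetDecodeStatus() returned CUDA_SUCCESS and
// decodeStatus == cuvidDecodeStatus_Success" for one picture index.
bool frame_decoded(int picture_index);

// A slot is ready only when the packet produced at least one frame and every
// one of those frames has finished decoding.
bool slot_ready(const std::vector<int>& picture_indices) {
    return !picture_indices.empty() &&
           std::all_of(picture_indices.begin(), picture_indices.end(),
                       frame_decoded);
}
```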
---
## 📊 Performance Analysis
### Expected Improvements

View File

@@ -879,8 +879,8 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
}
char debug_buf[512];
sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d, ref_pic_flag=%d)\n",
decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag, pic_params->ref_pic_flag);
sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, IntraPicFlag=%d)\n",
decoder->m_decoder, pic_params->CurrPicIdx, pic_params->intra_pic_flag);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
@@ -893,21 +893,7 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
return 0;
}
// Store picture_index in ring buffer slot for polling thread
// The slot index was embedded in the timestamp by DecodeToSurface
// Note: pic_params doesn't have timestamp, so we use CurrPicIdx as picture_index
// and let the polling thread find the right slot
int picture_index = pic_params->CurrPicIdx;
size_t submit_idx = decoder->m_submitIndex.load() - 1; // Last submitted slot
DecodeSlot& slot = decoder->m_ringBuffer[submit_idx % decoder->RING_BUFFER_SIZE];
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.picture_index = picture_index;
}
sprintf_s(debug_buf, "[HandlePictureDecode] Success - stored picture_index=%d in slot %zu\n",
picture_index, submit_idx % decoder->RING_BUFFER_SIZE);
sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture succeeded (CurrPicIdx=%d)\n", pic_params->CurrPicIdx);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
@@ -921,34 +907,45 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDI
return 0;
}
// Option C: Extract slot_id from timestamp and calculate slot_idx directly (NO map lookup!)
uint64_t slot_id = static_cast<uint64_t>(disp_info->timestamp);
size_t slot_idx = slot_id % decoder->RING_BUFFER_SIZE;
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
char debug_buf[256];
sprintf_s(debug_buf, "[HandlePictureDisplay] Frame decoded: picture_index=%d, timestamp=%lld\n",
disp_info->picture_index, disp_info->timestamp);
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: slot_id=%llu, slot_idx=%zu, picture_index=%d\n",
slot_id, slot_idx, disp_info->picture_index);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
// ===== Phase 4: RingBuffer Integration =====
// Extract slot index from timestamp (set in DecodeToSurface)
size_t slot_idx = static_cast<size_t>(disp_info->timestamp);
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx % decoder->RING_BUFFER_SIZE];
sprintf_s(debug_buf, "[HandlePictureDisplay] Updating RingBuffer slot %zu with picture_index=%d\n",
slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index);
OutputDebugStringA(debug_buf);
// IMPORTANT: pfnDisplayPicture is called AFTER GPU decoding completes
// So we can directly mark the frame as ready without polling
// Option C Multi-Frame Support: Store picture_index in vector (one packet can have multiple frames)
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.picture_index = disp_info->picture_index;
slot.is_ready = true; // Frame is already decoded and ready
slot.picture_indices.push_back(disp_info->picture_index);
}
// Signal waiting thread that frame is ready
slot.frame_ready.notify_one();
sprintf_s(debug_buf, "[HandlePictureDisplay] Slot %zu marked ready (picture_index=%d)\\n",
slot_idx % decoder->RING_BUFFER_SIZE, disp_info->picture_index);
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Pushed picture_index=%d to slot %zu (total frames: %zu)\n",
disp_info->picture_index, slot_idx, slot.picture_indices.size());
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
// IMPORTANT: HandlePictureDisplay is called AFTER decoding completes
// So we can mark the slot as ready immediately (no polling needed for this callback)
bool should_notify = false;
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
if (!slot.is_ready) {
slot.is_ready = true;
should_notify = true;
}
}
if (should_notify) {
slot.frame_ready.notify_one();
sprintf_s(debug_buf, "[HandlePictureDisplay] Option C: Slot %zu marked ready and notified\n", slot_idx);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
}
return 1;
}
@@ -1151,12 +1148,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
try {
auto start_time = std::chrono::high_resolution_clock::now();
// ===== Phase 3: RingBuffer Slot Allocation =====
// 1. Allocate next available slot (producer)
size_t my_slot_idx = m_submitIndex.fetch_add(1);
DecodeSlot& my_slot = m_ringBuffer[my_slot_idx % RING_BUFFER_SIZE];
// ===== Phase 1: Option C Slot Allocation =====
// 1. Allocate unique slot ID (serves as both ID and submission order)
uint64_t my_id = m_slotIdCounter.fetch_add(1);
size_t slot_idx = my_id % RING_BUFFER_SIZE;
DecodeSlot& my_slot = m_ringBuffer[slot_idx];
sprintf_s(debug_buf, "[DecodeToSurface] Allocating slot %zu (submitIndex=%zu)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot_idx);
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Allocated slot_id=%llu, slot_idx=%zu\n", my_id, slot_idx);
OutputDebugStringA(debug_buf);
// 2. Check for overflow (RingBuffer full)
@@ -1165,17 +1163,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
if (my_slot.in_use) {
// RingBuffer overflow - too many concurrent decodes
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", my_slot_idx % RING_BUFFER_SIZE);
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", slot_idx);
OutputDebugStringA(debug_buf);
LogError("RingBuffer overflow - max 8 concurrent decodes");
return false;
}
// 3. Initialize slot
// 3. Initialize slot (multi-frame support)
my_slot.in_use = true;
my_slot.target_surface = target_surface;
my_slot.surface_type = target_type;
my_slot.picture_index = -1; // Will be set by HandlePictureDisplay
my_slot.picture_indices.clear(); // Clear previous frames
my_slot.is_ready = false;
}
@@ -1191,14 +1189,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
printf("%s", debug_buf);
}
// 5. Submit packet to NVDEC parser with slot index in timestamp
// 5. Option C: Submit packet with full slot_id in timestamp (NOT modulo!)
CUVIDSOURCEDATAPACKET packet = {};
packet.payload = final_packet_data;
packet.payload_size = static_cast<unsigned long>(final_packet_size);
packet.flags = CUVID_PKT_ENDOFPICTURE;
packet.timestamp = static_cast<int64_t>(my_slot_idx); // Embed slot index in timestamp
packet.timestamp = static_cast<int64_t>(my_id); // Pass full slot_id for HandlePictureDecode
sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot %zu)...\n", packet.timestamp, my_slot_idx % RING_BUFFER_SIZE);
sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot_id=%llu, slot_idx=%zu)...\n", packet.timestamp, my_id, slot_idx);
OutputDebugStringA(debug_buf);
CUresult result = cuvidParseVideoData(m_parser, &packet);
@@ -1217,49 +1215,60 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
}
OutputDebugStringA("[DecodeToSurface] cuvidParseVideoData succeeded\n");
// ===== Component 4: Sequential Return Wait =====
// 5. Wait for my turn (enforce FIFO order)
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for slot %zu to be return head (current returnIndex=%zu)...\n",
my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load());
OutputDebugStringA(debug_buf);
while (m_returnIndex.load() != my_slot_idx) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu is now return head\n", my_slot_idx % RING_BUFFER_SIZE);
OutputDebugStringA(debug_buf);
// 6. Wait for decode to complete (polling thread signals this)
// ===== Component 4: Wait for decode to complete FIRST =====
// 6. Wait for decode to complete (HandlePictureDisplay signals this)
{
std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot %zu, is_ready=%d)...\n",
my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready);
sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot_idx=%zu, is_ready=%d)...\n",
slot_idx, my_slot.is_ready);
OutputDebugStringA(debug_buf);
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(5000),
[&my_slot]() { return my_slot.is_ready; })) {
// Timeout - decode took too long
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot %zu (is_ready=%d)\n",
my_slot_idx % RING_BUFFER_SIZE, my_slot.is_ready);
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot_idx=%zu (is_ready=%d)\n",
slot_idx, my_slot.is_ready);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
LogError("Decode timeout");
my_slot.in_use = false;
m_returnIndex.fetch_add(1); // Skip this slot to avoid deadlock
m_returnIdCounter.fetch_add(1); // Option C: Advance to unblock next thread
return false;
}
sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot %zu)\n", my_slot_idx % RING_BUFFER_SIZE);
sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot_idx=%zu)\n", slot_idx);
OutputDebugStringA(debug_buf);
}
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu decode complete (picture_index=%d)\n", my_slot_idx % RING_BUFFER_SIZE, my_slot.picture_index);
// ===== Component 5: Option C Sequential Return Wait (FIFO Guarantee) =====
// 5. Wait for my turn using slot_id (NOT slot_idx!) AFTER decoding completes
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Waiting for FIFO turn slot_id=%llu (current returnIdCounter=%llu)...\n",
my_id, m_returnIdCounter.load());
OutputDebugStringA(debug_buf);
// ===== Component 5: Frame Retrieval & Cleanup =====
int frameIdx = my_slot.picture_index;
while (m_returnIdCounter.load() != my_id) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
sprintf_s(debug_buf, "[DecodeToSurface] Option C: slot_id=%llu is now return head\n", my_id);
OutputDebugStringA(debug_buf);
// ===== Component 5: Option C Frame Retrieval (Multi-Frame Support) =====
// Get first frame from multi-frame packet (picture_indices[0])
if (my_slot.picture_indices.empty()) {
sprintf_s(debug_buf, "[DecodeToSurface] ERROR: No picture_indices in slot %zu\n", slot_idx);
OutputDebugStringA(debug_buf);
LogError("No frames decoded");
my_slot.in_use = false;
m_returnIdCounter.fetch_add(1);
return false;
}
int frameIdx = my_slot.picture_indices[0]; // Return first frame only
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu decode complete, using first frame (picture_index=%d, total_frames=%zu)\n",
slot_idx, frameIdx, my_slot.picture_indices.size());
OutputDebugStringA(debug_buf);
// For CUDA device surface, we need to map the decoded frame
if (target_type == VAVCORE_SURFACE_CUDA_DEVICE) {
@@ -1275,7 +1284,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
if (result != CUDA_SUCCESS) {
LogCUDAError(result, "cuvidMapVideoFrame");
my_slot.in_use = false;
m_returnIndex.fetch_add(1);
m_returnIdCounter.fetch_add(1);
return false;
}
@@ -1301,7 +1310,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
OutputDebugStringA("[DecodeToSurface] ERROR: target_surface or m_d3d12Device is null\n");
LogError("D3D12 resource or device not available");
my_slot.in_use = false;
m_returnIndex.fetch_add(1);
m_returnIdCounter.fetch_add(1);
return false;
}
@@ -1340,7 +1349,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
OutputDebugStringA(debug_buf);
LogCUDAError(result, "cuvidMapVideoFrame");
my_slot.in_use = false;
m_returnIndex.fetch_add(1);
m_returnIdCounter.fetch_add(1);
return false;
}
@@ -1367,7 +1376,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
if (!copySuccess) {
OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n");
my_slot.in_use = false;
m_returnIndex.fetch_add(1);
m_returnIdCounter.fetch_add(1);
return false;
}
@@ -1391,10 +1400,10 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
my_slot.in_use = false;
}
// 11. Advance return index
m_returnIndex.fetch_add(1);
// 11. Option C: Advance return ID counter (FIFO order)
m_returnIdCounter.fetch_add(1);
sprintf_s(debug_buf, "[DecodeToSurface] Slot %zu released, returnIndex advanced to %zu\n", my_slot_idx % RING_BUFFER_SIZE, m_returnIndex.load());
sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu released, returnIdCounter advanced to %llu\n", slot_idx, m_returnIdCounter.load());
OutputDebugStringA(debug_buf);
// Update statistics
@@ -1455,63 +1464,53 @@ void NVDECAV1Decoder::PollingThreadFunc() {
int poll_count = 0;
while (m_pollingRunning) {
// Get current return slot (oldest pending decode)
size_t current_return_idx = m_returnIndex.load();
size_t slot_idx = current_return_idx % RING_BUFFER_SIZE;
// Option C: Get current return ID and calculate slot index
uint64_t current_return_id = m_returnIdCounter.load();
size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
DecodeSlot& slot = m_ringBuffer[slot_idx];
// Debug logging every 100 iterations
if (poll_count % 100 == 0) {
char debug_buf[256];
sprintf_s(debug_buf, "[PollingThread] Poll #%d: returnIdx=%zu, slot=%zu, in_use=%d, is_ready=%d, picture_index=%d\n",
poll_count, current_return_idx, slot_idx, slot.in_use, slot.is_ready, slot.picture_index);
sprintf_s(debug_buf, "[PollingThread] Option C: Poll #%d: returnId=%llu, slot=%zu, in_use=%d, is_ready=%d, frames=%zu\n",
poll_count, current_return_id, slot_idx, slot.in_use, slot.is_ready, slot.picture_indices.size());
OutputDebugStringA(debug_buf);
}
poll_count++;
// Check if slot is in use and not yet ready
if (slot.in_use && !slot.is_ready && slot.picture_index >= 0) {
// Query NVDEC for decode status
CUVIDGETDECODESTATUS decodeStatus = {};
CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
if (result == CUDA_SUCCESS) {
if (decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
// Decode complete!
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true;
}
slot.frame_ready.notify_one();
char debug_buf[128];
sprintf_s(debug_buf, "[PollingThread] Slot %zu decode complete (picture_index=%d)\n",
slot_idx, slot.picture_index);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
}
else if (decodeStatus.decodeStatus == cuvidDecodeStatus_Error ||
decodeStatus.decodeStatus == cuvidDecodeStatus_Error_Concealed) {
// Decode error - mark as ready to unblock
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true;
}
slot.frame_ready.notify_one();
char debug_buf[128];
sprintf_s(debug_buf, "[PollingThread] Slot %zu decode error (status=%d)\n",
slot_idx, decodeStatus.decodeStatus);
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
}
// else: decodeStatus.decodeStatus == cuvidDecodeStatus_InProgress, continue polling
if (slot.in_use && !slot.is_ready) {
// Option C Multi-Frame Support: Get copy of picture indices
std::vector<int> picture_indices_copy;
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
picture_indices_copy = slot.picture_indices;
}
else {
// cuvidGetDecodeStatus failed
char debug_buf[128];
sprintf_s(debug_buf, "[PollingThread] cuvidGetDecodeStatus failed for slot %zu (error=%d)\n",
slot_idx, result);
// Check if ALL frames are decoded
bool all_complete = true;
for (int pic_idx : picture_indices_copy) {
CUVIDGETDECODESTATUS decodeStatus = {};
CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
if (result != CUDA_SUCCESS ||
decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
all_complete = false;
break;
}
}
// If all frames complete, signal ready
if (all_complete && !picture_indices_copy.empty()) {
{
std::lock_guard<std::mutex> lock(slot.slot_mutex);
slot.is_ready = true;
}
slot.frame_ready.notify_one();
char debug_buf[256];
sprintf_s(debug_buf, "[PollingThread] Option C: Slot %zu ALL frames complete (%zu frames)\n",
slot_idx, picture_indices_copy.size());
OutputDebugStringA(debug_buf);
printf("%s", debug_buf);
}

View File

@@ -152,20 +152,21 @@ private:
void* target_surface = nullptr; // Destination D3D12 resource
VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
// NVDEC information (from HandlePictureDisplay callback)
int picture_index = -1; // NVDEC frame index for cuvidMapVideoFrame
// NVDEC information (from HandlePictureDecode callback)
// Multi-frame support: One packet can decode to multiple frames
std::vector<int> picture_indices; // All NVDEC frame indices from this packet
// Synchronization primitives
std::condition_variable frame_ready; // Signaled when decode complete
std::condition_variable frame_ready; // Signaled when ALL frames are decoded
std::mutex slot_mutex; // Protects this slot's state
bool is_ready = false; // Decode completed flag
bool is_ready = false; // All frames decoded flag
};
DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
// Producer-consumer indices
std::atomic<size_t> m_submitIndex{0}; // Next slot to allocate (producer)
std::atomic<size_t> m_returnIndex{0}; // Next slot to return (consumer)
// Option C: Unified slot allocation counters (no mapping needed!)
std::atomic<uint64_t> m_slotIdCounter{0}; // Monotonically increasing slot ID (submission order)
std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
// Polling thread for cuvidGetDecodeStatus
std::thread m_pollingThread;