diff --git a/prompt-redesign-ringbuffer.txt b/prompt-redesign-ringbuffer.txt
index 72c390a..265b3c0 100644
--- a/prompt-redesign-ringbuffer.txt
+++ b/prompt-redesign-ringbuffer.txt
@@ -1547,3 +1547,238 @@
 4. Variable names are clear (m_slotIdCounter, m_returnIdCounter)
 
 Shall I implement it this way?
+
+
+
+> I believe the real reason the NVDEC API does not support per-picture user_data is that it processes each picture through a ring-buffer slot index,
+and hands post-processing off through that slot.
+
+● Exactly right - that is precisely NVDEC's design philosophy.
+
+  NVDEC's ring-buffer-based design:
+
+  // The picture index NVDEC provides (CurrPicIdx)
+  typedef struct _CUVIDPICPARAMS {
+      int CurrPicIdx;   // ← ring buffer slot index assigned by NVDEC (0-7)
+      // ...
+  } CUVIDPICPARAMS;
+
+  // The usage pattern NVDEC expects:
+  // 1. HandlePictureDecode: decode using CurrPicIdx
+  // 2. HandlePictureDisplay: picture_index tells you which slot is ready
+  // 3. Application: access that slot's surface via picture_index
+
+  Why NVDEC provides no user_data:
+
+  1. The ring buffer index already acts as the context:
+     - CurrPicIdx (0-7) is the unique identifier
+     - The application indexes its own metadata array with it
+  2. The structure NVDEC expects:
+     // The application design NVIDIA expects:
+     struct AppFrameContext {
+         void* surface;
+         uint64_t timestamp;
+         // ... other metadata
+     };
+
+     AppFrameContext g_frameContexts[8];  // Same size as the ring buffer
+
+     // HandlePictureDecode:
+     int idx = pic_params->CurrPicIdx;
+     g_frameContexts[idx].timestamp = current_timestamp;
+
+     // HandlePictureDisplay:
+     int idx = disp_info->picture_index;
+     DisplayFrame(g_frameContexts[idx]);
+
+  Our problem:
+
+  We use a ring buffer structure that differs from NVDEC's ring buffer (8 slots):
+  - NVDEC: CurrPicIdx (0-7, managed internally by NVDEC)
+  - VavCore: slot_id (monotonic counter), slot_idx = slot_id % 8
+
+  The point of conflict:
+
+  // What NVDEC gives us:
+  CurrPicIdx = 3            // slot 3 of NVDEC's internal ring buffer
+
+  // What we want:
+  my_id = 42                // the 42nd frame
+  slot_idx = 42 % 8 = 2     // slot 2 of our ring buffer
+
+  // ❌ CurrPicIdx ≠ slot_idx (two different ring buffers!)
+
+  Solutions:
+
+  Option 1: Use NVDEC's CurrPicIdx directly (NVIDIA's recommended approach)
+  // Manage DecodeSlot keyed by CurrPicIdx
+  DecodeSlot m_ringBuffer[8];  // 1:1 with NVDEC's ring buffer
+
+  // In HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot].target_surface = /* how do we know this? */;
+  Problem: still needs a way to link DecodeToSurface → HandlePictureDecode
+
+  Option 2: m_currentSubmittingSlotId (current approach)
+  // DecodeToSurface:
+  m_currentSubmittingSlotId.store(my_id);  // set just before submission
+  cuvidParseVideoData(...);                // synchronous call
+
+  // HandlePictureDecode (called immediately inside cuvidParseVideoData):
+  uint64_t slot_id = decoder->m_currentSubmittingSlotId.load();
+  decoder->m_pictureIndexToSlotId[pic_params->CurrPicIdx] = slot_id;  // store mapping
+
+  Option 3: Submission queue
+  std::mutex m_submissionMutex;
+  std::queue<uint64_t> m_pendingSlotIds;
+
+  // DecodeToSurface:
+  {
+      std::lock_guard<std::mutex> lock(m_submissionMutex);
+      m_pendingSlotIds.push(my_id);
+  }
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  {
+      std::lock_guard<std::mutex> lock(m_submissionMutex);
+      uint64_t slot_id = m_pendingSlotIds.front();
+      m_pendingSlotIds.pop();
+      m_pictureIndexToSlotId[pic_params->CurrPicIdx] = slot_id;
+  }
+
+  Conclusion:
+
+  You are right. NVDEC provides the CurrPicIdx ring buffer index, so it assumes no separate user_data is needed. But because we run our own
+  slot_id system, we need an intermediate step that maps NVDEC's CurrPicIdx to our slot_id.
+
+  The m_currentSubmittingSlotId atomic is the simplest, safest way to do that mapping. Since cuvidParseVideoData() is a synchronous call, there is
+  no race condition either.
+
+  I think removing ParseContext and reverting to m_currentSubmittingSlotId is the right move.
+
+> Was there a reason we could not merge CurrPicIdx and slot_id into one?
+
+● Excellent question! Thinking again about why they cannot be merged, there is actually no fundamental reason they cannot be.
+
+  Let me analyze why we currently use two indexes:
+
+● Search(pattern: "CurrPicIdx|slot_id|slot_idx", path: "vav2\platforms\windows\vavcore\src\Decoder\NVDECAV1Decoder.cpp", output_mode: "content", head_limit: 50)
+  ⎿  Found 50 lines
+
+● Search(pattern: "slot_idx = |DecodeSlot\[|m_ringBuffer\[", path: "vav2\platforms\windows\vavcore\src\Decoder\NVDECAV1Decoder.cpp", output_mode: "content", head_limit: 30)
+  ⎿  Found 6 lines
+
+● Excellent point! We can use CurrPicIdx directly.
+
+  The problem with the current structure:
+
+  // Current: needlessly complex
+  slot_id = m_slotIdCounter.fetch_add(1);  // 0, 1, 2, 3, ... (monotonic)
+  slot_idx = slot_id % 8;                  // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, ...
+
+  // What NVDEC provides:
+  CurrPicIdx = 0~7                         // already a ring buffer index!
+
+  The key realization:
+
+  NVDEC's CurrPicIdx (0-7) already serves as the ring buffer index!
+
+  Why did we invent slot_id?
+  - To guarantee FIFO order (m_returnIdCounter)
+  - To track submission order
+
+  However:
+  - NVDEC already manages ordering with its own ring buffer
+  - CurrPicIdx reflects NVDEC's internal allocation order
+
+  Proposed simplification:
+
+  // Option: use CurrPicIdx directly
+  DecodeSlot m_ringBuffer[8];  // same size as NVDEC's ring buffer
+
+  // DecodeToSurface:
+  // 1. Find a free slot (scan all slots)
+  for (int i = 0; i < 8; i++) {
+      if (!m_ringBuffer[i].in_use.load()) {
+          // Reserve the slot
+          m_ringBuffer[i].in_use = true;
+          m_ringBuffer[i].target_surface = target_surface;
+          break;
+      }
+  }
+
+  // 2. Call cuvidParseVideoData
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  // NVDEC tells us the CurrPicIdx
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot]./* set metadata */;
+
+  // HandlePictureDisplay:
+  int nvdec_slot = disp_info->picture_index;
+  m_ringBuffer[nvdec_slot].is_ready = true;
+
+  Problem:
+  DecodeToSurface cannot know in advance which slot will be used. NVDEC assigns CurrPicIdx only inside the HandlePictureDecode callback.
+
+  So a mapping is still needed:
+
+  // Ways to pass information from DecodeToSurface → HandlePictureDecode:
+
+  // Method 1: m_currentSubmittingSlotId (existing)
+  m_currentSubmittingSlotId.store(my_id);
+
+  // Method 2: stash target_surface temporarily
+  m_pendingTargetSurface = target_surface;
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot].target_surface = decoder->m_pendingTargetSurface;
+
+  Conclusion:
+
+  CurrPicIdx can be used directly as the ring buffer index, but the problem remains that DecodeToSurface cannot pre-allocate a slot: which
+  CurrPicIdx NVDEC will use is known only inside HandlePictureDecode.
+
+  The best simplification:
+
+  // Simplify to m_currentSubmittingContext (the full context instead of slot_id)
+  struct SubmittingContext {
+      void* target_surface;
+      VavCoreSurfaceType surface_type;
+  };
+
+  std::atomic<SubmittingContext*> m_currentSubmittingContext;
+
+  // DecodeToSurface:
+  SubmittingContext ctx{target_surface, target_type};
+  m_currentSubmittingContext.store(&ctx);
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  auto* ctx = decoder->m_currentSubmittingContext.load();
+  decoder->m_ringBuffer[nvdec_slot].target_surface = ctx->target_surface;
+  decoder->m_ringBuffer[nvdec_slot].surface_type = ctx->surface_type;
+  decoder->m_ringBuffer[nvdec_slot].in_use = true;
+
+  This removes slot_id, m_slotIdCounter, and the m_pictureIndexToSlotId mapping entirely, and uses NVDEC's CurrPicIdx directly!
+
+  Shall I refactor in this direction?
+
+> Good. So instead of us assigning the ring buffer slot number first, we use the CurrPicIdx provided by the NVDEC API as the ring buffer index. Agreed.
+  ⎿  Interrupted · What should Claude do instead?
+
+> Before starting the work, please reflect the improved design above in docs/working/NVDEC_RingBuffer_Decode_Design.md, so the design document
+won't be confusing when we read it again later.
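A minimal consolidated sketch of the flow the two turns above converge on — CurrPicIdx indexing the decode ring directly, with a small pending-submission ring carrying per-call context into the callback. The names here (`g_pending`, `PrepareSubmission`, `BindSlot`) are illustrative only, not the actual VavCore code:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

constexpr int RING_BUFFER_SIZE = 8;   // must match ulNumOutputSurfaces

struct PendingSubmission {
    void*    target_surface = nullptr;
    uint64_t submission_id  = 0;
    std::atomic<bool> in_use{false};
};

struct DecodeSlot {
    void*    target_surface = nullptr;
    uint64_t submission_id  = 0;
    int      picture_index  = -1;     // CurrPicIdx; identical to the slot index
    std::atomic<bool> in_use{false};
    std::atomic<bool> is_ready{false};
};

PendingSubmission g_pending[RING_BUFFER_SIZE];
DecodeSlot        g_slots[RING_BUFFER_SIZE];
std::atomic<uint64_t> g_submissionCounter{0};

// Producer side (DecodeToSurface), before cuvidParseVideoData():
uint64_t PrepareSubmission(void* target_surface) {
    uint64_t id = g_submissionCounter.fetch_add(1);
    auto& p = g_pending[id % RING_BUFFER_SIZE];
    while (p.in_use.load()) {             // overflow: wait until the pending slot frees up
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    p.target_surface = target_surface;
    p.submission_id  = id;
    p.in_use.store(true);
    return id;
}

// Callback side (HandlePictureDecode), where NVDEC reveals the slot it chose:
void BindSlot(int curr_pic_idx, uint64_t submission_id) {
    auto& p = g_pending[submission_id % RING_BUFFER_SIZE];
    auto& s = g_slots[curr_pic_idx];      // CurrPicIdx indexes the ring directly
    s.target_surface = p.target_surface;
    s.submission_id  = p.submission_id;
    s.picture_index  = curr_pic_idx;
    s.is_ready.store(false);
    s.in_use.store(true);
    p.in_use.store(false);                // pending slot is free for reuse
}
```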
+
+● I'll update the NVDEC RingBuffer design document with the improved design:
+
diff --git a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
index 90d0015..ae836c7 100644
--- a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
+++ b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
@@ -1,8 +1,8 @@
 # NVDEC RingBuffer-based Asynchronous Decoding Design
 
-**Date**: 2025-10-05 (Updated)
-**Status**: ✅ **Design Finalized - Ready for Implementation**
-**Target**: NVDECAV1Decoder DecodeToSurface() optimization with multi-frame packet support
+**Date**: 2025-10-05 (Updated - CurrPicIdx Simplification)
+**Status**: ✅ **Design Finalized - Ready for Implementation (Improved Architecture)**
+**Target**: NVDECAV1Decoder DecodeToSurface() optimization using NVDEC's native CurrPicIdx
 
 ---
 
@@ -10,12 +10,13 @@
 1. [Background & Problem Statement](#background--problem-statement)
 2. [Design Goals](#design-goals)
-3. [Architecture Overview](#architecture-overview)
-4. [Detailed Design](#detailed-design)
-5. [Implementation Plan](#implementation-plan)
-6. [Performance Analysis](#performance-analysis)
-7. [Risk Assessment](#risk-assessment)
-8. [References](#references)
+3. [NVDEC CurrPicIdx Design Philosophy](#nvdec-currpicidx-design-philosophy) ⭐ **NEW**
+4. [Architecture Overview](#architecture-overview)
+5. [Detailed Design](#detailed-design)
+6. [Implementation Plan](#implementation-plan)
+7. [Performance Analysis](#performance-analysis)
+8. [Risk Assessment](#risk-assessment)
+9. [References](#references)
 
 ---
 
@@ -149,6 +150,102 @@
 m_createInfo.ulNumOutputSurfaces = 8;  // 8 output buffers allocated
 
 ---
 
+## 🎯 NVDEC CurrPicIdx Design Philosophy
+
+### **Discovery: Why ParseContext Cannot Work**
+
+During implementation, we attempted to pass per-picture user data through NVDEC callbacks using a `ParseContext` approach:
+
+```cpp
+// ❌ Failed Approach: Trying to pass per-picture context
+struct ParseContext {
+    NVDECAV1Decoder* decoder;
+    uint64_t slot_id;
+};
+
+// In DecodeToSurface():
+ParseContext context{this, my_id};
+m_parserParams.pUserData = &context;  // Try to change user_data
+cuvidParseVideoData(m_parser, &packet);
+```
+
+**Why It Failed**:
+1. `cuvidCreateVideoParser()` makes an **internal copy** of `CUVIDPARSERPARAMS` at parser creation time
+2. Changing `m_parserParams.pUserData` after parser creation **does not affect** NVDEC's internal copy
+3. The NVDEC API provides **parser-level user_data only**, not picture-level user_data
+4. The `pUserData` field is set once at parser creation and used for all callbacks
+
+**Evidence from Testing**:
+```
+[DecodeToSurface] ParseContext created: magic=0xDEADBEEFCAFEBABE, decoder=000002E42CD9E4A0, slot_id=0
+[HandlePictureDecode] ParseContext: magic=0x00007FFF71168468, decoder=0000000000000000, slot_id=0
+[HandlePictureDecode] ERROR: Invalid ParseContext magic!
+```
+
+### **NVDEC's Native Ring Buffer Design**
+
+NVDEC is designed around a **fixed 8-slot ring buffer** architecture:
+
+```cpp
+m_createInfo.ulNumOutputSurfaces = 8;  // NVDEC allocates 8 internal surfaces
+```
+
+**NVDEC's Philosophy**:
+- Applications should use **`CurrPicIdx` (0-7)** as the direct ring buffer index
+- NVDEC manages surface allocation and provides the index via `pic_params->CurrPicIdx`
+- No custom slot_id allocation is needed - **NVDEC already provides the slot index**
+- Applications maintain metadata arrays indexed by `CurrPicIdx`
+
+**Data Flow**:
+```
+cuvidParseVideoData(packet)
+    ↓
+HandlePictureDecode(pic_params)
+    ↓
+CurrPicIdx = pic_params->CurrPicIdx  // 0-7 (NVDEC's ring buffer index)
+    ↓
+Use CurrPicIdx directly:
+    m_ringBuffer[CurrPicIdx].target_surface = ...
+    m_ringBuffer[CurrPicIdx].metadata = ...
+```
+
+### **Eliminated Complexity**
+
+**Previous Approach (Unnecessary)**:
+```cpp
+// ❌ Custom slot_id allocation
+std::atomic<uint64_t> m_slotIdCounter{0};
+uint64_t my_id = m_slotIdCounter.fetch_add(1);
+size_t slot_idx = my_id % 8;
+
+// ❌ Mapping CurrPicIdx → slot_id
+std::unordered_map<int, uint64_t> m_pictureIndexToSlotId;
+m_pictureIndexToSlotId[CurrPicIdx] = my_id;
+```
+
+**New Approach (Direct CurrPicIdx)**:
+```cpp
+// ✅ Use NVDEC's CurrPicIdx directly
+int nvdec_slot = pic_params->CurrPicIdx;  // 0-7
+m_ringBuffer[nvdec_slot].target_surface = ...;
+m_ringBuffer[nvdec_slot].is_ready = true;
+```
+
+### **Key Insight**
+
+**NVDEC provides CurrPicIdx for a reason**:
+- It's the canonical ring buffer index that NVDEC uses internally
+- Applications are expected to maintain side-channel metadata indexed by CurrPicIdx
+- No need for custom ID allocation or mapping tables
+- Simpler, more efficient, and aligned with NVDEC's design
+
+**What We Need Instead**:
+- Store submission context (target_surface, surface_type) in a **temporary variable**
+- Pass context to `HandlePictureDecode` via stack/member variable (not user_data)
+- Use `CurrPicIdx` directly as ring buffer index in callback
+
+---
+
 ## 🏗️ Architecture Overview
 
 ### High-Level Design
@@ -187,29 +284,34 @@
 ### Data Structures
 
-#### DecodeSlot Structure
+#### DecodeSlot Structure (Simplified)
 
 ```cpp
 struct DecodeSlot {
     // Slot state
-    bool in_use;                       // Is this slot allocated?
+    std::atomic<bool> in_use{false};   // Is this NVDEC slot currently decoding?
-    // Submitted information (from DecodeToSurface args)
+    // Submitted information (set by DecodeToSurface)
     void* target_surface;              // Destination D3D12 resource
     VavCoreSurfaceType surface_type;   // Surface type
+    uint64_t submission_id;            // For FIFO ordering (monotonic counter)
 
-    // NVDEC information (from HandlePictureDecode callback)
-    // ⚠️ Multi-frame support: One packet can decode to multiple frames
-    std::vector<int> picture_indices;  // All NVDEC frame indices from this packet
+    // NVDEC information (set by HandlePictureDecode callback)
+    int picture_index;                 // CurrPicIdx from NVDEC (same as slot index)
 
     // Synchronization primitives
-    std::condition_variable frame_ready;  // Signaled when ALL frames are decoded
+    std::condition_variable frame_ready;  // Signaled when decode complete
     std::mutex slot_mutex;             // Protects this slot's state
-    bool is_ready;                     // All frames decoded flag
+    std::atomic<bool> is_ready{false}; // Decode completed flag
 };
 ```
 
-#### RingBuffer Members
+**Key Simplifications**:
+- ✅ No `std::vector<int> picture_indices` - single picture per slot
+- ✅ `picture_index == slot_index` (CurrPicIdx is the slot index)
+- ✅ No mapping tables needed
+
+#### RingBuffer Members (Simplified)
 
 ```cpp
 class NVDECAV1Decoder {
@@ -218,9 +320,19 @@ private:
     DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
 
-    // 🎯 Option C: Unified slot allocation counter (no mapping needed!)
-    std::atomic<uint64_t> m_slotIdCounter{0};   // Monotonically increasing slot ID
-    std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
+    // 🎯 Simplified: Only submission ID for FIFO ordering
+    std::atomic<uint64_t> m_submissionCounter{0};  // Monotonic submission ID
+    std::atomic<uint64_t> m_returnCounter{0};      // Return order enforcement (FIFO)
+
+    // ⚠️ CRITICAL FIX: Ring buffer for pending submissions (multi-thread safe)
+    struct PendingSubmission {
+        void* target_surface;
+        VavCoreSurfaceType surface_type;
+        uint64_t submission_id;
+        std::atomic<bool> in_use{false};
+    };
+    PendingSubmission m_pendingSubmissions[RING_BUFFER_SIZE];  // Ring buffer for contexts
+    std::mutex m_submissionMutex;  // Protects pending submission allocation
 
     // Polling thread
     std::thread m_pollingThread;
@@ -228,466 +340,569 @@
 };
 ```
 
+**Key Fix**:
+- ✅ `m_pendingSubmissions[8]` ring buffer (not single struct!)
+- ✅ Each slot has its own pending context
+- ✅ `submission_id % 8` determines which pending slot to use
+- ✅ Thread-safe: Different threads use different pending slots
+
+**Eliminated Variables**:
+- ❌ `m_slotIdCounter` (replaced with `m_submissionCounter`)
+- ❌ `m_pictureIndexToSlotId` mapping (not needed!)
+- ❌ `m_currentSubmittingSlotId` atomic (use pending submission ring buffer)
+
 ---
 
 ## 🔧 Detailed Design
 
-### Component 1: Slot Allocation (Producer)
+### Component 1: Submission Preparation (Producer)
 
-**Purpose**: Assign RingBuffer slot to each DecodeToSurface call using Option C design
+**Purpose**: Prepare submission context before calling cuvidParseVideoData
 
 ```cpp
 // In DecodeToSurface()
 
-// 🎯 Option C: Allocate unique ID (serves as both slot ID and submission order)
-uint64_t my_id = m_slotIdCounter.fetch_add(1);
-size_t slot_idx = my_id % RING_BUFFER_SIZE;
-DecodeSlot& my_slot = m_ringBuffer[slot_idx];
+// 1. Allocate submission ID for FIFO ordering
+uint64_t my_submission_id = m_submissionCounter.fetch_add(1);
+size_t pending_idx = my_submission_id % RING_BUFFER_SIZE;
 
-// 2. Check for overflow
-{
-    std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-
-    if (my_slot.in_use) {
-        // RingBuffer full! Too many concurrent decodes
-        LogError("RingBuffer overflow - max 8 concurrent decodes");
-        return false;
-    }
-
-    // 3. Initialize slot
-    my_slot.in_use = true;
-    my_slot.target_surface = target_surface;
-    my_slot.surface_type = target_type;
-    my_slot.picture_indices.clear();  // Multi-frame support: clear previous frames
-    my_slot.is_ready = false;
 }
+
+// 2. Wait if pending slot is still in use (overflow protection)
+while (m_pendingSubmissions[pending_idx].in_use.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+}
+
+// 3. Store submission context in ring buffer slot
+{
+    std::lock_guard<std::mutex> lock(m_submissionMutex);
+    auto& pending = m_pendingSubmissions[pending_idx];
+
+    pending.target_surface = target_surface;
+    pending.surface_type = target_type;
+    pending.submission_id = my_submission_id;
+    pending.in_use.store(true);
+}
+
+LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
+           my_submission_id, pending_idx);
 ```
 
-**🎯 Option C: No Mapping Needed - Direct Calculation**:
-```
-Thread 1: m_slotIdCounter.fetch_add(1) → ID=0, slot_idx = 0 % 8 = slot[0]
-Thread 2: m_slotIdCounter.fetch_add(1) → ID=1, slot_idx = 1 % 8 = slot[1]
-Thread 3: m_slotIdCounter.fetch_add(1) → ID=2, slot_idx = 2 % 8 = slot[2]
-...
-Thread 9: m_slotIdCounter.fetch_add(1) → ID=8, slot_idx = 8 % 8 = slot[0] (wrap around)
-```
-
-**Key Advantage**: ID → slot_idx calculation is **deterministic**, no map storage needed!
-
-**Overflow Protection**: If `slot[0].in_use == true` when Thread 9 arrives → error
+**Key Design Points**:
+- ✅ **Ring buffer for pending contexts** - Multi-thread safe, no overwrite
+- ✅ **pending_idx = submission_id % 8** - Deterministic slot allocation
+- ✅ **Overflow protection** - Wait if pending slot still in use
+- ✅ **No slot allocation yet** - NVDEC will provide CurrPicIdx in callback
+- ✅ **Thread-safe** - Each thread uses different pending slot
 
 ---
 
 ### Component 2: Packet Submission
 
-**Purpose**: Submit packet to NVDEC with slot ID tracking via timestamp
+**Purpose**: Submit packet to NVDEC parser (cuvidParseVideoData is synchronous)
 
 ```cpp
-// 4. Submit packet to NVDEC parser
+// 3. Submit packet to NVDEC parser
 CUVIDSOURCEDATAPACKET packet = {};
 packet.payload = packet_data;
 packet.payload_size = packet_size;
 packet.flags = CUVID_PKT_ENDOFPICTURE;
-packet.timestamp = static_cast<long long>(my_id);  // 🎯 Pass full slot_id (NOT modulo!)
+packet.timestamp = 0;  // Not used - we use m_pendingSubmissions instead
 
 CUresult result = cuvidParseVideoData(m_parser, &packet);
+// cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return
+
 if (result != CUDA_SUCCESS) {
-    my_slot.in_use = false;  // Release slot on error
+    LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed");
     return false;
 }
+
+LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
 ```
 
-**🎯 Option C Timestamp Flow** (multi-frame packet support):
+**Simplified Flow**:
 ```
-cuvidParseVideoData(packet, timestamp=my_id=17)
+cuvidParseVideoData(packet)
+    ↓ (synchronous callback)
+HandlePictureDecode(pic_params)
     ↓
-HandleVideoSequence() (first time only)
+CurrPicIdx = pic_params->CurrPicIdx  // NVDEC provides slot index (0-7)
     ↓
-HandlePictureDecode(timestamp=17, CurrPicIdx=0) → slot_idx = 17 % 8 = 1
-  → m_ringBuffer[1].picture_indices.push_back(0)
+pending_idx = submission_id % 8
     ↓
-HandlePictureDecode(timestamp=17, CurrPicIdx=1) → slot_idx = 17 % 8 = 1
-  → m_ringBuffer[1].picture_indices.push_back(1)  // Same packet, multiple frames!
+Copy m_pendingSubmissions[pending_idx] → m_ringBuffer[CurrPicIdx] ↓ -HandlePictureDecode(timestamp=17, CurrPicIdx=2) → slot_idx = 17 % 8 = 1 - → m_ringBuffer[1].picture_indices.push_back(2) +Release m_pendingSubmissions[pending_idx].in_use = false ↓ -PollingThread checks ALL picture_indices for slot[1] +Return from HandlePictureDecode ↓ -When all complete: slot[1].is_ready = true, notify thread +Return from cuvidParseVideoData ``` -**Key Point**: Timestamp carries **full slot_id**, HandlePictureDecode calculates slot_idx directly +**Key Points**: +- ✅ **cuvidParseVideoData is synchronous** - callbacks complete before return +- ✅ **CurrPicIdx is the slot index** - no calculation needed +- ✅ **pending_idx = submission_id % 8** - find correct pending context +- ✅ **Ring buffer prevents overwrites** - multi-thread safe +- ✅ **Release pending slot after copy** - allow reuse for next submission --- ### Component 3: Polling Thread (Background Status Checker) -**Purpose**: Continuously poll `m_returnIdCounter` slot for decode completion (multi-frame support) +**Purpose**: Continuously poll pending slots for decode completion ```cpp void NVDECAV1Decoder::PollingThreadFunc() { while (m_pollingRunning) { - // 1. Get current return ID and calculate slot index - uint64_t current_return_id = m_returnIdCounter.load(); - size_t slot_idx = current_return_id % RING_BUFFER_SIZE; - DecodeSlot& slot = m_ringBuffer[slot_idx]; + // 1. Check all slots for pending decodes + for (int slot_idx = 0; slot_idx < RING_BUFFER_SIZE; ++slot_idx) { + DecodeSlot& slot = m_ringBuffer[slot_idx]; - // 2. Check if slot is in use and not yet ready - if (slot.in_use && !slot.is_ready) { - - // 3. Get copy of picture indices (multi-frame support) - std::vector picture_indices_copy; - { - std::lock_guard lock(slot.slot_mutex); - picture_indices_copy = slot.picture_indices; + // 2. Skip slots not in use or already ready + if (!slot.in_use.load() || slot.is_ready.load()) { + continue; } - // 4. Check if ALL frames are decoded - bool all_complete = true; - for (int pic_idx : picture_indices_copy) { - CUVIDGETDECODESTATUS decodeStatus = {}; - CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus); + // 3. Query NVDEC decode status for this slot's picture + CUVIDGETDECODESTATUS decodeStatus = {}; + CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus); - if (result != CUDA_SUCCESS || - decodeStatus.decodeStatus != cuvidDecodeStatus_Success) { - all_complete = false; - break; - } - } + // 4. If decode complete, mark slot ready + if (result == CUDA_SUCCESS && + decodeStatus.decodeStatus == cuvidDecodeStatus_Success) { - // 5. If all frames complete, signal ready - if (all_complete && !picture_indices_copy.empty()) { { std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; + slot.is_ready.store(true); } slot.frame_ready.notify_one(); + + LOGF_DEBUG("[PollingThread] Slot %d ready (submission_id=%llu)", + slot_idx, slot.submission_id); } } - // 6. Sleep to avoid busy-wait + // 5. 
Sleep to avoid busy-wait std::this_thread::sleep_for(std::chrono::microseconds(100)); } } ``` **Key Points**: -- ✅ Only polls `m_returnIdCounter` slot (not all 8 slots) → efficient -- ✅ **Multi-frame support**: Checks ALL picture_indices for completion +- ✅ **Simple slot iteration** - check all 8 slots (minimal overhead) +- ✅ **Single picture per slot** - no vector iteration needed +- ✅ **CurrPicIdx == slot_idx** - direct correspondence - ✅ Uses `cuvidGetDecodeStatus()` non-blocking query - ✅ 100us sleep → ~10,000 checks/second (low CPU usage) -- ✅ Thread-safe picture_indices copy to avoid lock contention --- -### Component 4: Sequential Return Wait (FIFO Guarantee) +### Component 4: Wait for Decode and Retrieve Slot -**Purpose**: Enforce FIFO order even when decodes complete out-of-order using Option C +**Purpose**: Wait for NVDEC to complete decoding and find which slot was used ```cpp -// In DecodeToSurface() - PHASE 2 +// In DecodeToSurface() - After cuvidParseVideoData returns -// 5. Wait for my turn (FIFO order enforcement) -while (m_returnIdCounter.load() != my_id) { +// 4. Find which slot NVDEC used (check all slots for our submission_id) +int my_slot_idx = -1; +for (int i = 0; i < RING_BUFFER_SIZE; ++i) { + if (m_ringBuffer[i].submission_id == my_submission_id) { + my_slot_idx = i; + break; + } +} + +if (my_slot_idx == -1) { + LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id); + return false; +} + +DecodeSlot& my_slot = m_ringBuffer[my_slot_idx]; +LOGF_DEBUG("[DecodeToSurface] Found slot_idx=%d for submission_id=%llu", my_slot_idx, my_submission_id); + +// 5. Wait for my turn in FIFO order +while (m_returnCounter.load() != my_submission_id) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } -// Now I'm first in line! +LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id); // 6. Wait for decode to complete { std::unique_lock lock(my_slot.slot_mutex); if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500), - [&my_slot]() { return my_slot.is_ready; })) { + [&my_slot]() { return my_slot.is_ready.load(); })) { // Timeout - decode took too long - LogError("Decode timeout"); - my_slot.in_use = false; - m_returnIdCounter.fetch_add(1); // Skip this slot to avoid deadlock + LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx); + my_slot.in_use.store(false); + m_returnCounter.fetch_add(1); // Skip to avoid deadlock return false; } } + +LOGF_DEBUG("[DecodeToSurface] Decode complete for slot %d", my_slot_idx); ``` -**🎯 Option C Timeline Example** (using slot ID, not slot index): +**Simplified Timeline Example**: ``` -Thread 1 (ID=17, slot 1): Wait for returnIdCounter==17 ✅ (immediate) - Wait for is_ready... +Thread 1 (submission_id=0, CurrPicIdx=3): Wait for returnCounter==0 ✅ + Wait for slot[3].is_ready... 
-Thread 2 (ID=18, slot 2): Wait for returnIdCounter==18 ⏸️ (blocked) +Thread 2 (submission_id=1, CurrPicIdx=5): Wait for returnCounter==1 ⏸️ (blocked) -Thread 3 (ID=19, slot 3): Wait for returnIdCounter==19 ⏸️ (blocked) +GPU: slot[5] completes first @ t=3ms + → slot[5].is_ready = true + → Thread 2 still blocked (returnCounter=0) -GPU: ID=18 completes first @ t=3ms - → slot[2].is_ready = true - → Thread 2 still blocked (returnIdCounter=17) - -GPU: ID=17 completes @ t=15ms - → slot[1].is_ready = true +GPU: slot[3] completes @ t=15ms + → slot[3].is_ready = true → Thread 1 wakes up ✅ - → Thread 1 processes → returnIdCounter = 18 + → Thread 1 processes → returnCounter = 1 → Thread 2 now unblocked ✅ ``` -**Key Point**: Wait on **slot_id** (my_id), not slot_idx, for correct FIFO ordering +**Key Points**: +- ✅ **Find slot by submission_id** - simple linear search +- ✅ **FIFO using submission_id** - not slot index +- ✅ **CurrPicIdx can be any value** - NVDEC chooses which slot to use --- -### Component 5: Frame Retrieval & Cleanup (Multi-Frame Support) +### Component 5: Frame Retrieval & Cleanup **Purpose**: Map decoded frame, copy to surface, release slot ```cpp -// In DecodeToSurface() - PHASE 3 +// In DecodeToSurface() - After decode complete -// 7. Get first frame from multi-frame packet -int frameIdx = my_slot.picture_indices[0]; // Return first frame only - -// 8. Map decoded frame from NVDEC +// 7. Map decoded frame from NVDEC using the slot's picture_index CUVIDPROCPARAMS procParams = {}; procParams.progressive_frame = 1; CUdeviceptr srcDevicePtr = 0; unsigned int srcPitch = 0; -CUresult result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams); +int pic_idx = my_slot.picture_index; // CurrPicIdx from NVDEC +CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &srcDevicePtr, &srcPitch, &procParams); if (result != CUDA_SUCCESS) { - my_slot.in_use = false; - m_returnIdCounter.fetch_add(1); + LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx); + my_slot.in_use.store(false); + m_returnCounter.fetch_add(1); return false; } -// 9. Copy to D3D12 surface -ID3D12Resource* d3d12Resource = static_cast(target_surface); +// 8. Copy to D3D12 surface (use target_surface from slot) +ID3D12Resource* d3d12Resource = static_cast(my_slot.target_surface); bool copySuccess = m_d3d12Handler->CopyNV12Frame( srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height ); -// 10. Unmap frame +// 9. Unmap frame cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); -// 11. Release slot +// 10. Release slot { std::lock_guard lock(my_slot.slot_mutex); - my_slot.in_use = false; + my_slot.in_use.store(false); } -// 12. Advance return ID counter (FIFO order) -m_returnIdCounter.fetch_add(1); +LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx); + +// 11. 
Advance return counter (FIFO order) +m_returnCounter.fetch_add(1); return copySuccess; ``` -**Multi-Frame Decision**: Return **first frame only** from multi-frame packet (indices[0]) +**Key Points**: +- ✅ **Single picture per slot** - no multi-frame complexity +- ✅ **picture_index == CurrPicIdx** - direct NVDEC slot index +- ✅ **target_surface from slot** - stored during HandlePictureDecode --- -### Component 6: NVDEC Callback Integration (Option C) +### Component 6: NVDEC Callback Integration (Simplified) -**Purpose**: Link NVDEC picture_index to RingBuffer slot using direct slot_id calculation +**Purpose**: Use NVDEC's CurrPicIdx directly as ring buffer slot index ```cpp int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) { auto* decoder = static_cast(user_data); - // 🎯 Option C: Direct slot_id → slot_idx calculation (no map lookup!) - uint64_t slot_id = static_cast(pic_params->nTimeStamp); - size_t slot_idx = slot_id % RING_BUFFER_SIZE; + // 🎯 Use NVDEC's CurrPicIdx directly as slot index + int slot_idx = pic_params->CurrPicIdx; // 0-7 (NVDEC's ring buffer index) + + LOGF_DEBUG("[HandlePictureDecode] CurrPicIdx=%d, IntraPicFlag=%d", + slot_idx, pic_params->intra_pic_flag); + + // Validate slot index (should always be 0-7) + if (slot_idx < 0 || slot_idx >= RING_BUFFER_SIZE) { + LOGF_ERROR("[HandlePictureDecode] Invalid CurrPicIdx=%d", slot_idx); + return 0; + } DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; + // Find pending submission context using submission_id + uint64_t submission_id; + size_t pending_idx; + { + std::lock_guard lock(decoder->m_submissionMutex); + + // Find the most recent pending submission + // (In practice, we use the last allocated submission_id) + submission_id = decoder->m_submissionCounter.load() - 1; + pending_idx = submission_id % RING_BUFFER_SIZE; + + auto& pending = decoder->m_pendingSubmissions[pending_idx]; + + // Copy pending submission context to decode slot + slot.target_surface = pending.target_surface; + slot.surface_type = pending.surface_type; + slot.submission_id = pending.submission_id; + + // Release pending slot for reuse + pending.in_use.store(false); + } + + slot.picture_index = slot_idx; // Store CurrPicIdx (same as slot_idx) + slot.in_use.store(true); + slot.is_ready.store(false); + + LOGF_INFO("[HandlePictureDecode] Calling cuvidDecodePicture (slot=%d, submission_id=%llu, pending_idx=%zu)", + slot_idx, slot.submission_id, pending_idx); + // Submit frame to NVDEC decoder CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params); if (result != CUDA_SUCCESS) { - decoder->LogCUDAError(result, "cuvidDecodePicture failed"); + LOGF_ERROR("[HandlePictureDecode] cuvidDecodePicture failed: %d", result); + slot.in_use.store(false); return 0; } - // Store picture_index for polling (multi-frame support) - { - std::lock_guard lock(slot.slot_mutex); - slot.picture_indices.push_back(pic_params->CurrPicIdx); - } - - // Polling thread will check cuvidGetDecodeStatus() for ALL picture_indices + // Polling thread will check cuvidGetDecodeStatus() for this picture_index return 1; } ``` -**🎯 Option C Key Advantages**: -- ✅ **No mapping overhead**: Direct modulo calculation, no unordered_map lookup -- ✅ **No mutex contention**: No global map mutex needed -- ✅ **Multi-frame support**: Automatically handles multiple frames per packet -- ✅ **Deterministic**: Same slot_id always maps to same slot_idx +**Key Advantages of Simplified Design**: +- ✅ **Direct CurrPicIdx usage** - no calculation, no mapping +- ✅ 
**Ring buffer for pending contexts** - multi-thread safe, no overwrites +- ✅ **pending_idx = submission_id % 8** - deterministic context lookup +- ✅ **Release after copy** - pending slots can be reused immediately +- ✅ **Single picture per slot** - no multi-frame vector complexity +- ✅ **Aligned with NVDEC design** - use the API as intended --- -## 📐 Implementation Plan (Updated for Option C) +## 🎯 Simplified Design Summary -### Phase 1: Data Structure Setup ✅ +### **What Changed from Original Design** + +**Eliminated Complexity**: +1. ❌ Custom `slot_id` allocation system (`m_slotIdCounter`) +2. ❌ `m_pictureIndexToSlotId` mapping table +3. ❌ `std::vector picture_indices` (multi-frame support) +4. ❌ ParseContext struct and user_data manipulation +5. ❌ Timestamp tricks to pass slot_id through callbacks + +**New Simplified Approach**: +1. ✅ Use NVDEC's `CurrPicIdx` (0-7) directly as ring buffer index +2. ✅ Pass submission context via `m_pendingSubmission` member variable +3. ✅ Single picture per slot (one packet → one frame) +4. ✅ `submission_id` only for FIFO ordering, not slot allocation + +### **Data Flow Comparison** + +**Old (Complex)**: +``` +DecodeToSurface: + my_id = m_slotIdCounter++ + slot_idx = my_id % 8 + packet.timestamp = my_id + +HandlePictureDecode: + slot_id = pic_params->nTimeStamp + slot_idx = slot_id % 8 + m_pictureIndexToSlotId[CurrPicIdx] = slot_id + slot.picture_indices.push_back(CurrPicIdx) + +PollingThread: + Check ALL picture_indices in slot +``` + +**New (Simplified)**: +``` +DecodeToSurface: + submission_id = m_submissionCounter++ + pending_idx = submission_id % 8 + m_pendingSubmissions[pending_idx] = {target_surface, surface_type, submission_id} + +HandlePictureDecode: + slot_idx = pic_params->CurrPicIdx // 0-7 directly! + pending_idx = submission_id % 8 + slot = m_ringBuffer[slot_idx] + slot.copy(m_pendingSubmissions[pending_idx]) + m_pendingSubmissions[pending_idx].in_use = false // Release for reuse + slot.picture_index = slot_idx + +PollingThread: + Check slot.picture_index (single value) +``` + +### **Why This is Better** + +1. **Aligned with NVDEC's Design**: NVDEC provides CurrPicIdx as the canonical slot index +2. **Multi-thread Safe**: Ring buffer for pending contexts prevents overwrites +3. **Less Code**: Fewer data structures, less synchronization overhead +4. **No Mapping Tables**: Direct indexing, no hash map lookups +5. **Simpler Debugging**: CurrPicIdx == slot_idx == picture_index +6. **Easier to Understand**: Follow NVDEC's intended usage pattern +7. **Deterministic Context Lookup**: pending_idx = submission_id % 8 + +--- + +## 📐 Implementation Plan (Updated for Simplified Design) + +### Phase 1: Data Structure Setup **Files to Modify**: -- `NVDECAV1Decoder.h` - Add RingBuffer members with Option C design +- `NVDECAV1Decoder.h` - Add simplified RingBuffer members - `NVDECAV1Decoder.cpp` - Initialize RingBuffer in constructor **Tasks**: -- [x] Define `DecodeSlot` structure with `std::vector picture_indices` -- [x] Add `m_ringBuffer[8]` array -- [x] **🎯 Option C**: Add `m_slotIdCounter`, `m_returnIdCounter` atomic counters (NOT submitIndex/returnIndex) -- [x] Add `m_pollingThread`, `m_pollingRunning` members +- [ ] Define simplified `DecodeSlot` structure (no picture_indices vector) +- [ ] Add `m_ringBuffer[8]` array +- [ ] Add `m_submissionCounter`, `m_returnCounter` atomic counters +- [ ] **⚠️ CRITICAL**: Add `PendingSubmission` struct with `in_use` flag +- [ ] **⚠️ CRITICAL**: Add `m_pendingSubmissions[8]` ring buffer array (NOT single struct!) 
+- [ ] Add `m_submissionMutex` for pending submission allocation +- [ ] Add `m_pollingThread`, `m_pollingRunning` members +- [ ] Remove old variables: `m_currentSubmittingSlotId`, any mapping tables **Estimated Time**: 30 minutes --- -### Phase 2: Polling Thread Implementation (Multi-Frame Support) +### Phase 2: Polling Thread Implementation (Simplified) **Files to Modify**: - `NVDECAV1Decoder.cpp` - Implement `PollingThreadFunc()` **Tasks**: -- [ ] Implement polling loop with `cuvidGetDecodeStatus()` checking ALL picture_indices -- [ ] Poll slot at `m_returnIdCounter % RING_BUFFER_SIZE` (NOT all slots) +- [ ] Implement simple polling loop iterating all 8 slots +- [ ] Check `slot.in_use` and `!slot.is_ready` to skip idle slots +- [ ] Use `cuvidGetDecodeStatus()` for single `slot.picture_index` - [ ] Add thread start in `Initialize()` - [ ] Add thread stop in `Cleanup()` -- [ ] Add debug logging for multi-frame packet events +- [ ] Add debug logging for slot readiness **Testing**: - Verify thread starts/stops correctly -- Verify multi-frame packet handling (all frames checked) +- Verify single picture per slot handling - Verify `cuvidGetDecodeStatus()` calls work -**Estimated Time**: 1 hour +**Estimated Time**: 45 minutes --- -### Phase 3: DecodeToSurface Refactoring (Option C Implementation) +### Phase 3: DecodeToSurface Refactoring (Simplified Implementation) **Files to Modify**: -- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with Option C design +- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with simplified design **Tasks**: -- [ ] **🎯 Phase 1**: Slot allocation using `my_id = m_slotIdCounter.fetch_add(1)` -- [ ] **🎯 Phase 2**: Packet submission with `packet.timestamp = my_id` (full slot_id, NOT modulo!) -- [ ] **🎯 Phase 3**: FIFO wait using `while(m_returnIdCounter != my_id)` (NOT slot_idx!) -- [ ] **🎯 Phase 4**: Frame retrieval from `picture_indices[0]` (first frame only) -- [ ] **🎯 Phase 5**: Cleanup and `m_returnIdCounter.fetch_add(1)` +- [ ] **Phase 1**: Allocate `submission_id` from `m_submissionCounter` +- [ ] **Phase 2**: Calculate `pending_idx = submission_id % 8` +- [ ] **Phase 3**: Wait if `m_pendingSubmissions[pending_idx].in_use` (overflow protection) +- [ ] **Phase 4**: Store submission context in `m_pendingSubmissions[pending_idx]` +- [ ] **Phase 5**: Call `cuvidParseVideoData()` (synchronous, callback completes) +- [ ] **Phase 6**: Find slot by searching for `submission_id` in ring buffer +- [ ] **Phase 7**: Wait for FIFO turn using `m_returnCounter` +- [ ] **Phase 8**: Wait for `slot.is_ready` condition variable +- [ ] **Phase 9**: Map/copy frame using `slot.picture_index` +- [ ] **Phase 10**: Release slot and increment `m_returnCounter` - [ ] Error handling for all failure paths **Testing**: - Single-threaded decode test - Multi-threaded decode test (2-3 threads) -- Verify packet-surface mapping correctness with multi-frame packets +- **⚠️ Multi-thread race test**: Verify no pending context overwrites +- Verify CurrPicIdx is used correctly as slot index - Verify FIFO ordering with out-of-order completion -**Estimated Time**: 2 hours +**Estimated Time**: 1.5 hours --- -### Phase 4: HandlePictureDecode Update (Option C) +### Phase 4: HandlePictureDecode Update (Simplified) **Files to Modify**: - `NVDECAV1Decoder.cpp` - Modify `HandlePictureDecode()` callback **Tasks**: -- [ ] **🎯 Option C**: Extract `slot_id` from `pic_params->nTimeStamp` -- [ ] **🎯 Option C**: Calculate `slot_idx = slot_id % RING_BUFFER_SIZE` (NO map lookup!) 
-- [ ] **🎯 Multi-frame**: Use `slot.picture_indices.push_back(CurrPicIdx)` (NOT single index!) -- [ ] Add debug logging for multi-frame packets +- [ ] **Direct CurrPicIdx**: Use `slot_idx = pic_params->CurrPicIdx` (0-7) +- [ ] **Find pending context**: Calculate `pending_idx = submission_id % 8` +- [ ] **Copy submission context**: Copy `m_pendingSubmissions[pending_idx]` to `slot` +- [ ] **Release pending slot**: Set `m_pendingSubmissions[pending_idx].in_use = false` +- [ ] **Store picture_index**: Set `slot.picture_index = slot_idx` +- [ ] **Mark slot active**: `slot.in_use = true`, `slot.is_ready = false` +- [ ] Add debug logging for slot assignment and pending release **Testing**: -- Verify timestamp → slot_idx direct calculation works -- Verify picture_indices vector correctly stores multiple frames -- Test with video that has multi-frame packets (test_720p_stripe.webm) +- Verify CurrPicIdx maps correctly to slot index +- **⚠️ Critical**: Verify pending_idx calculation is correct +- Verify submission context is copied correctly +- Verify pending slot is released immediately after copy +- Test with both simple and complex videos -**Estimated Time**: 30 minutes +**Estimated Time**: 25 minutes --- -### Phase 5: Integration Testing (Option C Validation) +### Phase 5: Integration Testing (Simplified Design Validation) **Test Scenarios**: -1. **Single packet decode** - Verify Option C basic functionality -2. **Multi-frame packet** - Verify vector-based picture_indices handling (test_720p_stripe.webm) -3. **Sequential 3 packets** - Verify FIFO order using m_returnIdCounter -4. **Out-of-order completion** - Verify slot_id → slot_idx mapping (I-frame after P-frame) -5. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) -6. **Decode errors** - Verify graceful failure with multi-frame packets -7. **Performance benchmark** - Measure latency reduction vs old queue-based approach +1. **Single packet decode** - Verify CurrPicIdx direct usage +2. **Sequential 3 packets** - Verify FIFO order using m_returnCounter +3. **Out-of-order completion** - Verify submission_id FIFO (I-frame after P-frame) +4. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) +5. **Decode errors** - Verify graceful failure and slot release +6. 
**Performance benchmark** - Measure latency reduction vs old queue-based approach **Test Files**: - Simple test video (simple_test.webm) - basic validation -- **Multi-frame packet video (test_720p_stripe.webm)** - critical multi-frame test ⚠️ +- Complex video (test_720p_stripe.webm) - stress testing **Validation Criteria**: -- ✅ No slot ID → slot_idx mapping errors -- ✅ All frames from multi-frame packets detected and polled +- ✅ CurrPicIdx used correctly as slot index (no calculation errors) +- ✅ submission_id correctly tracks FIFO order - ✅ FIFO order maintained even with out-of-order GPU completion - ✅ No memory corruption or race conditions +- ✅ No mapping table lookups (simpler code path) -**Estimated Time**: 2 hours +**Estimated Time**: 1.5 hours --- ### Phase 6: Documentation & Cleanup **Tasks**: -- [x] Update NVDEC design documentation with Option C and multi-frame support -- [ ] Add inline code comments explaining Option C design choices -- [ ] Remove old queue-based code and any ID→Index mapping attempts +- [x] Update NVDEC design documentation with simplified CurrPicIdx design +- [ ] Add inline code comments explaining CurrPicIdx direct usage +- [ ] Remove old queue-based code and any mapping table attempts +- [ ] Remove ParseContext struct and related code - [ ] Move design doc to `docs/completed/` after successful implementation -- [ ] Document multi-frame packet behavior and first-frame-only decision +- [ ] Document NVDEC's ring buffer philosophy for future reference -**Estimated Time**: 1 hour +**Estimated Time**: 45 minutes --- -## 🎯 Option C Design Summary +**Total Estimated Time**: ~5 hours (reduced from ~7 hours with previous design) -### **Core Principle**: Eliminate mapping overhead through deterministic calculation - -**Key Components**: -1. **Single Counter for Dual Purpose**: `m_slotIdCounter` serves as both unique ID and submission order -2. **Direct Slot Calculation**: `slot_idx = slot_id % RING_BUFFER_SIZE` (no map needed) -3. **FIFO via ID Comparison**: `while(m_returnIdCounter != my_id)` ensures ordering -4. 
**Multi-Frame Vector**: `std::vector picture_indices` handles packets with multiple frames - -**Data Flow**: -``` -DecodeToSurface: - my_id = m_slotIdCounter++ (e.g., 17) - slot_idx = 17 % 8 = 1 - packet.timestamp = 17 - -HandlePictureDecode: - slot_id = pic_params->nTimeStamp (17) - slot_idx = 17 % 8 = 1 - m_ringBuffer[1].picture_indices.push_back(CurrPicIdx) - -PollingThread: - return_id = m_returnIdCounter (17) - slot_idx = 17 % 8 = 1 - Check all m_ringBuffer[1].picture_indices[] - -DecodeToSurface (wait): - while(m_returnIdCounter != 17) { wait } - Process frame from picture_indices[0] - m_returnIdCounter++ (18) -``` - -**Eliminated Complexity**: -- ❌ No `std::unordered_map` mapping -- ❌ No global map mutex -- ❌ No map insert/erase operations -- ❌ No lookup failures or stale entries - -**Multi-Frame Packet Handling**: -- One packet → multiple HandlePictureDecode calls -- All frames stored in `picture_indices` vector -- PollingThread checks ALL frames complete -- Return first frame only (`picture_indices[0]`) --- @@ -907,6 +1122,16 @@ void PollingThreadFunc() { | 2025-10-04 | Claude | Added RingBuffer architecture details | | 2025-10-04 | Claude | Added polling thread design | | 2025-10-04 | Claude | Added performance analysis | +| 2025-10-05 | Claude | **MAJOR UPDATE**: Simplified design to use NVDEC's CurrPicIdx directly | +| 2025-10-05 | Claude | Removed ParseContext approach (failed due to NVDEC parameter copying) | +| 2025-10-05 | Claude | Removed custom slot_id allocation and mapping tables | +| 2025-10-05 | Claude | Added "NVDEC CurrPicIdx Design Philosophy" section | +| 2025-10-05 | Claude | Updated all components to use simplified architecture | +| 2025-10-05 | Claude | Reduced implementation time estimate from 7h to 5h | +| 2025-10-05 | Claude | **CRITICAL FIX**: Changed m_pendingSubmission to ring buffer array | +| 2025-10-05 | Claude | Added PendingSubmission[8] to prevent multi-thread overwrites | +| 2025-10-05 | Claude | Updated all components to use pending_idx = submission_id % 8 | +| 2025-10-05 | Claude | Added overflow protection and pending slot release logic | --- diff --git a/vav2/platforms/windows/vavcore/VavCore.vcxproj b/vav2/platforms/windows/vavcore/VavCore.vcxproj index 7670e2b..48efdcd 100644 --- a/vav2/platforms/windows/vavcore/VavCore.vcxproj +++ b/vav2/platforms/windows/vavcore/VavCore.vcxproj @@ -118,6 +118,7 @@ + @@ -141,6 +142,7 @@ + diff --git a/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp new file mode 100644 index 0000000..ae43f6b --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp @@ -0,0 +1,121 @@ +#include "pch.h" +#include "VavCoreLogger.h" + +#include +#include + +#ifdef _WIN32 +#include +#endif + +namespace VavCore { + +VavCoreLogger& VavCoreLogger::GetInstance() { + static VavCoreLogger instance; + return instance; +} + +void VavCoreLogger::LogDebugF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_DEBUG, format, args); + va_end(args); +} + +void VavCoreLogger::LogInfoF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_INFO, format, args); + va_end(args); +} + +void VavCoreLogger::LogWarningF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_WARNING, format, args); + va_end(args); +} + +void VavCoreLogger::LogErrorF(const char* format, ...) 
{ + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_ERROR, format, args); + va_end(args); +} + +void VavCoreLogger::LogDebug(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_DEBUG, message, source); +} + +void VavCoreLogger::LogInfo(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_INFO, message, source); +} + +void VavCoreLogger::LogWarning(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_WARNING, message, source); +} + +void VavCoreLogger::LogError(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_ERROR, message, source); +} + +void VavCoreLogger::LogFormattedV(VC_LogLevel level, const char* format, va_list args) { + // Format message + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), format, args); + + // Output to console + if (level == VC_LogLevel::VC_ERROR || level == VC_LogLevel::VC_WARNING) { + std::cerr << buffer; + if (buffer[strlen(buffer) - 1] != '\n') { + std::cerr << std::endl; + } + } else { + std::cout << buffer; + if (buffer[strlen(buffer) - 1] != '\n') { + std::cout << std::endl; + } + } + +#ifdef _WIN32 + // Output to Visual Studio debug console + OutputDebugStringA(buffer); + if (buffer[strlen(buffer) - 1] != '\n') { + OutputDebugStringA("\n"); + } +#endif +} + +void VavCoreLogger::LogString(VC_LogLevel level, const std::string& message, const char* source) { + std::string fullMessage; + if (source) { + fullMessage = "[" + std::string(source) + "] " + message; + } else { + fullMessage = message; + } + + // Output to console + if (level == VC_LogLevel::VC_ERROR || level == VC_LogLevel::VC_WARNING) { + std::cerr << fullMessage << std::endl; + } else { + std::cout << fullMessage << std::endl; + } + +#ifdef _WIN32 + // Output to Visual Studio debug console + OutputDebugStringA(fullMessage.c_str()); + OutputDebugStringA("\n"); +#endif +} + +const char* VavCoreLogger::GetLevelString(VC_LogLevel level) { + switch (level) { + case VC_LogLevel::VC_DEBUG: return "[DEBUG]"; + case VC_LogLevel::VC_INFO: return "[INFO]"; + case VC_LogLevel::VC_WARNING: return "[WARNING]"; + case VC_LogLevel::VC_ERROR: return "[ERROR]"; + default: return "[UNKNOWN]"; + } +} + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h new file mode 100644 index 0000000..1c26821 --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + +namespace VavCore { + +// Simple singleton logger for VavCore +class VavCoreLogger { +public: + enum class VC_LogLevel { + VC_DEBUG, + VC_INFO, + VC_WARNING, + VC_ERROR + }; + + static VavCoreLogger& GetInstance(); + + // Delete copy/move constructors + VavCoreLogger(const VavCoreLogger&) = delete; + VavCoreLogger& operator=(const VavCoreLogger&) = delete; + + // Printf-style formatted logging + void LogDebugF(const char* format, ...); + void LogInfoF(const char* format, ...); + void LogWarningF(const char* format, ...); + void LogErrorF(const char* format, ...); + + // std::string-style logging with optional source + void LogDebug(const std::string& message, const char* source = nullptr); + void LogInfo(const std::string& message, const char* source = nullptr); + void LogWarning(const std::string& message, const char* source = nullptr); + void LogError(const std::string& message, const char* source = nullptr); + +private: + VavCoreLogger() = default; + + void 
LogFormattedV(VC_LogLevel level, const char* format, va_list args); + void LogString(VC_LogLevel level, const std::string& message, const char* source); + const char* GetLevelString(VC_LogLevel level); +}; + +} // namespace VavCore + +// Global logging macros for VavCore + +// LOGF_* : For printf-style formatted messages (const char* format, ...) +// Usage: LOGF_INFO("[Component] Value: %d", value) +#define LOGF_DEBUG(...) VavCore::VavCoreLogger::GetInstance().LogDebugF(__VA_ARGS__) +#define LOGF_INFO(...) VavCore::VavCoreLogger::GetInstance().LogInfoF(__VA_ARGS__) +#define LOGF_WARNING(...) VavCore::VavCoreLogger::GetInstance().LogWarningF(__VA_ARGS__) +#define LOGF_ERROR(...) VavCore::VavCoreLogger::GetInstance().LogErrorF(__VA_ARGS__) + +// LOGS_* : For std::string messages with optional source parameter +// Usage: LOGS_INFO(errorMessage, "NetworkModule") +#define LOGS_DEBUG(msg, source) VavCore::VavCoreLogger::GetInstance().LogDebug(msg, source) +#define LOGS_INFO(msg, source) VavCore::VavCoreLogger::GetInstance().LogInfo(msg, source) +#define LOGS_WARNING(msg, source) VavCore::VavCoreLogger::GetInstance().LogWarning(msg, source) +#define LOGS_ERROR(msg, source) VavCore::VavCoreLogger::GetInstance().LogError(msg, source) diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index 8b10bc5..77f2d49 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -20,6 +20,7 @@ #include "NVDECAV1Decoder.h" #include "D3D12SurfaceHandler.h" #include "VideoDecoderFactory.h" +#include "../Common/VavCoreLogger.h" const char* g_deinterleave_kernel_ptx = R"PTX( // @@ -835,8 +836,13 @@ void NVDECAV1Decoder::CleanupCUDA() { // NVDEC Callbacks int CUDAAPI NVDECAV1Decoder::HandleVideoSequence(void* user_data, CUVIDEOFORMAT* format) { + if (!user_data || !format) { + return 0; + } + + // user_data is always NVDECAV1Decoder* in the simplified design auto* decoder = static_cast(user_data); - if (!decoder || !format) { + if (!decoder) { return 0; } @@ -918,120 +924,97 @@ int CUDAAPI NVDECAV1Decoder::HandleVideoSequence(void* user_data, CUVIDEOFORMAT* return format->min_num_decode_surfaces; } +// Simplified HandlePictureDecode: Use NVDEC's CurrPicIdx directly as ring buffer slot index int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) { - OutputDebugStringA("[HandlePictureDecode] === CALLBACK CALLED ===\n"); + if (!user_data || !pic_params) { + LOGF_ERROR("[HandlePictureDecode] Invalid user_data or pic_params"); + return 0; + } auto* decoder = static_cast(user_data); - if (!decoder || !pic_params) { - OutputDebugStringA("[HandlePictureDecode] ERROR: Invalid user_data or pic_params\n"); + + // Use NVDEC's CurrPicIdx directly as slot index (0-7) + int slot_idx = pic_params->CurrPicIdx; + + LOGF_DEBUG("[HandlePictureDecode] CurrPicIdx=%d, IntraPicFlag=%d", + slot_idx, pic_params->intra_pic_flag); + + // Validate slot index (should always be 0-7) + if (slot_idx < 0 || slot_idx >= RING_BUFFER_SIZE) { + LOGF_ERROR("[HandlePictureDecode] Invalid CurrPicIdx=%d", slot_idx); return 0; } - if (!decoder->m_decoder) { - OutputDebugStringA("[HandlePictureDecode] ERROR: m_decoder is nullptr!\n"); - return 0; + DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; + + // Find pending submission context using submission_id + uint64_t submission_id; + size_t pending_idx; + { + std::lock_guard 
lock(decoder->m_submissionMutex); + + // Find the most recent pending submission + submission_id = decoder->m_submissionCounter.load() - 1; + pending_idx = submission_id % RING_BUFFER_SIZE; + + auto& pending = decoder->m_pendingSubmissions[pending_idx]; + + // Copy pending submission context to decode slot + slot.target_surface = pending.target_surface; + slot.surface_type = pending.surface_type; + slot.submission_id = pending.submission_id; + + // Release pending slot for reuse + pending.in_use.store(false); } + slot.picture_index = slot_idx; // Store CurrPicIdx (same as slot_idx) + slot.in_use.store(true); + slot.is_ready.store(false); + + LOGF_INFO("[HandlePictureDecode] Calling cuvidDecodePicture (slot=%d, submission_id=%llu, pending_idx=%zu)", + slot_idx, slot.submission_id, pending_idx); + // Ensure CUDA context is current before decoding if (decoder->m_cuContext) { CUresult ctxResult = cuCtxSetCurrent(decoder->m_cuContext); if (ctxResult != CUDA_SUCCESS) { - char debug_buf[256]; - sprintf_s(debug_buf, "[HandlePictureDecode] Failed to set CUDA context: %d\n", ctxResult); - OutputDebugStringA(debug_buf); + LOGF_ERROR("[HandlePictureDecode] Failed to set CUDA context: %d", ctxResult); + slot.in_use.store(false); return 0; } } - // Store mapping: picture_index (CurrPicIdx) slot_id (last allocated) - // NVDEC ignores our timestamp and uses WebM PTS, so we need this mapping - uint64_t current_slot_id = decoder->m_slotIdCounter.load() - 1; - { - std::lock_guard lock(decoder->m_mappingMutex); - decoder->m_pictureIndexToSlotId[pic_params->CurrPicIdx] = current_slot_id; - } - - char debug_buf[512]; - sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, slot_id=%llu, IntraPicFlag=%d)\n", - decoder->m_decoder, pic_params->CurrPicIdx, current_slot_id, pic_params->intra_pic_flag); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - + // Submit frame to NVDEC decoder CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params); if (result != CUDA_SUCCESS) { - sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture failed with code %d\n", result); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - decoder->LogCUDAError(result, "cuvidDecodePicture"); + LOGF_ERROR("[HandlePictureDecode] cuvidDecodePicture failed: %d", result); + slot.in_use.store(false); return 0; } - sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture succeeded (CurrPicIdx=%d)\n", pic_params->CurrPicIdx); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); + // Polling thread will check cuvidGetDecodeStatus() for this picture_index - return 1; // Success + return 1; } int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDISPINFO* disp_info) { - auto* decoder = static_cast(user_data); - if (!decoder || !disp_info) { - OutputDebugStringA("[HandlePictureDisplay] ERROR: Invalid user_data or disp_info\n"); + if (!user_data || !disp_info) { + LOGF_ERROR("[HandlePictureDisplay] Invalid user_data or disp_info"); return 0; } - // Lookup slot_id from mapping table (picture_index slot_id) - uint64_t slot_id; - { - std::lock_guard lock(decoder->m_mappingMutex); - auto it = decoder->m_pictureIndexToSlotId.find(disp_info->picture_index); - if (it == decoder->m_pictureIndexToSlotId.end()) { - char err_buf[256]; - sprintf_s(err_buf, "[HandlePictureDisplay] ERROR: picture_index=%d not found in mapping table\n", - disp_info->picture_index); - OutputDebugStringA(err_buf); - printf("%s", err_buf); - return 0; - } - slot_id = 
@@ -1202,175 +1185,147 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                                       VavCoreSurfaceType target_type, void* target_surface,
                                       VideoFrame& output_frame) {
-    char debug_buf[256];
-    sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Called with target_type=%d (0=CPU, 1=D3D11, 2=D3D12, 3=CUDA, 4=AMF)\n",
-              static_cast<int>(target_type));
-    OutputDebugStringA(debug_buf);
+    LOGF_DEBUG("[DecodeToSurface] Called with target_type=%d", static_cast<int>(target_type));
 
     if (!m_initialized || !packet_data) {
-        OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] ERROR: Not initialized or null packet_data\n");
+        LOGF_ERROR("[DecodeToSurface] Not initialized or null packet_data");
         return false;
     }
 
-    // CRITICAL: Set CUDA context for current thread (may be background thread)
-    // Thread safety: Lock mutex ONLY for context switching, then release immediately
+    // Set CUDA context for current thread
    {
         std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
         CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
         if (ctxResult != CUDA_SUCCESS) {
-            sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] ERROR: cuCtxSetCurrent failed with code %d\n", ctxResult);
-            OutputDebugStringA(debug_buf);
+            LOGF_ERROR("[DecodeToSurface] cuCtxSetCurrent failed with code %d", ctxResult);
             return false;
         }
-    }  // Unlock mutex immediately after context switch
+    }
 
     if (!SupportsSurfaceType(target_type)) {
-        sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] ERROR: Surface type %d not supported\n",
-                  static_cast<int>(target_type));
-        OutputDebugStringA(debug_buf);
+        LOGF_ERROR("[DecodeToSurface] Surface type %d not supported",
+                   static_cast<int>(target_type));
         return false;
     }
 
     try {
         auto start_time = std::chrono::high_resolution_clock::now();
 
-        // ===== Phase 1: Option C Slot Allocation =====
-        // 1. Allocate unique slot ID (serves as both ID and submission order)
-        uint64_t my_id = m_slotIdCounter.fetch_add(1);
-        size_t slot_idx = my_id % RING_BUFFER_SIZE;
-        DecodeSlot& my_slot = m_ringBuffer[slot_idx];
+        // ===== Component 1: Submission Preparation =====
+        // 1. Allocate submission ID for FIFO ordering
+        uint64_t my_submission_id = m_submissionCounter.fetch_add(1);
+        size_t pending_idx = my_submission_id % RING_BUFFER_SIZE;
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Allocated slot_id=%llu, slot_idx=%zu\n", my_id, slot_idx);
-        OutputDebugStringA(debug_buf);
+        LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu",
+                   my_submission_id, pending_idx);
 
-        // 2. Check for overflow (RingBuffer full)
-        {
-            std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-
-            if (my_slot.in_use) {
-                // RingBuffer overflow - too many concurrent decodes
-                sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", slot_idx);
-                OutputDebugStringA(debug_buf);
-                LogError("RingBuffer overflow - max 8 concurrent decodes");
-                return false;
-            }
-
-            // 3. Initialize slot (multi-frame support)
-            my_slot.in_use = true;
-            my_slot.target_surface = target_surface;
-            my_slot.surface_type = target_type;
-            my_slot.picture_indices.clear();  // Clear previous frames
-            my_slot.is_ready = false;
-        }
-
-        // ===== Component 2: Packet Submission =====
-        // 4. Send packet directly without prepending codec private data
-        const uint8_t* final_packet_data = packet_data;
-        size_t final_packet_size = packet_size;
-
-        if (!m_firstFrameSent) {
-            m_firstFrameSent = true;
-            sprintf_s(debug_buf, "[DecodeToSurface] First frame: sending packet directly without codec private data (%zu bytes)\n", packet_size);
-            OutputDebugStringA(debug_buf);
-            printf("%s", debug_buf);
-        }
-
-        // 5. Option C: Submit packet with full slot_id in timestamp (NOT modulo!)
-        CUVIDSOURCEDATAPACKET packet = {};
-        packet.payload = final_packet_data;
-        packet.payload_size = static_cast<unsigned long>(final_packet_size);
-        packet.flags = CUVID_PKT_ENDOFPICTURE;
-        packet.timestamp = static_cast<long long>(my_id);  // Pass full slot_id for HandlePictureDecode
-
-        sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot_id=%llu, slot_idx=%zu)...\n", packet.timestamp, my_id, slot_idx);
-        OutputDebugStringA(debug_buf);
-
-        CUresult result = cuvidParseVideoData(m_parser, &packet);
-        if (result != CUDA_SUCCESS) {
-            sprintf_s(debug_buf, "[DecodeToSurface] cuvidParseVideoData failed with code %d\n", result);
-            OutputDebugStringA(debug_buf);
-            LogCUDAError(result, "cuvidParseVideoData");
-            m_decodeErrors++;
-
-            // Release slot on error
-            {
-                std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-                my_slot.in_use = false;
-            }
-            return false;
-        }
-        OutputDebugStringA("[DecodeToSurface] cuvidParseVideoData succeeded\n");
-
-        // ===== Component 4: Wait for decode to complete FIRST =====
-        // 6. Wait for decode to complete (HandlePictureDisplay signals this)
-        {
-            std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
-
-            sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot_idx=%zu, is_ready=%d)...\n",
-                      slot_idx, my_slot.is_ready);
-            OutputDebugStringA(debug_buf);
-
-            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(10000),
-                [&my_slot]() { return my_slot.is_ready; })) {
-                // Timeout - decode took too long
-                sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot_idx=%zu (is_ready=%d)\n",
-                          slot_idx, my_slot.is_ready);
-                OutputDebugStringA(debug_buf);
-                printf("%s", debug_buf);
-                LogError("Decode timeout");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);  // Option C: Advance to unblock next thread
-                return false;
-            }
-
-            sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot_idx=%zu)\n", slot_idx);
-            OutputDebugStringA(debug_buf);
-        }
-
-        // ===== Component 5: Option C Sequential Return Wait (FIFO Guarantee) =====
-        // 5. Wait for my turn using slot_id (NOT slot_idx!) AFTER decoding completes
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Waiting for FIFO turn slot_id=%llu (current returnIdCounter=%llu)...\n",
-                  my_id, m_returnIdCounter.load());
-        OutputDebugStringA(debug_buf);
-
-        while (m_returnIdCounter.load() != my_id) {
+        // 2. Wait if pending slot is still in use (overflow protection)
+        while (m_pendingSubmissions[pending_idx].in_use.load()) {
             std::this_thread::sleep_for(std::chrono::milliseconds(1));
         }
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: slot_id=%llu is now return head\n", my_id);
-        OutputDebugStringA(debug_buf);
+        // 3. Store submission context in ring buffer slot
+        {
+            std::lock_guard<std::mutex> lock(m_submissionMutex);
+            auto& pending = m_pendingSubmissions[pending_idx];
 
-        // ===== Component 5: Option C Frame Retrieval (Multi-Frame Support) =====
-        // Get first frame from multi-frame packet (picture_indices[0])
-        if (my_slot.picture_indices.empty()) {
-            sprintf_s(debug_buf, "[DecodeToSurface] ERROR: No picture_indices in slot %zu\n", slot_idx);
-            OutputDebugStringA(debug_buf);
-            LogError("No frames decoded");
-            my_slot.in_use = false;
-            m_returnIdCounter.fetch_add(1);
+            pending.target_surface = target_surface;
+            pending.surface_type = target_type;
+            pending.submission_id = my_submission_id;
+            pending.in_use.store(true);
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
+                   my_submission_id, pending_idx);
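Note: the handoff that steps 1-3 set up, reduced to a sketch (names from this diff; correctness hinges on cuvidParseVideoData being synchronous and on one packet producing at most one HandlePictureDecode call):

  // Thread T (DecodeToSurface):
  //   1. id = m_submissionCounter.fetch_add(1)          -> take a ticket
  //   2. m_pendingSubmissions[id % 8] = {surface, type} -> publish context
  //   3. cuvidParseVideoData(m_parser, &packet)         -> blocks; inside it,
  //      HandlePictureDecode reads m_submissionCounter - 1 and copies the
  //      context into m_ringBuffer[CurrPicIdx] before the call returns.

If two threads ever sit between steps 2 and 3 at the same time, the callback's "most recent pending submission" lookup can pair a picture with the wrong context; worth flagging in the design doc if multi-threaded submission is expected.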
+
+        // ===== Component 2: Packet Submission =====
+        // 4. Submit packet to NVDEC parser (synchronous)
+        CUVIDSOURCEDATAPACKET packet = {};
+        packet.payload = packet_data;
+        packet.payload_size = static_cast<unsigned long>(packet_size);
+        packet.flags = CUVID_PKT_ENDOFPICTURE;
+        packet.timestamp = 0;  // Not used - we use submission_id instead
+
+        LOGF_INFO("[DecodeToSurface] Calling cuvidParseVideoData (submission_id=%llu)...",
+                  my_submission_id);
+
+        CUresult result = cuvidParseVideoData(m_parser, &packet);
+        // cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode is called before it returns
+
+        if (result != CUDA_SUCCESS) {
+            LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result);
+            LogCUDAError(result, "cuvidParseVideoData");
+            m_decodeErrors++;
+
+            // Release pending slot on error
+            {
+                std::lock_guard<std::mutex> lock(m_submissionMutex);
+                m_pendingSubmissions[pending_idx].in_use.store(false);
+            }
             return false;
         }
 
-        int frameIdx = my_slot.picture_indices[0];  // Return first frame only
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu decode complete, using first frame (picture_index=%d, total_frames=%zu)\n",
-                  slot_idx, frameIdx, my_slot.picture_indices.size());
-        OutputDebugStringA(debug_buf);
+        LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
+
+        // ===== Component 4: Wait and Retrieve =====
+        // 5. Find which slot NVDEC used (check all slots for our submission_id)
+        int my_slot_idx = -1;
+        for (int i = 0; i < RING_BUFFER_SIZE; ++i) {
+            if (m_ringBuffer[i].submission_id == my_submission_id) {
+                my_slot_idx = i;
+                break;
+            }
+        }
+
+        if (my_slot_idx == -1) {
+            LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id);
+            return false;
+        }
+
+        DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
+        LOGF_DEBUG("[DecodeToSurface] Found slot_idx=%d for submission_id=%llu", my_slot_idx, my_submission_id);
+
+        // 6. Wait for my turn in FIFO order
+        while (m_returnCounter.load() != my_submission_id) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id);
+
+        // 7. Wait for decode to complete
+        {
+            std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
+
+            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
+                [&my_slot]() { return my_slot.is_ready.load(); })) {
+                // Timeout - decode took too long
+                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);  // Skip to avoid deadlock
+                return false;
+            }
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] Decode complete for slot %d", my_slot_idx);
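Note: the ordering gate in steps 6-7, reduced to its core (sketch; m_returnCounter only ever advances, so each submission id passes the gate exactly once):

  std::atomic<uint64_t> return_counter{0};

  void WaitMyTurn(uint64_t my_id) {   // FIFO gate: spin until all predecessors returned
      while (return_counter.load() != my_id)
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }

  void ReleaseTurn() {                // unblocks my_id + 1
      return_counter.fetch_add(1);
  }

One consequence worth recording: the 500 ms decode wait starts only after the FIFO gate, so a stalled predecessor delays, but does not consume, this frame's timeout budget.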
+
+        // ===== Component 5: Frame Retrieval & Cleanup =====
+        // 8. Map decoded frame from NVDEC using the slot's picture_index
+        int pic_idx = my_slot.picture_index;  // CurrPicIdx from NVDEC
 
-        // For CUDA device surface, we need to map the decoded frame
         if (target_type == VAVCORE_SURFACE_CUDA_DEVICE) {
-            OutputDebugStringA("[DecodeToSurface] CUDA device path\n");
+            // CUDA device surface path
+            LOGF_DEBUG("[DecodeToSurface] CUDA device path");
 
-            // 7. Map decoded frame from NVDEC
             CUdeviceptr devicePtr = 0;
             unsigned int pitch = 0;
             CUVIDPROCPARAMS procParams = {};
             procParams.progressive_frame = 1;
 
-            result = cuvidMapVideoFrame(m_decoder, frameIdx, &devicePtr, &pitch, &procParams);
+            CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &devicePtr, &pitch, &procParams);
             if (result != CUDA_SUCCESS) {
+                LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx);
                 LogCUDAError(result, "cuvidMapVideoFrame");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1385,18 +1340,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             output_frame.frame_index = m_framesDecoded;
             output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
 
-            // 9. Unmap frame
+            // Unmap frame
             cuvidUnmapVideoFrame(m_decoder, devicePtr);
 
         } else if (target_type == VAVCORE_SURFACE_D3D12_RESOURCE) {
             // D3D12 resource path using D3D12SurfaceHandler
-            OutputDebugStringA("[DecodeToSurface] D3D12 resource path\n");
+            LOGF_DEBUG("[DecodeToSurface] D3D12 resource path");
 
             if (!target_surface || !m_d3d12Device) {
-                OutputDebugStringA("[DecodeToSurface] ERROR: target_surface or m_d3d12Device is null\n");
-                LogError("D3D12 resource or device not available");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                LOGF_ERROR("[DecodeToSurface] target_surface or m_d3d12Device is null");
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1414,55 +1368,43 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                 m_cachedD3D12Device = m_d3d12Device;
                 m_cachedCuContext = m_cuContext;
 
-                OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler (re)created\n");
+                LOGF_DEBUG("[DecodeToSurface] D3D12SurfaceHandler (re)created");
             }
 
-            sprintf_s(debug_buf, "[DecodeToSurface] Using decoded frame: frameIdx=%d\n", frameIdx);
-            OutputDebugStringA(debug_buf);
-
-            // 7. Map decoded NVDEC frame
+            // Map decoded NVDEC frame
             CUVIDPROCPARAMS procParams = {};
             procParams.progressive_frame = 1;
-            procParams.top_field_first = 0;
-            procParams.unpaired_field = 0;
 
             CUdeviceptr srcDevicePtr = 0;
             unsigned int srcPitch = 0;
-            result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
+            CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &srcDevicePtr, &srcPitch, &procParams);
             if (result != CUDA_SUCCESS) {
-                sprintf_s(debug_buf, "[DecodeToSurface] cuvidMapVideoFrame failed with code %d\n", result);
-                OutputDebugStringA(debug_buf);
+                LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx);
                 LogCUDAError(result, "cuvidMapVideoFrame");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
 
-            sprintf_s(debug_buf, "[DecodeToSurface] cuvidMapVideoFrame succeeded: srcDevicePtr=%p, srcPitch=%u\n",
-                      (void*)srcDevicePtr, srcPitch);
-            OutputDebugStringA(debug_buf);
+            LOGF_DEBUG("[DecodeToSurface] cuvidMapVideoFrame succeeded: srcDevicePtr=%p, srcPitch=%u",
+                       (void*)srcDevicePtr, srcPitch);
 
-            // 8. Copy to D3D12 surface
-            ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
+            // Copy to D3D12 surface (use target_surface from slot)
+            ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(my_slot.target_surface);
             bool copySuccess = m_d3d12Handler->CopyNV12Frame(
                 srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height
             );
 
-            // 9. Unmap frame
-            result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
-            if (result != CUDA_SUCCESS) {
-                sprintf_s(debug_buf, "[DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result);
-                OutputDebugStringA(debug_buf);
-                LogCUDAError(result, "cuvidUnmapVideoFrame");
-            }
+            // Unmap frame
+            cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
 
             if (!copySuccess) {
-                OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                LOGF_ERROR("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed");
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1470,7 +1412,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             m_d3d12Handler->SignalD3D12Fence(++m_fenceValue);
             output_frame.sync_fence_value = m_fenceValue;
 
-            OutputDebugStringA("[DecodeToSurface] D3D12 frame processing complete\n");
+            LOGF_DEBUG("[DecodeToSurface] D3D12 frame processing complete");
 
             // Fill output frame metadata
             output_frame.width = m_width;
@@ -1480,17 +1422,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
         }
 
-        // 10. Release slot
+        // 9. Release slot
         {
             std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-            my_slot.in_use = false;
+            my_slot.in_use.store(false);
         }
 
-        // 11. Option C: Advance return ID counter (FIFO order)
-        m_returnIdCounter.fetch_add(1);
+        LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx);
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu released, returnIdCounter advanced to %llu\n", slot_idx, m_returnIdCounter.load());
-        OutputDebugStringA(debug_buf);
+        // 10. Advance return counter (FIFO order)
+        m_returnCounter.fetch_add(1);
 
         // Update statistics
         auto end_time = std::chrono::high_resolution_clock::now();
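Note: both decode paths above bracket the copy with cuvidMapVideoFrame/cuvidUnmapVideoFrame around several early returns. A small RAII wrapper (our own sketch, not an NVDEC API) would keep the pair balanced on every exit path:

  struct MappedFrame {
      CUvideodecoder dec;
      CUdeviceptr ptr = 0;
      unsigned int pitch = 0;
      bool ok = false;

      MappedFrame(CUvideodecoder d, int pic_idx, CUVIDPROCPARAMS* vpp) : dec(d) {
          ok = (cuvidMapVideoFrame(d, pic_idx, &ptr, &pitch, vpp) == CUDA_SUCCESS);
      }
      ~MappedFrame() { if (ok) cuvidUnmapVideoFrame(dec, ptr); }  // always unmapped
  };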
@@ -1537,77 +1478,51 @@ bool NVDECAV1Decoder::SetupCUDAD3D12Interop(void* d3d_device) {
     return true;
 }
 
+// Simplified Polling Thread: Check all 8 slots for decode completion
 void NVDECAV1Decoder::PollingThreadFunc() {
-    OutputDebugStringA("[PollingThread] Started\n");
-    printf("[PollingThread] Started\n");
+    LOGF_INFO("[PollingThread] Started");
 
     // Set CUDA context once at thread start
     if (m_cuContext) {
         std::lock_guard<std::mutex> lock(m_cudaContextMutex);
         cuCtxSetCurrent(m_cuContext);
-        OutputDebugStringA("[PollingThread] CUDA context set\n");
+        LOGF_DEBUG("[PollingThread] CUDA context set");
     }
 
-    int poll_count = 0;
     while (m_pollingRunning) {
-        // Option C: Get current return ID and calculate slot index
-        uint64_t current_return_id = m_returnIdCounter.load();
-        size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
-        DecodeSlot& slot = m_ringBuffer[slot_idx];
+        // 1. Check all slots for pending decodes
+        for (int slot_idx = 0; slot_idx < RING_BUFFER_SIZE; ++slot_idx) {
+            DecodeSlot& slot = m_ringBuffer[slot_idx];
 
-        // Debug logging every 100 iterations
-        if (poll_count % 100 == 0) {
-            char debug_buf[256];
-            sprintf_s(debug_buf, "[PollingThread] Option C: Poll #%d: returnId=%llu, slot=%zu, in_use=%d, is_ready=%d, frames=%zu\n",
-                      poll_count, current_return_id, slot_idx, slot.in_use, slot.is_ready, slot.picture_indices.size());
-            OutputDebugStringA(debug_buf);
-        }
-        poll_count++;
-
-        // Check if slot is in use and not yet ready
-        if (slot.in_use && !slot.is_ready) {
-            // Option C Multi-Frame Support: Get copy of picture indices
-            std::vector<int> picture_indices_copy;
-            {
-                std::lock_guard<std::mutex> lock(slot.slot_mutex);
-                picture_indices_copy = slot.picture_indices;
+            // 2. Skip slots not in use or already ready
+            if (!slot.in_use.load() || slot.is_ready.load()) {
+                continue;
             }
 
-            // Check if ALL frames are decoded
-            bool all_complete = true;
-            for (int pic_idx : picture_indices_copy) {
-                CUVIDGETDECODESTATUS decodeStatus = {};
-                CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
+            // 3. Query NVDEC decode status for this slot's picture
+            CUVIDGETDECODESTATUS decodeStatus = {};
+            CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
 
-                if (result != CUDA_SUCCESS ||
-                    decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
-                    all_complete = false;
-                    break;
-                }
-            }
+            // 4. If decode complete, mark slot ready
+            if (result == CUDA_SUCCESS &&
+                decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
 
-            // If all frames complete, signal ready
-            if (all_complete && !picture_indices_copy.empty()) {
                 {
                     std::lock_guard<std::mutex> lock(slot.slot_mutex);
-                    slot.is_ready = true;
+                    slot.is_ready.store(true);
                 }
                 slot.frame_ready.notify_one();
 
-                char debug_buf[256];
-                sprintf_s(debug_buf, "[PollingThread] Option C: Slot %zu ALL frames complete (%zu frames)\n",
-                          slot_idx, picture_indices_copy.size());
-                OutputDebugStringA(debug_buf);
-                printf("%s", debug_buf);
+                LOGF_DEBUG("[PollingThread] Slot %d ready (submission_id=%llu)",
+                           slot_idx, slot.submission_id);
             }
         }
 
-        // Sleep to avoid busy-wait (~1% CPU usage)
+        // 5. Sleep to avoid busy-wait (~1% CPU usage)
         std::this_thread::sleep_for(std::chrono::microseconds(100));
     }
 
-    OutputDebugStringA("[PollingThread] Stopped\n");
-    printf("[PollingThread] Stopped\n");
+    LOGF_INFO("[PollingThread] Stopped");
 }
 
 } // namespace VavCore
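Note: the slot lifecycle implied by the two files, recorded here for the design doc (the ordering relies on the atomics declared below: HandlePictureDecode writes picture_index before in_use.store(true), and the polling thread checks in_use first, so it never reads a stale picture_index):

  // free     : in_use == false, is_ready == false
  // pending  : DecodeToSurface published a PendingSubmission entry
  // decoding : HandlePictureDecode copied it into m_ringBuffer[CurrPicIdx], in_use = true
  // ready    : PollingThreadFunc saw cuvidDecodeStatus_Success -> is_ready = true + notify
  // free     : the FIFO-head DecodeToSurface mapped/copied the frame, in_use = false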
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
index c44d3cf..8da4cab 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
@@ -20,6 +20,8 @@ typedef struct CUexternalSemaphore_st* cudaExternalSemaphore_t;
 
 namespace VavCore {
 
+// Removed ParseContext - not needed with CurrPicIdx direct usage
+
 // NVIDIA NVDEC-based AV1 decoder for hardware acceleration
 class NVDECAV1Decoder : public IVideoDecoder {
 public:
@@ -145,33 +147,40 @@ private:
     // RingBuffer for asynchronous decoding (ulNumOutputSurfaces = 8)
     static constexpr size_t RING_BUFFER_SIZE = 8;
 
+    // Simplified DecodeSlot structure (no picture_indices vector)
     struct DecodeSlot {
         // Slot state
-        bool in_use = false;                  // Is this slot allocated?
+        std::atomic<bool> in_use{false};      // Is this NVDEC slot currently decoding?
 
-        // Submitted information
+        // Submitted information (set by DecodeToSurface)
         void* target_surface = nullptr;       // Destination D3D12 resource
         VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
+        uint64_t submission_id = 0;           // For FIFO ordering (monotonic counter)
 
-        // NVDEC information (from HandlePictureDecode callback)
-        // Multi-frame support: One packet can decode to multiple frames
-        std::vector<int> picture_indices;     // All NVDEC frame indices from this packet
+        // NVDEC information (set by HandlePictureDecode callback)
+        int picture_index = -1;               // CurrPicIdx from NVDEC (same as slot index)
 
         // Synchronization primitives
-        std::condition_variable frame_ready;  // Signaled when ALL frames are decoded
+        std::condition_variable frame_ready;  // Signaled when decode complete
         std::mutex slot_mutex;                // Protects this slot's state
-        bool is_ready = false;                // All frames decoded flag
+        std::atomic<bool> is_ready{false};    // Decode completed flag
     };
 
     DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
 
-    // Option C: Unified slot allocation counters (no mapping needed!)
-    std::atomic<uint64_t> m_slotIdCounter{0};    // Monotonically increasing slot ID (submission order)
-    std::atomic<uint64_t> m_returnIdCounter{0};  // Return order enforcement (FIFO)
+    // Simplified: Only submission ID for FIFO ordering
+    std::atomic<uint64_t> m_submissionCounter{0};  // Monotonic submission ID
+    std::atomic<uint64_t> m_returnCounter{0};      // Return order enforcement (FIFO)
 
-    // Mapping: picture_index → slot_id (NVDEC ignores our timestamp and uses WebM PTS)
-    std::unordered_map<int, uint64_t> m_pictureIndexToSlotId;
-    std::mutex m_mappingMutex;
+    // CRITICAL: Ring buffer for pending submissions (multi-thread safe)
+    struct PendingSubmission {
+        void* target_surface = nullptr;
+        VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
+        uint64_t submission_id = 0;
+        std::atomic<bool> in_use{false};
+    };
+    PendingSubmission m_pendingSubmissions[RING_BUFFER_SIZE];  // Ring buffer for contexts
+    std::mutex m_submissionMutex;  // Protects pending submission allocation
 
     // Polling thread for cuvidGetDecodeStatus
     std::thread m_pollingThread;