diff --git a/prompt-redesign-ringbuffer.txt b/prompt-redesign-ringbuffer.txt
index 72c390a..265b3c0 100644
--- a/prompt-redesign-ringbuffer.txt
+++ b/prompt-redesign-ringbuffer.txt
@@ -1547,3 +1547,238 @@
 4. Variable names are clear (m_slotIdCounter, m_returnIdCounter)
 
 Shall I implement it this way?
+
+
+
+> I believe the real reason the NVDEC API does not support per-picture user_data is that it processes each picture through a ring-buffer slot index,
+and hands post-processing off through that slot.
+
+● Exactly right - that is precisely NVDEC's design philosophy.
+
+  NVDEC's ring-buffer-based design:
+
+  // The picture index NVDEC provides (CurrPicIdx)
+  typedef struct _CUVIDPICPARAMS {
+      int CurrPicIdx;   // ← ring buffer slot index assigned by NVDEC (0-7)
+      // ...
+  } CUVIDPICPARAMS;
+
+  // The usage pattern NVDEC expects:
+  // 1. HandlePictureDecode: decode using CurrPicIdx
+  // 2. HandlePictureDisplay: picture_index tells you which slot is ready
+  // 3. Application: access that slot's surface via picture_index
+
+  Why NVDEC provides no user_data:
+
+  1. The ring buffer index already acts as the context:
+     - CurrPicIdx (0-7) is the unique identifier
+     - The application indexes its own metadata array with it
+  2. The structure NVDEC expects:
+     // The application design NVIDIA expects:
+     struct AppFrameContext {
+         void* surface;
+         uint64_t timestamp;
+         // ... other metadata
+     };
+
+     AppFrameContext g_frameContexts[8];  // Same size as the ring buffer
+
+     // HandlePictureDecode:
+     int idx = pic_params->CurrPicIdx;
+     g_frameContexts[idx].timestamp = current_timestamp;
+
+     // HandlePictureDisplay:
+     int idx = disp_info->picture_index;
+     DisplayFrame(g_frameContexts[idx]);
+
+  Our problem:
+
+  We use a ring buffer structure that differs from NVDEC's ring buffer (8 slots):
+  - NVDEC: CurrPicIdx (0-7, managed internally by NVDEC)
+  - VavCore: slot_id (monotonic counter), slot_idx = slot_id % 8
+
+  The point of conflict:
+
+  // What NVDEC gives us:
+  CurrPicIdx = 3            // slot 3 of NVDEC's internal ring buffer
+
+  // What we want:
+  my_id = 42                // the 42nd frame
+  slot_idx = 42 % 8 = 2     // slot 2 of our ring buffer
+
+  // ❌ CurrPicIdx ≠ slot_idx (two different ring buffers!)
+
+  Solutions:
+
+  Option 1: Use NVDEC's CurrPicIdx directly (NVIDIA's recommended approach)
+  // Manage DecodeSlot keyed by CurrPicIdx
+  DecodeSlot m_ringBuffer[8];  // 1:1 with NVDEC's ring buffer
+
+  // In HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot].target_surface = /* how do we know this? */;
+  Problem: still needs a way to link DecodeToSurface → HandlePictureDecode
+
+  Option 2: m_currentSubmittingSlotId (current approach)
+  // DecodeToSurface:
+  m_currentSubmittingSlotId.store(my_id);  // set just before submission
+  cuvidParseVideoData(...);                // synchronous call
+
+  // HandlePictureDecode (called immediately inside cuvidParseVideoData):
+  uint64_t slot_id = decoder->m_currentSubmittingSlotId.load();
+  decoder->m_pictureIndexToSlotId[pic_params->CurrPicIdx] = slot_id;  // store mapping
+
+  Option 3: Submission queue
+  std::mutex m_submissionMutex;
+  std::queue<uint64_t> m_pendingSlotIds;
+
+  // DecodeToSurface:
+  {
+      std::lock_guard<std::mutex> lock(m_submissionMutex);
+      m_pendingSlotIds.push(my_id);
+  }
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  {
+      std::lock_guard<std::mutex> lock(m_submissionMutex);
+      uint64_t slot_id = m_pendingSlotIds.front();
+      m_pendingSlotIds.pop();
+      m_pictureIndexToSlotId[pic_params->CurrPicIdx] = slot_id;
+  }
+
+  Conclusion:
+
+  You are right. NVDEC provides the CurrPicIdx ring buffer index, so it assumes no separate user_data is needed. But because we run our own
+  slot_id system, we need an intermediate step that maps NVDEC's CurrPicIdx to our slot_id.
+
+  The m_currentSubmittingSlotId atomic is the simplest, safest way to do that mapping. Since cuvidParseVideoData() is a synchronous call, there is
+  no race condition either.
+
+  I think removing ParseContext and reverting to m_currentSubmittingSlotId is the right move.
+
+> Was there a reason we could not merge CurrPicIdx and slot_id into one?
+
+● Excellent question! Thinking again about why they cannot be merged, there is actually no fundamental reason they cannot be.
+
+  Let me analyze why we currently use two indexes:
+
+● Search(pattern: "CurrPicIdx|slot_id|slot_idx", path: "vav2\platforms\windows\vavcore\src\Decoder\NVDECAV1Decoder.cpp", output_mode: "content", head_limit: 50)
+  ⎿  Found 50 lines
+
+● Search(pattern: "slot_idx = |DecodeSlot\[|m_ringBuffer\[", path: "vav2\platforms\windows\vavcore\src\Decoder\NVDECAV1Decoder.cpp", output_mode: "content", head_limit: 30)
+  ⎿  Found 6 lines
+
+● Excellent point! We can use CurrPicIdx directly.
+
+  The problem with the current structure:
+
+  // Current: needlessly complex
+  slot_id = m_slotIdCounter.fetch_add(1);  // 0, 1, 2, 3, ... (monotonic)
+  slot_idx = slot_id % 8;                  // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, ...
+
+  // What NVDEC provides:
+  CurrPicIdx = 0~7                         // already a ring buffer index!
+
+  The key realization:
+
+  NVDEC's CurrPicIdx (0-7) already serves as the ring buffer index!
+
+  Why did we invent slot_id?
+  - To guarantee FIFO order (m_returnIdCounter)
+  - To track submission order
+
+  However:
+  - NVDEC already manages ordering with its own ring buffer
+  - CurrPicIdx reflects NVDEC's internal allocation order
+
+  Proposed simplification:
+
+  // Option: use CurrPicIdx directly
+  DecodeSlot m_ringBuffer[8];  // same size as NVDEC's ring buffer
+
+  // DecodeToSurface:
+  // 1. Find a free slot (scan all slots)
+  for (int i = 0; i < 8; i++) {
+      if (!m_ringBuffer[i].in_use.load()) {
+          // Reserve the slot
+          m_ringBuffer[i].in_use = true;
+          m_ringBuffer[i].target_surface = target_surface;
+          break;
+      }
+  }
+
+  // 2. Call cuvidParseVideoData
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  // NVDEC tells us the CurrPicIdx
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot]./* set metadata */;
+
+  // HandlePictureDisplay:
+  int nvdec_slot = disp_info->picture_index;
+  m_ringBuffer[nvdec_slot].is_ready = true;
+
+  Problem:
+  DecodeToSurface cannot know in advance which slot will be used. NVDEC assigns CurrPicIdx only inside the HandlePictureDecode callback.
+
+  So a mapping is still needed:
+
+  // Ways to pass information from DecodeToSurface → HandlePictureDecode:
+
+  // Method 1: m_currentSubmittingSlotId (existing)
+  m_currentSubmittingSlotId.store(my_id);
+
+  // Method 2: stash target_surface temporarily
+  m_pendingTargetSurface = target_surface;
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  m_ringBuffer[nvdec_slot].target_surface = decoder->m_pendingTargetSurface;
+
+  Conclusion:
+
+  CurrPicIdx can be used directly as the ring buffer index, but the problem remains that DecodeToSurface cannot pre-allocate a slot: which
+  CurrPicIdx NVDEC will use is known only inside HandlePictureDecode.
+
+  The best simplification:
+
+  // Simplify to m_currentSubmittingContext (the full context instead of slot_id)
+  struct SubmittingContext {
+      void* target_surface;
+      VavCoreSurfaceType surface_type;
+  };
+
+  std::atomic<SubmittingContext*> m_currentSubmittingContext;
+
+  // DecodeToSurface:
+  SubmittingContext ctx{target_surface, target_type};
+  m_currentSubmittingContext.store(&ctx);
+  cuvidParseVideoData(...);
+
+  // HandlePictureDecode:
+  int nvdec_slot = pic_params->CurrPicIdx;
+  auto* ctx = decoder->m_currentSubmittingContext.load();
+  decoder->m_ringBuffer[nvdec_slot].target_surface = ctx->target_surface;
+  decoder->m_ringBuffer[nvdec_slot].surface_type = ctx->surface_type;
+  decoder->m_ringBuffer[nvdec_slot].in_use = true;
+
+  This removes slot_id, m_slotIdCounter, and the m_pictureIndexToSlotId mapping entirely, and uses NVDEC's CurrPicIdx directly!
+
+  Shall I refactor in this direction?
+
+> Good. So instead of us assigning the ring buffer slot number first, we use the CurrPicIdx provided by the NVDEC API as the ring buffer index. Agreed.
+  ⎿  Interrupted · What should Claude do instead?
+
+> Before starting the work, please reflect the improved design above in docs/working/NVDEC_RingBuffer_Decode_Design.md, so the design document
+won't be confusing when we read it again later.
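A minimal consolidated sketch of the flow the two turns above converge on — CurrPicIdx indexing the decode ring directly, with a small pending-submission ring carrying per-call context into the callback. The names here (`g_pending`, `PrepareSubmission`, `BindSlot`) are illustrative only, not the actual VavCore code:

```cpp
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>

constexpr int RING_BUFFER_SIZE = 8;   // must match ulNumOutputSurfaces

struct PendingSubmission {
    void*    target_surface = nullptr;
    uint64_t submission_id  = 0;
    std::atomic<bool> in_use{false};
};

struct DecodeSlot {
    void*    target_surface = nullptr;
    uint64_t submission_id  = 0;
    int      picture_index  = -1;     // CurrPicIdx; identical to the slot index
    std::atomic<bool> in_use{false};
    std::atomic<bool> is_ready{false};
};

PendingSubmission g_pending[RING_BUFFER_SIZE];
DecodeSlot        g_slots[RING_BUFFER_SIZE];
std::atomic<uint64_t> g_submissionCounter{0};

// Producer side (DecodeToSurface), before cuvidParseVideoData():
uint64_t PrepareSubmission(void* target_surface) {
    uint64_t id = g_submissionCounter.fetch_add(1);
    auto& p = g_pending[id % RING_BUFFER_SIZE];
    while (p.in_use.load()) {             // overflow: wait until the pending slot frees up
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    p.target_surface = target_surface;
    p.submission_id  = id;
    p.in_use.store(true);
    return id;
}

// Callback side (HandlePictureDecode), where NVDEC reveals the slot it chose:
void BindSlot(int curr_pic_idx, uint64_t submission_id) {
    auto& p = g_pending[submission_id % RING_BUFFER_SIZE];
    auto& s = g_slots[curr_pic_idx];      // CurrPicIdx indexes the ring directly
    s.target_surface = p.target_surface;
    s.submission_id  = p.submission_id;
    s.picture_index  = curr_pic_idx;
    s.is_ready.store(false);
    s.in_use.store(true);
    p.in_use.store(false);                // pending slot is free for reuse
}
```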
+
+● I'll update the NVDEC RingBuffer design document with the improved design:
+
diff --git a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
index 90d0015..ae836c7 100644
--- a/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
+++ b/vav2/docs/working/NVDEC_RingBuffer_Decode_Design.md
@@ -1,8 +1,8 @@
 # NVDEC RingBuffer-based Asynchronous Decoding Design
 
-**Date**: 2025-10-05 (Updated)
-**Status**: ✅ **Design Finalized - Ready for Implementation**
-**Target**: NVDECAV1Decoder DecodeToSurface() optimization with multi-frame packet support
+**Date**: 2025-10-05 (Updated - CurrPicIdx Simplification)
+**Status**: ✅ **Design Finalized - Ready for Implementation (Improved Architecture)**
+**Target**: NVDECAV1Decoder DecodeToSurface() optimization using NVDEC's native CurrPicIdx
 
 ---
 
@@ -10,12 +10,13 @@
 1. [Background & Problem Statement](#background--problem-statement)
 2. [Design Goals](#design-goals)
-3. [Architecture Overview](#architecture-overview)
-4. [Detailed Design](#detailed-design)
-5. [Implementation Plan](#implementation-plan)
-6. [Performance Analysis](#performance-analysis)
-7. [Risk Assessment](#risk-assessment)
-8. [References](#references)
+3. [NVDEC CurrPicIdx Design Philosophy](#nvdec-currpicidx-design-philosophy) ⭐ **NEW**
+4. [Architecture Overview](#architecture-overview)
+5. [Detailed Design](#detailed-design)
+6. [Implementation Plan](#implementation-plan)
+7. [Performance Analysis](#performance-analysis)
+8. [Risk Assessment](#risk-assessment)
+9. [References](#references)
 
 ---
 
@@ -149,6 +150,102 @@
 m_createInfo.ulNumOutputSurfaces = 8;  // 8 output buffers allocated
 
 ---
 
+## 🎯 NVDEC CurrPicIdx Design Philosophy
+
+### **Discovery: Why ParseContext Cannot Work**
+
+During implementation, we attempted to pass per-picture user data through NVDEC callbacks using a `ParseContext` approach:
+
+```cpp
+// ❌ Failed Approach: Trying to pass per-picture context
+struct ParseContext {
+    NVDECAV1Decoder* decoder;
+    uint64_t slot_id;
+};
+
+// In DecodeToSurface():
+ParseContext context{this, my_id};
+m_parserParams.pUserData = &context;  // Try to change user_data
+cuvidParseVideoData(m_parser, &packet);
+```
+
+**Why It Failed**:
+1. `cuvidCreateVideoParser()` makes an **internal copy** of `CUVIDPARSERPARAMS` at parser creation time
+2. Changing `m_parserParams.pUserData` after parser creation **does not affect** NVDEC's internal copy
+3. The NVDEC API provides **parser-level user_data only**, not picture-level user_data
+4. The `pUserData` field is set once at parser creation and used for all callbacks
+
+**Evidence from Testing**:
+```
+[DecodeToSurface] ParseContext created: magic=0xDEADBEEFCAFEBABE, decoder=000002E42CD9E4A0, slot_id=0
+[HandlePictureDecode] ParseContext: magic=0x00007FFF71168468, decoder=0000000000000000, slot_id=0
+[HandlePictureDecode] ERROR: Invalid ParseContext magic!
+```
+
+### **NVDEC's Native Ring Buffer Design**
+
+NVDEC is designed around a **fixed 8-slot ring buffer** architecture:
+
+```cpp
+m_createInfo.ulNumOutputSurfaces = 8;  // NVDEC allocates 8 internal surfaces
+```
+
+**NVDEC's Philosophy**:
+- Applications should use **`CurrPicIdx` (0-7)** as the direct ring buffer index
+- NVDEC manages surface allocation and provides the index via `pic_params->CurrPicIdx`
+- No custom slot_id allocation is needed - **NVDEC already provides the slot index**
+- Applications maintain metadata arrays indexed by `CurrPicIdx`
+
+**Data Flow**:
+```
+cuvidParseVideoData(packet)
+    ↓
+HandlePictureDecode(pic_params)
+    ↓
+CurrPicIdx = pic_params->CurrPicIdx  // 0-7 (NVDEC's ring buffer index)
+    ↓
+Use CurrPicIdx directly:
+    m_ringBuffer[CurrPicIdx].target_surface = ...
+    m_ringBuffer[CurrPicIdx].metadata = ...
+```
+
+### **Eliminated Complexity**
+
+**Previous Approach (Unnecessary)**:
+```cpp
+// ❌ Custom slot_id allocation
+std::atomic<uint64_t> m_slotIdCounter{0};
+uint64_t my_id = m_slotIdCounter.fetch_add(1);
+size_t slot_idx = my_id % 8;
+
+// ❌ Mapping CurrPicIdx → slot_id
+std::unordered_map<int, uint64_t> m_pictureIndexToSlotId;
+m_pictureIndexToSlotId[CurrPicIdx] = my_id;
+```
+
+**New Approach (Direct CurrPicIdx)**:
+```cpp
+// ✅ Use NVDEC's CurrPicIdx directly
+int nvdec_slot = pic_params->CurrPicIdx;  // 0-7
+m_ringBuffer[nvdec_slot].target_surface = ...;
+m_ringBuffer[nvdec_slot].is_ready = true;
+```
+
+### **Key Insight**
+
+**NVDEC provides CurrPicIdx for a reason**:
+- It's the canonical ring buffer index that NVDEC uses internally
+- Applications are expected to maintain side-channel metadata indexed by CurrPicIdx
+- No need for custom ID allocation or mapping tables
+- Simpler, more efficient, and aligned with NVDEC's design
+
+**What We Need Instead**:
+- Store submission context (target_surface, surface_type) in a **temporary variable**
+- Pass context to `HandlePictureDecode` via stack/member variable (not user_data)
+- Use `CurrPicIdx` directly as ring buffer index in callback
+
+---
+
 ## 🏗️ Architecture Overview
 
 ### High-Level Design
@@ -187,29 +284,34 @@
 ### Data Structures
 
-#### DecodeSlot Structure
+#### DecodeSlot Structure (Simplified)
 
 ```cpp
 struct DecodeSlot {
     // Slot state
-    bool in_use;                       // Is this slot allocated?
+    std::atomic<bool> in_use{false};   // Is this NVDEC slot currently decoding?
-    // Submitted information (from DecodeToSurface args)
+    // Submitted information (set by DecodeToSurface)
     void* target_surface;              // Destination D3D12 resource
     VavCoreSurfaceType surface_type;   // Surface type
+    uint64_t submission_id;            // For FIFO ordering (monotonic counter)
 
-    // NVDEC information (from HandlePictureDecode callback)
-    // ⚠️ Multi-frame support: One packet can decode to multiple frames
-    std::vector<int> picture_indices;  // All NVDEC frame indices from this packet
+    // NVDEC information (set by HandlePictureDecode callback)
+    int picture_index;                 // CurrPicIdx from NVDEC (same as slot index)
 
     // Synchronization primitives
-    std::condition_variable frame_ready;  // Signaled when ALL frames are decoded
+    std::condition_variable frame_ready;  // Signaled when decode complete
     std::mutex slot_mutex;             // Protects this slot's state
-    bool is_ready;                     // All frames decoded flag
+    std::atomic<bool> is_ready{false}; // Decode completed flag
 };
 ```
 
-#### RingBuffer Members
+**Key Simplifications**:
+- ✅ No `std::vector<int> picture_indices` - single picture per slot
+- ✅ `picture_index == slot_index` (CurrPicIdx is the slot index)
+- ✅ No mapping tables needed
+
+#### RingBuffer Members (Simplified)
 
 ```cpp
 class NVDECAV1Decoder {
@@ -218,9 +320,19 @@ private:
     DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
 
-    // 🎯 Option C: Unified slot allocation counter (no mapping needed!)
-    std::atomic<uint64_t> m_slotIdCounter{0};   // Monotonically increasing slot ID
-    std::atomic<uint64_t> m_returnIdCounter{0}; // Return order enforcement (FIFO)
+    // 🎯 Simplified: Only submission ID for FIFO ordering
+    std::atomic<uint64_t> m_submissionCounter{0};  // Monotonic submission ID
+    std::atomic<uint64_t> m_returnCounter{0};      // Return order enforcement (FIFO)
+
+    // ⚠️ CRITICAL FIX: Ring buffer for pending submissions (multi-thread safe)
+    struct PendingSubmission {
+        void* target_surface;
+        VavCoreSurfaceType surface_type;
+        uint64_t submission_id;
+        std::atomic<bool> in_use{false};
+    };
+    PendingSubmission m_pendingSubmissions[RING_BUFFER_SIZE];  // Ring buffer for contexts
+    std::mutex m_submissionMutex;  // Protects pending submission allocation
 
     // Polling thread
     std::thread m_pollingThread;
@@ -228,466 +340,569 @@
 };
 ```
 
+**Key Fix**:
+- ✅ `m_pendingSubmissions[8]` ring buffer (not single struct!)
+- ✅ Each slot has its own pending context
+- ✅ `submission_id % 8` determines which pending slot to use
+- ✅ Thread-safe: Different threads use different pending slots
+
+**Eliminated Variables**:
+- ❌ `m_slotIdCounter` (replaced with `m_submissionCounter`)
+- ❌ `m_pictureIndexToSlotId` mapping (not needed!)
+- ❌ `m_currentSubmittingSlotId` atomic (use pending submission ring buffer)
+
 ---
 
 ## 🔧 Detailed Design
 
-### Component 1: Slot Allocation (Producer)
+### Component 1: Submission Preparation (Producer)
 
-**Purpose**: Assign RingBuffer slot to each DecodeToSurface call using Option C design
+**Purpose**: Prepare submission context before calling cuvidParseVideoData
 
 ```cpp
 // In DecodeToSurface()
 
-// 🎯 Option C: Allocate unique ID (serves as both slot ID and submission order)
-uint64_t my_id = m_slotIdCounter.fetch_add(1);
-size_t slot_idx = my_id % RING_BUFFER_SIZE;
-DecodeSlot& my_slot = m_ringBuffer[slot_idx];
+// 1. Allocate submission ID for FIFO ordering
+uint64_t my_submission_id = m_submissionCounter.fetch_add(1);
+size_t pending_idx = my_submission_id % RING_BUFFER_SIZE;
 
-// 2. Check for overflow
-{
-    std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-
-    if (my_slot.in_use) {
-        // RingBuffer full! Too many concurrent decodes
-        LogError("RingBuffer overflow - max 8 concurrent decodes");
-        return false;
-    }
-
-    // 3. Initialize slot
-    my_slot.in_use = true;
-    my_slot.target_surface = target_surface;
-    my_slot.surface_type = target_type;
-    my_slot.picture_indices.clear();  // Multi-frame support: clear previous frames
-    my_slot.is_ready = false;
 }
+
+// 2. Wait if pending slot is still in use (overflow protection)
+while (m_pendingSubmissions[pending_idx].in_use.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+}
+
+// 3. Store submission context in ring buffer slot
+{
+    std::lock_guard<std::mutex> lock(m_submissionMutex);
+    auto& pending = m_pendingSubmissions[pending_idx];
+
+    pending.target_surface = target_surface;
+    pending.surface_type = target_type;
+    pending.submission_id = my_submission_id;
+    pending.in_use.store(true);
+}
+
+LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
+           my_submission_id, pending_idx);
 ```
 
-**🎯 Option C: No Mapping Needed - Direct Calculation**:
-```
-Thread 1: m_slotIdCounter.fetch_add(1) → ID=0, slot_idx = 0 % 8 = slot[0]
-Thread 2: m_slotIdCounter.fetch_add(1) → ID=1, slot_idx = 1 % 8 = slot[1]
-Thread 3: m_slotIdCounter.fetch_add(1) → ID=2, slot_idx = 2 % 8 = slot[2]
-...
-Thread 9: m_slotIdCounter.fetch_add(1) → ID=8, slot_idx = 8 % 8 = slot[0] (wrap around)
-```
-
-**Key Advantage**: ID → slot_idx calculation is **deterministic**, no map storage needed!
-
-**Overflow Protection**: If `slot[0].in_use == true` when Thread 9 arrives → error
+**Key Design Points**:
+- ✅ **Ring buffer for pending contexts** - Multi-thread safe, no overwrite
+- ✅ **pending_idx = submission_id % 8** - Deterministic slot allocation
+- ✅ **Overflow protection** - Wait if pending slot still in use
+- ✅ **No slot allocation yet** - NVDEC will provide CurrPicIdx in callback
+- ✅ **Thread-safe** - Each thread uses different pending slot
 
 ---
 
 ### Component 2: Packet Submission
 
-**Purpose**: Submit packet to NVDEC with slot ID tracking via timestamp
+**Purpose**: Submit packet to NVDEC parser (cuvidParseVideoData is synchronous)
 
 ```cpp
-// 4. Submit packet to NVDEC parser
+// 3. Submit packet to NVDEC parser
 CUVIDSOURCEDATAPACKET packet = {};
 packet.payload = packet_data;
 packet.payload_size = packet_size;
 packet.flags = CUVID_PKT_ENDOFPICTURE;
-packet.timestamp = static_cast<long long>(my_id);  // 🎯 Pass full slot_id (NOT modulo!)
+packet.timestamp = 0;  // Not used - we use m_pendingSubmissions instead
 
 CUresult result = cuvidParseVideoData(m_parser, &packet);
+// cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return
+
 if (result != CUDA_SUCCESS) {
-    my_slot.in_use = false;  // Release slot on error
+    LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed");
     return false;
 }
+
+LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
 ```
 
-**🎯 Option C Timestamp Flow** (multi-frame packet support):
+**Simplified Flow**:
 ```
-cuvidParseVideoData(packet, timestamp=my_id=17)
+cuvidParseVideoData(packet)
+    ↓ (synchronous callback)
+HandlePictureDecode(pic_params)
     ↓
-HandleVideoSequence() (first time only)
+CurrPicIdx = pic_params->CurrPicIdx  // NVDEC provides slot index (0-7)
     ↓
-HandlePictureDecode(timestamp=17, CurrPicIdx=0) → slot_idx = 17 % 8 = 1
-  → m_ringBuffer[1].picture_indices.push_back(0)
+pending_idx = submission_id % 8
     ↓
-HandlePictureDecode(timestamp=17, CurrPicIdx=1) → slot_idx = 17 % 8 = 1
-  → m_ringBuffer[1].picture_indices.push_back(1)  // Same packet, multiple frames!
+Copy m_pendingSubmissions[pending_idx] → m_ringBuffer[CurrPicIdx] ↓ -HandlePictureDecode(timestamp=17, CurrPicIdx=2) → slot_idx = 17 % 8 = 1 - → m_ringBuffer[1].picture_indices.push_back(2) +Release m_pendingSubmissions[pending_idx].in_use = false ↓ -PollingThread checks ALL picture_indices for slot[1] +Return from HandlePictureDecode ↓ -When all complete: slot[1].is_ready = true, notify thread +Return from cuvidParseVideoData ``` -**Key Point**: Timestamp carries **full slot_id**, HandlePictureDecode calculates slot_idx directly +**Key Points**: +- ✅ **cuvidParseVideoData is synchronous** - callbacks complete before return +- ✅ **CurrPicIdx is the slot index** - no calculation needed +- ✅ **pending_idx = submission_id % 8** - find correct pending context +- ✅ **Ring buffer prevents overwrites** - multi-thread safe +- ✅ **Release pending slot after copy** - allow reuse for next submission --- ### Component 3: Polling Thread (Background Status Checker) -**Purpose**: Continuously poll `m_returnIdCounter` slot for decode completion (multi-frame support) +**Purpose**: Continuously poll pending slots for decode completion ```cpp void NVDECAV1Decoder::PollingThreadFunc() { while (m_pollingRunning) { - // 1. Get current return ID and calculate slot index - uint64_t current_return_id = m_returnIdCounter.load(); - size_t slot_idx = current_return_id % RING_BUFFER_SIZE; - DecodeSlot& slot = m_ringBuffer[slot_idx]; + // 1. Check all slots for pending decodes + for (int slot_idx = 0; slot_idx < RING_BUFFER_SIZE; ++slot_idx) { + DecodeSlot& slot = m_ringBuffer[slot_idx]; - // 2. Check if slot is in use and not yet ready - if (slot.in_use && !slot.is_ready) { - - // 3. Get copy of picture indices (multi-frame support) - std::vector picture_indices_copy; - { - std::lock_guard lock(slot.slot_mutex); - picture_indices_copy = slot.picture_indices; + // 2. Skip slots not in use or already ready + if (!slot.in_use.load() || slot.is_ready.load()) { + continue; } - // 4. Check if ALL frames are decoded - bool all_complete = true; - for (int pic_idx : picture_indices_copy) { - CUVIDGETDECODESTATUS decodeStatus = {}; - CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus); + // 3. Query NVDEC decode status for this slot's picture + CUVIDGETDECODESTATUS decodeStatus = {}; + CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus); - if (result != CUDA_SUCCESS || - decodeStatus.decodeStatus != cuvidDecodeStatus_Success) { - all_complete = false; - break; - } - } + // 4. If decode complete, mark slot ready + if (result == CUDA_SUCCESS && + decodeStatus.decodeStatus == cuvidDecodeStatus_Success) { - // 5. If all frames complete, signal ready - if (all_complete && !picture_indices_copy.empty()) { { std::lock_guard lock(slot.slot_mutex); - slot.is_ready = true; + slot.is_ready.store(true); } slot.frame_ready.notify_one(); + + LOGF_DEBUG("[PollingThread] Slot %d ready (submission_id=%llu)", + slot_idx, slot.submission_id); } } - // 6. Sleep to avoid busy-wait + // 5. 
Sleep to avoid busy-wait std::this_thread::sleep_for(std::chrono::microseconds(100)); } } ``` **Key Points**: -- ✅ Only polls `m_returnIdCounter` slot (not all 8 slots) → efficient -- ✅ **Multi-frame support**: Checks ALL picture_indices for completion +- ✅ **Simple slot iteration** - check all 8 slots (minimal overhead) +- ✅ **Single picture per slot** - no vector iteration needed +- ✅ **CurrPicIdx == slot_idx** - direct correspondence - ✅ Uses `cuvidGetDecodeStatus()` non-blocking query - ✅ 100us sleep → ~10,000 checks/second (low CPU usage) -- ✅ Thread-safe picture_indices copy to avoid lock contention --- -### Component 4: Sequential Return Wait (FIFO Guarantee) +### Component 4: Wait for Decode and Retrieve Slot -**Purpose**: Enforce FIFO order even when decodes complete out-of-order using Option C +**Purpose**: Wait for NVDEC to complete decoding and find which slot was used ```cpp -// In DecodeToSurface() - PHASE 2 +// In DecodeToSurface() - After cuvidParseVideoData returns -// 5. Wait for my turn (FIFO order enforcement) -while (m_returnIdCounter.load() != my_id) { +// 4. Find which slot NVDEC used (check all slots for our submission_id) +int my_slot_idx = -1; +for (int i = 0; i < RING_BUFFER_SIZE; ++i) { + if (m_ringBuffer[i].submission_id == my_submission_id) { + my_slot_idx = i; + break; + } +} + +if (my_slot_idx == -1) { + LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id); + return false; +} + +DecodeSlot& my_slot = m_ringBuffer[my_slot_idx]; +LOGF_DEBUG("[DecodeToSurface] Found slot_idx=%d for submission_id=%llu", my_slot_idx, my_submission_id); + +// 5. Wait for my turn in FIFO order +while (m_returnCounter.load() != my_submission_id) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } -// Now I'm first in line! +LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id); // 6. Wait for decode to complete { std::unique_lock lock(my_slot.slot_mutex); if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500), - [&my_slot]() { return my_slot.is_ready; })) { + [&my_slot]() { return my_slot.is_ready.load(); })) { // Timeout - decode took too long - LogError("Decode timeout"); - my_slot.in_use = false; - m_returnIdCounter.fetch_add(1); // Skip this slot to avoid deadlock + LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx); + my_slot.in_use.store(false); + m_returnCounter.fetch_add(1); // Skip to avoid deadlock return false; } } + +LOGF_DEBUG("[DecodeToSurface] Decode complete for slot %d", my_slot_idx); ``` -**🎯 Option C Timeline Example** (using slot ID, not slot index): +**Simplified Timeline Example**: ``` -Thread 1 (ID=17, slot 1): Wait for returnIdCounter==17 ✅ (immediate) - Wait for is_ready... +Thread 1 (submission_id=0, CurrPicIdx=3): Wait for returnCounter==0 ✅ + Wait for slot[3].is_ready... 
-Thread 2 (ID=18, slot 2): Wait for returnIdCounter==18 ⏸️ (blocked) +Thread 2 (submission_id=1, CurrPicIdx=5): Wait for returnCounter==1 ⏸️ (blocked) -Thread 3 (ID=19, slot 3): Wait for returnIdCounter==19 ⏸️ (blocked) +GPU: slot[5] completes first @ t=3ms + → slot[5].is_ready = true + → Thread 2 still blocked (returnCounter=0) -GPU: ID=18 completes first @ t=3ms - → slot[2].is_ready = true - → Thread 2 still blocked (returnIdCounter=17) - -GPU: ID=17 completes @ t=15ms - → slot[1].is_ready = true +GPU: slot[3] completes @ t=15ms + → slot[3].is_ready = true → Thread 1 wakes up ✅ - → Thread 1 processes → returnIdCounter = 18 + → Thread 1 processes → returnCounter = 1 → Thread 2 now unblocked ✅ ``` -**Key Point**: Wait on **slot_id** (my_id), not slot_idx, for correct FIFO ordering +**Key Points**: +- ✅ **Find slot by submission_id** - simple linear search +- ✅ **FIFO using submission_id** - not slot index +- ✅ **CurrPicIdx can be any value** - NVDEC chooses which slot to use --- -### Component 5: Frame Retrieval & Cleanup (Multi-Frame Support) +### Component 5: Frame Retrieval & Cleanup **Purpose**: Map decoded frame, copy to surface, release slot ```cpp -// In DecodeToSurface() - PHASE 3 +// In DecodeToSurface() - After decode complete -// 7. Get first frame from multi-frame packet -int frameIdx = my_slot.picture_indices[0]; // Return first frame only - -// 8. Map decoded frame from NVDEC +// 7. Map decoded frame from NVDEC using the slot's picture_index CUVIDPROCPARAMS procParams = {}; procParams.progressive_frame = 1; CUdeviceptr srcDevicePtr = 0; unsigned int srcPitch = 0; -CUresult result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams); +int pic_idx = my_slot.picture_index; // CurrPicIdx from NVDEC +CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &srcDevicePtr, &srcPitch, &procParams); if (result != CUDA_SUCCESS) { - my_slot.in_use = false; - m_returnIdCounter.fetch_add(1); + LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx); + my_slot.in_use.store(false); + m_returnCounter.fetch_add(1); return false; } -// 9. Copy to D3D12 surface -ID3D12Resource* d3d12Resource = static_cast(target_surface); +// 8. Copy to D3D12 surface (use target_surface from slot) +ID3D12Resource* d3d12Resource = static_cast(my_slot.target_surface); bool copySuccess = m_d3d12Handler->CopyNV12Frame( srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height ); -// 10. Unmap frame +// 9. Unmap frame cuvidUnmapVideoFrame(m_decoder, srcDevicePtr); -// 11. Release slot +// 10. Release slot { std::lock_guard lock(my_slot.slot_mutex); - my_slot.in_use = false; + my_slot.in_use.store(false); } -// 12. Advance return ID counter (FIFO order) -m_returnIdCounter.fetch_add(1); +LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx); + +// 11. 
Advance return counter (FIFO order) +m_returnCounter.fetch_add(1); return copySuccess; ``` -**Multi-Frame Decision**: Return **first frame only** from multi-frame packet (indices[0]) +**Key Points**: +- ✅ **Single picture per slot** - no multi-frame complexity +- ✅ **picture_index == CurrPicIdx** - direct NVDEC slot index +- ✅ **target_surface from slot** - stored during HandlePictureDecode --- -### Component 6: NVDEC Callback Integration (Option C) +### Component 6: NVDEC Callback Integration (Simplified) -**Purpose**: Link NVDEC picture_index to RingBuffer slot using direct slot_id calculation +**Purpose**: Use NVDEC's CurrPicIdx directly as ring buffer slot index ```cpp int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) { auto* decoder = static_cast(user_data); - // 🎯 Option C: Direct slot_id → slot_idx calculation (no map lookup!) - uint64_t slot_id = static_cast(pic_params->nTimeStamp); - size_t slot_idx = slot_id % RING_BUFFER_SIZE; + // 🎯 Use NVDEC's CurrPicIdx directly as slot index + int slot_idx = pic_params->CurrPicIdx; // 0-7 (NVDEC's ring buffer index) + + LOGF_DEBUG("[HandlePictureDecode] CurrPicIdx=%d, IntraPicFlag=%d", + slot_idx, pic_params->intra_pic_flag); + + // Validate slot index (should always be 0-7) + if (slot_idx < 0 || slot_idx >= RING_BUFFER_SIZE) { + LOGF_ERROR("[HandlePictureDecode] Invalid CurrPicIdx=%d", slot_idx); + return 0; + } DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; + // Find pending submission context using submission_id + uint64_t submission_id; + size_t pending_idx; + { + std::lock_guard lock(decoder->m_submissionMutex); + + // Find the most recent pending submission + // (In practice, we use the last allocated submission_id) + submission_id = decoder->m_submissionCounter.load() - 1; + pending_idx = submission_id % RING_BUFFER_SIZE; + + auto& pending = decoder->m_pendingSubmissions[pending_idx]; + + // Copy pending submission context to decode slot + slot.target_surface = pending.target_surface; + slot.surface_type = pending.surface_type; + slot.submission_id = pending.submission_id; + + // Release pending slot for reuse + pending.in_use.store(false); + } + + slot.picture_index = slot_idx; // Store CurrPicIdx (same as slot_idx) + slot.in_use.store(true); + slot.is_ready.store(false); + + LOGF_INFO("[HandlePictureDecode] Calling cuvidDecodePicture (slot=%d, submission_id=%llu, pending_idx=%zu)", + slot_idx, slot.submission_id, pending_idx); + // Submit frame to NVDEC decoder CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params); if (result != CUDA_SUCCESS) { - decoder->LogCUDAError(result, "cuvidDecodePicture failed"); + LOGF_ERROR("[HandlePictureDecode] cuvidDecodePicture failed: %d", result); + slot.in_use.store(false); return 0; } - // Store picture_index for polling (multi-frame support) - { - std::lock_guard lock(slot.slot_mutex); - slot.picture_indices.push_back(pic_params->CurrPicIdx); - } - - // Polling thread will check cuvidGetDecodeStatus() for ALL picture_indices + // Polling thread will check cuvidGetDecodeStatus() for this picture_index return 1; } ``` -**🎯 Option C Key Advantages**: -- ✅ **No mapping overhead**: Direct modulo calculation, no unordered_map lookup -- ✅ **No mutex contention**: No global map mutex needed -- ✅ **Multi-frame support**: Automatically handles multiple frames per packet -- ✅ **Deterministic**: Same slot_id always maps to same slot_idx +**Key Advantages of Simplified Design**: +- ✅ **Direct CurrPicIdx usage** - no calculation, no mapping +- ✅ 
**Ring buffer for pending contexts** - multi-thread safe, no overwrites +- ✅ **pending_idx = submission_id % 8** - deterministic context lookup +- ✅ **Release after copy** - pending slots can be reused immediately +- ✅ **Single picture per slot** - no multi-frame vector complexity +- ✅ **Aligned with NVDEC design** - use the API as intended --- -## 📐 Implementation Plan (Updated for Option C) +## 🎯 Simplified Design Summary -### Phase 1: Data Structure Setup ✅ +### **What Changed from Original Design** + +**Eliminated Complexity**: +1. ❌ Custom `slot_id` allocation system (`m_slotIdCounter`) +2. ❌ `m_pictureIndexToSlotId` mapping table +3. ❌ `std::vector picture_indices` (multi-frame support) +4. ❌ ParseContext struct and user_data manipulation +5. ❌ Timestamp tricks to pass slot_id through callbacks + +**New Simplified Approach**: +1. ✅ Use NVDEC's `CurrPicIdx` (0-7) directly as ring buffer index +2. ✅ Pass submission context via `m_pendingSubmission` member variable +3. ✅ Single picture per slot (one packet → one frame) +4. ✅ `submission_id` only for FIFO ordering, not slot allocation + +### **Data Flow Comparison** + +**Old (Complex)**: +``` +DecodeToSurface: + my_id = m_slotIdCounter++ + slot_idx = my_id % 8 + packet.timestamp = my_id + +HandlePictureDecode: + slot_id = pic_params->nTimeStamp + slot_idx = slot_id % 8 + m_pictureIndexToSlotId[CurrPicIdx] = slot_id + slot.picture_indices.push_back(CurrPicIdx) + +PollingThread: + Check ALL picture_indices in slot +``` + +**New (Simplified)**: +``` +DecodeToSurface: + submission_id = m_submissionCounter++ + pending_idx = submission_id % 8 + m_pendingSubmissions[pending_idx] = {target_surface, surface_type, submission_id} + +HandlePictureDecode: + slot_idx = pic_params->CurrPicIdx // 0-7 directly! + pending_idx = submission_id % 8 + slot = m_ringBuffer[slot_idx] + slot.copy(m_pendingSubmissions[pending_idx]) + m_pendingSubmissions[pending_idx].in_use = false // Release for reuse + slot.picture_index = slot_idx + +PollingThread: + Check slot.picture_index (single value) +``` + +### **Why This is Better** + +1. **Aligned with NVDEC's Design**: NVDEC provides CurrPicIdx as the canonical slot index +2. **Multi-thread Safe**: Ring buffer for pending contexts prevents overwrites +3. **Less Code**: Fewer data structures, less synchronization overhead +4. **No Mapping Tables**: Direct indexing, no hash map lookups +5. **Simpler Debugging**: CurrPicIdx == slot_idx == picture_index +6. **Easier to Understand**: Follow NVDEC's intended usage pattern +7. **Deterministic Context Lookup**: pending_idx = submission_id % 8 + +--- + +## 📐 Implementation Plan (Updated for Simplified Design) + +### Phase 1: Data Structure Setup **Files to Modify**: -- `NVDECAV1Decoder.h` - Add RingBuffer members with Option C design +- `NVDECAV1Decoder.h` - Add simplified RingBuffer members - `NVDECAV1Decoder.cpp` - Initialize RingBuffer in constructor **Tasks**: -- [x] Define `DecodeSlot` structure with `std::vector picture_indices` -- [x] Add `m_ringBuffer[8]` array -- [x] **🎯 Option C**: Add `m_slotIdCounter`, `m_returnIdCounter` atomic counters (NOT submitIndex/returnIndex) -- [x] Add `m_pollingThread`, `m_pollingRunning` members +- [ ] Define simplified `DecodeSlot` structure (no picture_indices vector) +- [ ] Add `m_ringBuffer[8]` array +- [ ] Add `m_submissionCounter`, `m_returnCounter` atomic counters +- [ ] **⚠️ CRITICAL**: Add `PendingSubmission` struct with `in_use` flag +- [ ] **⚠️ CRITICAL**: Add `m_pendingSubmissions[8]` ring buffer array (NOT single struct!) 
+- [ ] Add `m_submissionMutex` for pending submission allocation +- [ ] Add `m_pollingThread`, `m_pollingRunning` members +- [ ] Remove old variables: `m_currentSubmittingSlotId`, any mapping tables **Estimated Time**: 30 minutes --- -### Phase 2: Polling Thread Implementation (Multi-Frame Support) +### Phase 2: Polling Thread Implementation (Simplified) **Files to Modify**: - `NVDECAV1Decoder.cpp` - Implement `PollingThreadFunc()` **Tasks**: -- [ ] Implement polling loop with `cuvidGetDecodeStatus()` checking ALL picture_indices -- [ ] Poll slot at `m_returnIdCounter % RING_BUFFER_SIZE` (NOT all slots) +- [ ] Implement simple polling loop iterating all 8 slots +- [ ] Check `slot.in_use` and `!slot.is_ready` to skip idle slots +- [ ] Use `cuvidGetDecodeStatus()` for single `slot.picture_index` - [ ] Add thread start in `Initialize()` - [ ] Add thread stop in `Cleanup()` -- [ ] Add debug logging for multi-frame packet events +- [ ] Add debug logging for slot readiness **Testing**: - Verify thread starts/stops correctly -- Verify multi-frame packet handling (all frames checked) +- Verify single picture per slot handling - Verify `cuvidGetDecodeStatus()` calls work -**Estimated Time**: 1 hour +**Estimated Time**: 45 minutes --- -### Phase 3: DecodeToSurface Refactoring (Option C Implementation) +### Phase 3: DecodeToSurface Refactoring (Simplified Implementation) **Files to Modify**: -- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with Option C design +- `NVDECAV1Decoder.cpp` - Rewrite `DecodeToSurface()` with simplified design **Tasks**: -- [ ] **🎯 Phase 1**: Slot allocation using `my_id = m_slotIdCounter.fetch_add(1)` -- [ ] **🎯 Phase 2**: Packet submission with `packet.timestamp = my_id` (full slot_id, NOT modulo!) -- [ ] **🎯 Phase 3**: FIFO wait using `while(m_returnIdCounter != my_id)` (NOT slot_idx!) -- [ ] **🎯 Phase 4**: Frame retrieval from `picture_indices[0]` (first frame only) -- [ ] **🎯 Phase 5**: Cleanup and `m_returnIdCounter.fetch_add(1)` +- [ ] **Phase 1**: Allocate `submission_id` from `m_submissionCounter` +- [ ] **Phase 2**: Calculate `pending_idx = submission_id % 8` +- [ ] **Phase 3**: Wait if `m_pendingSubmissions[pending_idx].in_use` (overflow protection) +- [ ] **Phase 4**: Store submission context in `m_pendingSubmissions[pending_idx]` +- [ ] **Phase 5**: Call `cuvidParseVideoData()` (synchronous, callback completes) +- [ ] **Phase 6**: Find slot by searching for `submission_id` in ring buffer +- [ ] **Phase 7**: Wait for FIFO turn using `m_returnCounter` +- [ ] **Phase 8**: Wait for `slot.is_ready` condition variable +- [ ] **Phase 9**: Map/copy frame using `slot.picture_index` +- [ ] **Phase 10**: Release slot and increment `m_returnCounter` - [ ] Error handling for all failure paths **Testing**: - Single-threaded decode test - Multi-threaded decode test (2-3 threads) -- Verify packet-surface mapping correctness with multi-frame packets +- **⚠️ Multi-thread race test**: Verify no pending context overwrites +- Verify CurrPicIdx is used correctly as slot index - Verify FIFO ordering with out-of-order completion -**Estimated Time**: 2 hours +**Estimated Time**: 1.5 hours --- -### Phase 4: HandlePictureDecode Update (Option C) +### Phase 4: HandlePictureDecode Update (Simplified) **Files to Modify**: - `NVDECAV1Decoder.cpp` - Modify `HandlePictureDecode()` callback **Tasks**: -- [ ] **🎯 Option C**: Extract `slot_id` from `pic_params->nTimeStamp` -- [ ] **🎯 Option C**: Calculate `slot_idx = slot_id % RING_BUFFER_SIZE` (NO map lookup!) 
-- [ ] **🎯 Multi-frame**: Use `slot.picture_indices.push_back(CurrPicIdx)` (NOT single index!) -- [ ] Add debug logging for multi-frame packets +- [ ] **Direct CurrPicIdx**: Use `slot_idx = pic_params->CurrPicIdx` (0-7) +- [ ] **Find pending context**: Calculate `pending_idx = submission_id % 8` +- [ ] **Copy submission context**: Copy `m_pendingSubmissions[pending_idx]` to `slot` +- [ ] **Release pending slot**: Set `m_pendingSubmissions[pending_idx].in_use = false` +- [ ] **Store picture_index**: Set `slot.picture_index = slot_idx` +- [ ] **Mark slot active**: `slot.in_use = true`, `slot.is_ready = false` +- [ ] Add debug logging for slot assignment and pending release **Testing**: -- Verify timestamp → slot_idx direct calculation works -- Verify picture_indices vector correctly stores multiple frames -- Test with video that has multi-frame packets (test_720p_stripe.webm) +- Verify CurrPicIdx maps correctly to slot index +- **⚠️ Critical**: Verify pending_idx calculation is correct +- Verify submission context is copied correctly +- Verify pending slot is released immediately after copy +- Test with both simple and complex videos -**Estimated Time**: 30 minutes +**Estimated Time**: 25 minutes --- -### Phase 5: Integration Testing (Option C Validation) +### Phase 5: Integration Testing (Simplified Design Validation) **Test Scenarios**: -1. **Single packet decode** - Verify Option C basic functionality -2. **Multi-frame packet** - Verify vector-based picture_indices handling (test_720p_stripe.webm) -3. **Sequential 3 packets** - Verify FIFO order using m_returnIdCounter -4. **Out-of-order completion** - Verify slot_id → slot_idx mapping (I-frame after P-frame) -5. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) -6. **Decode errors** - Verify graceful failure with multi-frame packets -7. **Performance benchmark** - Measure latency reduction vs old queue-based approach +1. **Single packet decode** - Verify CurrPicIdx direct usage +2. **Sequential 3 packets** - Verify FIFO order using m_returnCounter +3. **Out-of-order completion** - Verify submission_id FIFO (I-frame after P-frame) +4. **RingBuffer overflow** - Verify error handling (9+ concurrent calls) +5. **Decode errors** - Verify graceful failure and slot release +6. 
**Performance benchmark** - Measure latency reduction vs old queue-based approach **Test Files**: - Simple test video (simple_test.webm) - basic validation -- **Multi-frame packet video (test_720p_stripe.webm)** - critical multi-frame test ⚠️ +- Complex video (test_720p_stripe.webm) - stress testing **Validation Criteria**: -- ✅ No slot ID → slot_idx mapping errors -- ✅ All frames from multi-frame packets detected and polled +- ✅ CurrPicIdx used correctly as slot index (no calculation errors) +- ✅ submission_id correctly tracks FIFO order - ✅ FIFO order maintained even with out-of-order GPU completion - ✅ No memory corruption or race conditions +- ✅ No mapping table lookups (simpler code path) -**Estimated Time**: 2 hours +**Estimated Time**: 1.5 hours --- ### Phase 6: Documentation & Cleanup **Tasks**: -- [x] Update NVDEC design documentation with Option C and multi-frame support -- [ ] Add inline code comments explaining Option C design choices -- [ ] Remove old queue-based code and any ID→Index mapping attempts +- [x] Update NVDEC design documentation with simplified CurrPicIdx design +- [ ] Add inline code comments explaining CurrPicIdx direct usage +- [ ] Remove old queue-based code and any mapping table attempts +- [ ] Remove ParseContext struct and related code - [ ] Move design doc to `docs/completed/` after successful implementation -- [ ] Document multi-frame packet behavior and first-frame-only decision +- [ ] Document NVDEC's ring buffer philosophy for future reference -**Estimated Time**: 1 hour +**Estimated Time**: 45 minutes --- -## 🎯 Option C Design Summary +**Total Estimated Time**: ~5 hours (reduced from ~7 hours with previous design) -### **Core Principle**: Eliminate mapping overhead through deterministic calculation - -**Key Components**: -1. **Single Counter for Dual Purpose**: `m_slotIdCounter` serves as both unique ID and submission order -2. **Direct Slot Calculation**: `slot_idx = slot_id % RING_BUFFER_SIZE` (no map needed) -3. **FIFO via ID Comparison**: `while(m_returnIdCounter != my_id)` ensures ordering -4. 
**Multi-Frame Vector**: `std::vector picture_indices` handles packets with multiple frames - -**Data Flow**: -``` -DecodeToSurface: - my_id = m_slotIdCounter++ (e.g., 17) - slot_idx = 17 % 8 = 1 - packet.timestamp = 17 - -HandlePictureDecode: - slot_id = pic_params->nTimeStamp (17) - slot_idx = 17 % 8 = 1 - m_ringBuffer[1].picture_indices.push_back(CurrPicIdx) - -PollingThread: - return_id = m_returnIdCounter (17) - slot_idx = 17 % 8 = 1 - Check all m_ringBuffer[1].picture_indices[] - -DecodeToSurface (wait): - while(m_returnIdCounter != 17) { wait } - Process frame from picture_indices[0] - m_returnIdCounter++ (18) -``` - -**Eliminated Complexity**: -- ❌ No `std::unordered_map` mapping -- ❌ No global map mutex -- ❌ No map insert/erase operations -- ❌ No lookup failures or stale entries - -**Multi-Frame Packet Handling**: -- One packet → multiple HandlePictureDecode calls -- All frames stored in `picture_indices` vector -- PollingThread checks ALL frames complete -- Return first frame only (`picture_indices[0]`) --- @@ -907,6 +1122,16 @@ void PollingThreadFunc() { | 2025-10-04 | Claude | Added RingBuffer architecture details | | 2025-10-04 | Claude | Added polling thread design | | 2025-10-04 | Claude | Added performance analysis | +| 2025-10-05 | Claude | **MAJOR UPDATE**: Simplified design to use NVDEC's CurrPicIdx directly | +| 2025-10-05 | Claude | Removed ParseContext approach (failed due to NVDEC parameter copying) | +| 2025-10-05 | Claude | Removed custom slot_id allocation and mapping tables | +| 2025-10-05 | Claude | Added "NVDEC CurrPicIdx Design Philosophy" section | +| 2025-10-05 | Claude | Updated all components to use simplified architecture | +| 2025-10-05 | Claude | Reduced implementation time estimate from 7h to 5h | +| 2025-10-05 | Claude | **CRITICAL FIX**: Changed m_pendingSubmission to ring buffer array | +| 2025-10-05 | Claude | Added PendingSubmission[8] to prevent multi-thread overwrites | +| 2025-10-05 | Claude | Updated all components to use pending_idx = submission_id % 8 | +| 2025-10-05 | Claude | Added overflow protection and pending slot release logic | --- diff --git a/vav2/platforms/windows/vavcore/VavCore.vcxproj b/vav2/platforms/windows/vavcore/VavCore.vcxproj index 7670e2b..48efdcd 100644 --- a/vav2/platforms/windows/vavcore/VavCore.vcxproj +++ b/vav2/platforms/windows/vavcore/VavCore.vcxproj @@ -118,6 +118,7 @@ + @@ -141,6 +142,7 @@ + diff --git a/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp new file mode 100644 index 0000000..ae43f6b --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.cpp @@ -0,0 +1,121 @@ +#include "pch.h" +#include "VavCoreLogger.h" + +#include +#include + +#ifdef _WIN32 +#include +#endif + +namespace VavCore { + +VavCoreLogger& VavCoreLogger::GetInstance() { + static VavCoreLogger instance; + return instance; +} + +void VavCoreLogger::LogDebugF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_DEBUG, format, args); + va_end(args); +} + +void VavCoreLogger::LogInfoF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_INFO, format, args); + va_end(args); +} + +void VavCoreLogger::LogWarningF(const char* format, ...) { + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_WARNING, format, args); + va_end(args); +} + +void VavCoreLogger::LogErrorF(const char* format, ...) 
{ + va_list args; + va_start(args, format); + LogFormattedV(VC_LogLevel::VC_ERROR, format, args); + va_end(args); +} + +void VavCoreLogger::LogDebug(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_DEBUG, message, source); +} + +void VavCoreLogger::LogInfo(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_INFO, message, source); +} + +void VavCoreLogger::LogWarning(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_WARNING, message, source); +} + +void VavCoreLogger::LogError(const std::string& message, const char* source) { + LogString(VC_LogLevel::VC_ERROR, message, source); +} + +void VavCoreLogger::LogFormattedV(VC_LogLevel level, const char* format, va_list args) { + // Format message + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), format, args); + + // Output to console + if (level == VC_LogLevel::VC_ERROR || level == VC_LogLevel::VC_WARNING) { + std::cerr << buffer; + if (buffer[strlen(buffer) - 1] != '\n') { + std::cerr << std::endl; + } + } else { + std::cout << buffer; + if (buffer[strlen(buffer) - 1] != '\n') { + std::cout << std::endl; + } + } + +#ifdef _WIN32 + // Output to Visual Studio debug console + OutputDebugStringA(buffer); + if (buffer[strlen(buffer) - 1] != '\n') { + OutputDebugStringA("\n"); + } +#endif +} + +void VavCoreLogger::LogString(VC_LogLevel level, const std::string& message, const char* source) { + std::string fullMessage; + if (source) { + fullMessage = "[" + std::string(source) + "] " + message; + } else { + fullMessage = message; + } + + // Output to console + if (level == VC_LogLevel::VC_ERROR || level == VC_LogLevel::VC_WARNING) { + std::cerr << fullMessage << std::endl; + } else { + std::cout << fullMessage << std::endl; + } + +#ifdef _WIN32 + // Output to Visual Studio debug console + OutputDebugStringA(fullMessage.c_str()); + OutputDebugStringA("\n"); +#endif +} + +const char* VavCoreLogger::GetLevelString(VC_LogLevel level) { + switch (level) { + case VC_LogLevel::VC_DEBUG: return "[DEBUG]"; + case VC_LogLevel::VC_INFO: return "[INFO]"; + case VC_LogLevel::VC_WARNING: return "[WARNING]"; + case VC_LogLevel::VC_ERROR: return "[ERROR]"; + default: return "[UNKNOWN]"; + } +} + +} // namespace VavCore diff --git a/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h new file mode 100644 index 0000000..1c26821 --- /dev/null +++ b/vav2/platforms/windows/vavcore/src/Common/VavCoreLogger.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + +namespace VavCore { + +// Simple singleton logger for VavCore +class VavCoreLogger { +public: + enum class VC_LogLevel { + VC_DEBUG, + VC_INFO, + VC_WARNING, + VC_ERROR + }; + + static VavCoreLogger& GetInstance(); + + // Delete copy/move constructors + VavCoreLogger(const VavCoreLogger&) = delete; + VavCoreLogger& operator=(const VavCoreLogger&) = delete; + + // Printf-style formatted logging + void LogDebugF(const char* format, ...); + void LogInfoF(const char* format, ...); + void LogWarningF(const char* format, ...); + void LogErrorF(const char* format, ...); + + // std::string-style logging with optional source + void LogDebug(const std::string& message, const char* source = nullptr); + void LogInfo(const std::string& message, const char* source = nullptr); + void LogWarning(const std::string& message, const char* source = nullptr); + void LogError(const std::string& message, const char* source = nullptr); + +private: + VavCoreLogger() = default; + + void 
LogFormattedV(VC_LogLevel level, const char* format, va_list args); + void LogString(VC_LogLevel level, const std::string& message, const char* source); + const char* GetLevelString(VC_LogLevel level); +}; + +} // namespace VavCore + +// Global logging macros for VavCore + +// LOGF_* : For printf-style formatted messages (const char* format, ...) +// Usage: LOGF_INFO("[Component] Value: %d", value) +#define LOGF_DEBUG(...) VavCore::VavCoreLogger::GetInstance().LogDebugF(__VA_ARGS__) +#define LOGF_INFO(...) VavCore::VavCoreLogger::GetInstance().LogInfoF(__VA_ARGS__) +#define LOGF_WARNING(...) VavCore::VavCoreLogger::GetInstance().LogWarningF(__VA_ARGS__) +#define LOGF_ERROR(...) VavCore::VavCoreLogger::GetInstance().LogErrorF(__VA_ARGS__) + +// LOGS_* : For std::string messages with optional source parameter +// Usage: LOGS_INFO(errorMessage, "NetworkModule") +#define LOGS_DEBUG(msg, source) VavCore::VavCoreLogger::GetInstance().LogDebug(msg, source) +#define LOGS_INFO(msg, source) VavCore::VavCoreLogger::GetInstance().LogInfo(msg, source) +#define LOGS_WARNING(msg, source) VavCore::VavCoreLogger::GetInstance().LogWarning(msg, source) +#define LOGS_ERROR(msg, source) VavCore::VavCoreLogger::GetInstance().LogError(msg, source) diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index 8b10bc5..77f2d49 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -20,6 +20,7 @@ #include "NVDECAV1Decoder.h" #include "D3D12SurfaceHandler.h" #include "VideoDecoderFactory.h" +#include "../Common/VavCoreLogger.h" const char* g_deinterleave_kernel_ptx = R"PTX( // @@ -835,8 +836,13 @@ void NVDECAV1Decoder::CleanupCUDA() { // NVDEC Callbacks int CUDAAPI NVDECAV1Decoder::HandleVideoSequence(void* user_data, CUVIDEOFORMAT* format) { + if (!user_data || !format) { + return 0; + } + + // user_data is always NVDECAV1Decoder* in the simplified design auto* decoder = static_cast(user_data); - if (!decoder || !format) { + if (!decoder) { return 0; } @@ -918,120 +924,97 @@ int CUDAAPI NVDECAV1Decoder::HandleVideoSequence(void* user_data, CUVIDEOFORMAT* return format->min_num_decode_surfaces; } +// Simplified HandlePictureDecode: Use NVDEC's CurrPicIdx directly as ring buffer slot index int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS* pic_params) { - OutputDebugStringA("[HandlePictureDecode] === CALLBACK CALLED ===\n"); + if (!user_data || !pic_params) { + LOGF_ERROR("[HandlePictureDecode] Invalid user_data or pic_params"); + return 0; + } auto* decoder = static_cast(user_data); - if (!decoder || !pic_params) { - OutputDebugStringA("[HandlePictureDecode] ERROR: Invalid user_data or pic_params\n"); + + // Use NVDEC's CurrPicIdx directly as slot index (0-7) + int slot_idx = pic_params->CurrPicIdx; + + LOGF_DEBUG("[HandlePictureDecode] CurrPicIdx=%d, IntraPicFlag=%d", + slot_idx, pic_params->intra_pic_flag); + + // Validate slot index (should always be 0-7) + if (slot_idx < 0 || slot_idx >= RING_BUFFER_SIZE) { + LOGF_ERROR("[HandlePictureDecode] Invalid CurrPicIdx=%d", slot_idx); return 0; } - if (!decoder->m_decoder) { - OutputDebugStringA("[HandlePictureDecode] ERROR: m_decoder is nullptr!\n"); - return 0; + DecodeSlot& slot = decoder->m_ringBuffer[slot_idx]; + + // Find pending submission context using submission_id + uint64_t submission_id; + size_t pending_idx; + { + std::lock_guard 
lock(decoder->m_submissionMutex); + + // Find the most recent pending submission + submission_id = decoder->m_submissionCounter.load() - 1; + pending_idx = submission_id % RING_BUFFER_SIZE; + + auto& pending = decoder->m_pendingSubmissions[pending_idx]; + + // Copy pending submission context to decode slot + slot.target_surface = pending.target_surface; + slot.surface_type = pending.surface_type; + slot.submission_id = pending.submission_id; + + // Release pending slot for reuse + pending.in_use.store(false); } + slot.picture_index = slot_idx; // Store CurrPicIdx (same as slot_idx) + slot.in_use.store(true); + slot.is_ready.store(false); + + LOGF_INFO("[HandlePictureDecode] Calling cuvidDecodePicture (slot=%d, submission_id=%llu, pending_idx=%zu)", + slot_idx, slot.submission_id, pending_idx); + // Ensure CUDA context is current before decoding if (decoder->m_cuContext) { CUresult ctxResult = cuCtxSetCurrent(decoder->m_cuContext); if (ctxResult != CUDA_SUCCESS) { - char debug_buf[256]; - sprintf_s(debug_buf, "[HandlePictureDecode] Failed to set CUDA context: %d\n", ctxResult); - OutputDebugStringA(debug_buf); + LOGF_ERROR("[HandlePictureDecode] Failed to set CUDA context: %d", ctxResult); + slot.in_use.store(false); return 0; } } - // Store mapping: picture_index (CurrPicIdx) slot_id (last allocated) - // NVDEC ignores our timestamp and uses WebM PTS, so we need this mapping - uint64_t current_slot_id = decoder->m_slotIdCounter.load() - 1; - { - std::lock_guard lock(decoder->m_mappingMutex); - decoder->m_pictureIndexToSlotId[pic_params->CurrPicIdx] = current_slot_id; - } - - char debug_buf[512]; - sprintf_s(debug_buf, "[HandlePictureDecode] Calling cuvidDecodePicture (decoder=%p, CurrPicIdx=%d, slot_id=%llu, IntraPicFlag=%d)\n", - decoder->m_decoder, pic_params->CurrPicIdx, current_slot_id, pic_params->intra_pic_flag); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - + // Submit frame to NVDEC decoder CUresult result = cuvidDecodePicture(decoder->m_decoder, pic_params); if (result != CUDA_SUCCESS) { - sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture failed with code %d\n", result); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); - decoder->LogCUDAError(result, "cuvidDecodePicture"); + LOGF_ERROR("[HandlePictureDecode] cuvidDecodePicture failed: %d", result); + slot.in_use.store(false); return 0; } - sprintf_s(debug_buf, "[HandlePictureDecode] cuvidDecodePicture succeeded (CurrPicIdx=%d)\n", pic_params->CurrPicIdx); - OutputDebugStringA(debug_buf); - printf("%s", debug_buf); + // Polling thread will check cuvidGetDecodeStatus() for this picture_index - return 1; // Success + return 1; } int CUDAAPI NVDECAV1Decoder::HandlePictureDisplay(void* user_data, CUVIDPARSERDISPINFO* disp_info) { - auto* decoder = static_cast(user_data); - if (!decoder || !disp_info) { - OutputDebugStringA("[HandlePictureDisplay] ERROR: Invalid user_data or disp_info\n"); + if (!user_data || !disp_info) { + LOGF_ERROR("[HandlePictureDisplay] Invalid user_data or disp_info"); return 0; } - // Lookup slot_id from mapping table (picture_index slot_id) - uint64_t slot_id; - { - std::lock_guard lock(decoder->m_mappingMutex); - auto it = decoder->m_pictureIndexToSlotId.find(disp_info->picture_index); - if (it == decoder->m_pictureIndexToSlotId.end()) { - char err_buf[256]; - sprintf_s(err_buf, "[HandlePictureDisplay] ERROR: picture_index=%d not found in mapping table\n", - disp_info->picture_index); - OutputDebugStringA(err_buf); - printf("%s", err_buf); - return 0; - } - slot_id = 
@@ -1202,175 +1185,147 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                                       VavCoreSurfaceType target_type, void* target_surface,
                                       VideoFrame& output_frame) {
-    char debug_buf[256];
-    sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] Called with target_type=%d (0=CPU, 1=D3D11, 2=D3D12, 3=CUDA, 4=AMF)\n",
-              static_cast<int>(target_type));
-    OutputDebugStringA(debug_buf);
+    LOGF_DEBUG("[DecodeToSurface] Called with target_type=%d", static_cast<int>(target_type));
 
     if (!m_initialized || !packet_data) {
-        OutputDebugStringA("[NVDECAV1Decoder::DecodeToSurface] ERROR: Not initialized or null packet_data\n");
+        LOGF_ERROR("[DecodeToSurface] Not initialized or null packet_data");
         return false;
     }
 
-    // CRITICAL: Set CUDA context for current thread (may be background thread)
-    // Thread safety: Lock mutex ONLY for context switching, then release immediately
+    // Set CUDA context for current thread
    {
         std::lock_guard<std::mutex> contextLock(m_cudaContextMutex);
         CUresult ctxResult = cuCtxSetCurrent(m_cuContext);
         if (ctxResult != CUDA_SUCCESS) {
-            sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] ERROR: cuCtxSetCurrent failed with code %d\n", ctxResult);
-            OutputDebugStringA(debug_buf);
+            LOGF_ERROR("[DecodeToSurface] cuCtxSetCurrent failed with code %d", ctxResult);
             return false;
         }
-    }  // Unlock mutex immediately after context switch
+    }
 
     if (!SupportsSurfaceType(target_type)) {
-        sprintf_s(debug_buf, "[NVDECAV1Decoder::DecodeToSurface] ERROR: Surface type %d not supported\n",
-                  static_cast<int>(target_type));
-        OutputDebugStringA(debug_buf);
+        LOGF_ERROR("[DecodeToSurface] Surface type %d not supported",
+                   static_cast<int>(target_type));
         return false;
     }
 
     try {
         auto start_time = std::chrono::high_resolution_clock::now();
 
-        // ===== Phase 1: Option C Slot Allocation =====
-        // 1. Allocate unique slot ID (serves as both ID and submission order)
-        uint64_t my_id = m_slotIdCounter.fetch_add(1);
-        size_t slot_idx = my_id % RING_BUFFER_SIZE;
-        DecodeSlot& my_slot = m_ringBuffer[slot_idx];
+        // ===== Component 1: Submission Preparation =====
+        // 1. Allocate submission ID for FIFO ordering
+        uint64_t my_submission_id = m_submissionCounter.fetch_add(1);
+        size_t pending_idx = my_submission_id % RING_BUFFER_SIZE;
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Allocated slot_id=%llu, slot_idx=%zu\n", my_id, slot_idx);
-        OutputDebugStringA(debug_buf);
+        LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu",
+                   my_submission_id, pending_idx);
 
-        // 2. Check for overflow (RingBuffer full)
-        {
-            std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-
-            if (my_slot.in_use) {
-                // RingBuffer overflow - too many concurrent decodes
-                sprintf_s(debug_buf, "[DecodeToSurface] ERROR: RingBuffer slot %zu still in use (overflow)\n", slot_idx);
-                OutputDebugStringA(debug_buf);
-                LogError("RingBuffer overflow - max 8 concurrent decodes");
-                return false;
-            }
-
-            // 3. Initialize slot (multi-frame support)
-            my_slot.in_use = true;
-            my_slot.target_surface = target_surface;
-            my_slot.surface_type = target_type;
-            my_slot.picture_indices.clear();  // Clear previous frames
-            my_slot.is_ready = false;
-        }
-
-        // ===== Component 2: Packet Submission =====
-        // 4. Send packet directly without prepending codec private data
-        const uint8_t* final_packet_data = packet_data;
-        size_t final_packet_size = packet_size;
-
-        if (!m_firstFrameSent) {
-            m_firstFrameSent = true;
-            sprintf_s(debug_buf, "[DecodeToSurface] First frame: sending packet directly without codec private data (%zu bytes)\n", packet_size);
-            OutputDebugStringA(debug_buf);
-            printf("%s", debug_buf);
-        }
-
-        // 5. Option C: Submit packet with full slot_id in timestamp (NOT modulo!)
-        CUVIDSOURCEDATAPACKET packet = {};
-        packet.payload = final_packet_data;
-        packet.payload_size = static_cast<unsigned long>(final_packet_size);
-        packet.flags = CUVID_PKT_ENDOFPICTURE;
-        packet.timestamp = static_cast<long long>(my_id);  // Pass full slot_id for HandlePictureDecode
-
-        sprintf_s(debug_buf, "[DecodeToSurface] Calling cuvidParseVideoData with timestamp=%lld (slot_id=%llu, slot_idx=%zu)...\n", packet.timestamp, my_id, slot_idx);
-        OutputDebugStringA(debug_buf);
-
-        CUresult result = cuvidParseVideoData(m_parser, &packet);
-        if (result != CUDA_SUCCESS) {
-            sprintf_s(debug_buf, "[DecodeToSurface] cuvidParseVideoData failed with code %d\n", result);
-            OutputDebugStringA(debug_buf);
-            LogCUDAError(result, "cuvidParseVideoData");
-            m_decodeErrors++;
-
-            // Release slot on error
-            {
-                std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-                my_slot.in_use = false;
-            }
-            return false;
-        }
-        OutputDebugStringA("[DecodeToSurface] cuvidParseVideoData succeeded\n");
-
-        // ===== Component 4: Wait for decode to complete FIRST =====
-        // 6. Wait for decode to complete (HandlePictureDisplay signals this)
-        {
-            std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
-
-            sprintf_s(debug_buf, "[DecodeToSurface] Waiting for decode complete (slot_idx=%zu, is_ready=%d)...\n",
-                      slot_idx, my_slot.is_ready);
-            OutputDebugStringA(debug_buf);
-
-            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(10000),
-                [&my_slot]() { return my_slot.is_ready; })) {
-                // Timeout - decode took too long
-                sprintf_s(debug_buf, "[DecodeToSurface] ERROR: Decode timeout for slot_idx=%zu (is_ready=%d)\n",
-                          slot_idx, my_slot.is_ready);
-                OutputDebugStringA(debug_buf);
-                printf("%s", debug_buf);
-                LogError("Decode timeout");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);  // Option C: Advance to unblock next thread
-                return false;
-            }
-
-            sprintf_s(debug_buf, "[DecodeToSurface] Decode complete wait finished (slot_idx=%zu)\n", slot_idx);
-            OutputDebugStringA(debug_buf);
-        }
-
-        // ===== Component 5: Option C Sequential Return Wait (FIFO Guarantee) =====
-        // 5. Wait for my turn using slot_id (NOT slot_idx!) AFTER decoding completes
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Waiting for FIFO turn slot_id=%llu (current returnIdCounter=%llu)...\n",
-                  my_id, m_returnIdCounter.load());
-        OutputDebugStringA(debug_buf);
-
-        while (m_returnIdCounter.load() != my_id) {
+        // 2. Wait if pending slot is still in use (overflow protection)
+        while (m_pendingSubmissions[pending_idx].in_use.load()) {
             std::this_thread::sleep_for(std::chrono::milliseconds(1));
         }
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: slot_id=%llu is now return head\n", my_id);
-        OutputDebugStringA(debug_buf);
+        // 3. Store submission context in ring buffer slot
+        {
+            std::lock_guard<std::mutex> lock(m_submissionMutex);
+            auto& pending = m_pendingSubmissions[pending_idx];
 
-        // ===== Component 5: Option C Frame Retrieval (Multi-Frame Support) =====
-        // Get first frame from multi-frame packet (picture_indices[0])
-        if (my_slot.picture_indices.empty()) {
-            sprintf_s(debug_buf, "[DecodeToSurface] ERROR: No picture_indices in slot %zu\n", slot_idx);
-            OutputDebugStringA(debug_buf);
-            LogError("No frames decoded");
-            my_slot.in_use = false;
-            m_returnIdCounter.fetch_add(1);
+            pending.target_surface = target_surface;
+            pending.surface_type = target_type;
+            pending.submission_id = my_submission_id;
+            pending.in_use.store(true);
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
+                   my_submission_id, pending_idx);
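Note: the handoff that steps 1-3 set up, reduced to a sketch (names from this diff; correctness hinges on cuvidParseVideoData being synchronous and on one packet producing at most one HandlePictureDecode call):

  // Thread T (DecodeToSurface):
  //   1. id = m_submissionCounter.fetch_add(1)          -> take a ticket
  //   2. m_pendingSubmissions[id % 8] = {surface, type} -> publish context
  //   3. cuvidParseVideoData(m_parser, &packet)         -> blocks; inside it,
  //      HandlePictureDecode reads m_submissionCounter - 1 and copies the
  //      context into m_ringBuffer[CurrPicIdx] before the call returns.

If two threads ever sit between steps 2 and 3 at the same time, the callback's "most recent pending submission" lookup can pair a picture with the wrong context; worth flagging in the design doc if multi-threaded submission is expected.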
+
+        // ===== Component 2: Packet Submission =====
+        // 4. Submit packet to NVDEC parser (synchronous)
+        CUVIDSOURCEDATAPACKET packet = {};
+        packet.payload = packet_data;
+        packet.payload_size = static_cast<unsigned long>(packet_size);
+        packet.flags = CUVID_PKT_ENDOFPICTURE;
+        packet.timestamp = 0;  // Not used - we use submission_id instead
+
+        LOGF_INFO("[DecodeToSurface] Calling cuvidParseVideoData (submission_id=%llu)...",
+                  my_submission_id);
+
+        CUresult result = cuvidParseVideoData(m_parser, &packet);
+        // cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode is called before it returns
+
+        if (result != CUDA_SUCCESS) {
+            LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result);
+            LogCUDAError(result, "cuvidParseVideoData");
+            m_decodeErrors++;
+
+            // Release pending slot on error
+            {
+                std::lock_guard<std::mutex> lock(m_submissionMutex);
+                m_pendingSubmissions[pending_idx].in_use.store(false);
+            }
             return false;
         }
 
-        int frameIdx = my_slot.picture_indices[0];  // Return first frame only
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu decode complete, using first frame (picture_index=%d, total_frames=%zu)\n",
-                  slot_idx, frameIdx, my_slot.picture_indices.size());
-        OutputDebugStringA(debug_buf);
+        LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
+
+        // ===== Component 4: Wait and Retrieve =====
+        // 5. Find which slot NVDEC used (check all slots for our submission_id)
+        int my_slot_idx = -1;
+        for (int i = 0; i < RING_BUFFER_SIZE; ++i) {
+            if (m_ringBuffer[i].submission_id == my_submission_id) {
+                my_slot_idx = i;
+                break;
+            }
+        }
+
+        if (my_slot_idx == -1) {
+            LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id);
+            return false;
+        }
+
+        DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
+        LOGF_DEBUG("[DecodeToSurface] Found slot_idx=%d for submission_id=%llu", my_slot_idx, my_submission_id);
+
+        // 6. Wait for my turn in FIFO order
+        while (m_returnCounter.load() != my_submission_id) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id);
+
+        // 7. Wait for decode to complete
+        {
+            std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
+
+            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
+                [&my_slot]() { return my_slot.is_ready.load(); })) {
+                // Timeout - decode took too long
+                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);  // Skip to avoid deadlock
+                return false;
+            }
+        }
+
+        LOGF_DEBUG("[DecodeToSurface] Decode complete for slot %d", my_slot_idx);
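Note: the ordering gate in steps 6-7, reduced to its core (sketch; m_returnCounter only ever advances, so each submission id passes the gate exactly once):

  std::atomic<uint64_t> return_counter{0};

  void WaitMyTurn(uint64_t my_id) {   // FIFO gate: spin until all predecessors returned
      while (return_counter.load() != my_id)
          std::this_thread::sleep_for(std::chrono::milliseconds(1));
  }

  void ReleaseTurn() {                // unblocks my_id + 1
      return_counter.fetch_add(1);
  }

One consequence worth recording: the 500 ms decode wait starts only after the FIFO gate, so a stalled predecessor delays, but does not consume, this frame's timeout budget.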
+
+        // ===== Component 5: Frame Retrieval & Cleanup =====
+        // 8. Map decoded frame from NVDEC using the slot's picture_index
+        int pic_idx = my_slot.picture_index;  // CurrPicIdx from NVDEC
 
-        // For CUDA device surface, we need to map the decoded frame
         if (target_type == VAVCORE_SURFACE_CUDA_DEVICE) {
-            OutputDebugStringA("[DecodeToSurface] CUDA device path\n");
+            // CUDA device surface path
+            LOGF_DEBUG("[DecodeToSurface] CUDA device path");
 
-            // 7. Map decoded frame from NVDEC
             CUdeviceptr devicePtr = 0;
             unsigned int pitch = 0;
             CUVIDPROCPARAMS procParams = {};
             procParams.progressive_frame = 1;
 
-            result = cuvidMapVideoFrame(m_decoder, frameIdx, &devicePtr, &pitch, &procParams);
+            CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &devicePtr, &pitch, &procParams);
             if (result != CUDA_SUCCESS) {
+                LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx);
                 LogCUDAError(result, "cuvidMapVideoFrame");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1385,18 +1340,17 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             output_frame.frame_index = m_framesDecoded;
             output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
 
-            // 9. Unmap frame
+            // Unmap frame
             cuvidUnmapVideoFrame(m_decoder, devicePtr);
 
         } else if (target_type == VAVCORE_SURFACE_D3D12_RESOURCE) {
             // D3D12 resource path using D3D12SurfaceHandler
-            OutputDebugStringA("[DecodeToSurface] D3D12 resource path\n");
+            LOGF_DEBUG("[DecodeToSurface] D3D12 resource path");
 
             if (!target_surface || !m_d3d12Device) {
-                OutputDebugStringA("[DecodeToSurface] ERROR: target_surface or m_d3d12Device is null\n");
-                LogError("D3D12 resource or device not available");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                LOGF_ERROR("[DecodeToSurface] target_surface or m_d3d12Device is null");
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1414,55 +1368,43 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                 m_cachedD3D12Device = m_d3d12Device;
                 m_cachedCuContext = m_cuContext;
 
-                OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler (re)created\n");
+                LOGF_DEBUG("[DecodeToSurface] D3D12SurfaceHandler (re)created");
             }
 
-            sprintf_s(debug_buf, "[DecodeToSurface] Using decoded frame: frameIdx=%d\n", frameIdx);
-            OutputDebugStringA(debug_buf);
-
-            // 7. Map decoded NVDEC frame
+            // Map decoded NVDEC frame
             CUVIDPROCPARAMS procParams = {};
             procParams.progressive_frame = 1;
-            procParams.top_field_first = 0;
-            procParams.unpaired_field = 0;
 
             CUdeviceptr srcDevicePtr = 0;
             unsigned int srcPitch = 0;
-            result = cuvidMapVideoFrame(m_decoder, frameIdx, &srcDevicePtr, &srcPitch, &procParams);
+            CUresult result = cuvidMapVideoFrame(m_decoder, pic_idx, &srcDevicePtr, &srcPitch, &procParams);
             if (result != CUDA_SUCCESS) {
-                sprintf_s(debug_buf, "[DecodeToSurface] cuvidMapVideoFrame failed with code %d\n", result);
-                OutputDebugStringA(debug_buf);
+                LOGF_ERROR("[DecodeToSurface] cuvidMapVideoFrame failed for pic_idx=%d", pic_idx);
                 LogCUDAError(result, "cuvidMapVideoFrame");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
 
-            sprintf_s(debug_buf, "[DecodeToSurface] cuvidMapVideoFrame succeeded: srcDevicePtr=%p, srcPitch=%u\n",
-                      (void*)srcDevicePtr, srcPitch);
-            OutputDebugStringA(debug_buf);
+            LOGF_DEBUG("[DecodeToSurface] cuvidMapVideoFrame succeeded: srcDevicePtr=%p, srcPitch=%u",
+                       (void*)srcDevicePtr, srcPitch);
 
-            // 8. Copy to D3D12 surface
-            ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(target_surface);
+            // Copy to D3D12 surface (use target_surface from slot)
+            ID3D12Resource* d3d12Resource = static_cast<ID3D12Resource*>(my_slot.target_surface);
             bool copySuccess = m_d3d12Handler->CopyNV12Frame(
                 srcDevicePtr, srcPitch, d3d12Resource, m_width, m_height
             );
 
-            // 9. Unmap frame
-            result = cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
-            if (result != CUDA_SUCCESS) {
-                sprintf_s(debug_buf, "[DecodeToSurface] cuvidUnmapVideoFrame failed with code %d\n", result);
-                OutputDebugStringA(debug_buf);
-                LogCUDAError(result, "cuvidUnmapVideoFrame");
-            }
+            // Unmap frame
+            cuvidUnmapVideoFrame(m_decoder, srcDevicePtr);
 
             if (!copySuccess) {
-                OutputDebugStringA("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed\n");
-                my_slot.in_use = false;
-                m_returnIdCounter.fetch_add(1);
+                LOGF_ERROR("[DecodeToSurface] D3D12SurfaceHandler::CopyNV12Frame failed");
+                my_slot.in_use.store(false);
+                m_returnCounter.fetch_add(1);
                 return false;
             }
@@ -1470,7 +1412,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             m_d3d12Handler->SignalD3D12Fence(++m_fenceValue);
             output_frame.sync_fence_value = m_fenceValue;
 
-            OutputDebugStringA("[DecodeToSurface] D3D12 frame processing complete\n");
+            LOGF_DEBUG("[DecodeToSurface] D3D12 frame processing complete");
 
             // Fill output frame metadata
             output_frame.width = m_width;
@@ -1480,17 +1422,16 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
             output_frame.timestamp_seconds = static_cast<double>(m_framesDecoded) / 30.0;
         }
 
-        // 10. Release slot
+        // 9. Release slot
         {
             std::lock_guard<std::mutex> lock(my_slot.slot_mutex);
-            my_slot.in_use = false;
+            my_slot.in_use.store(false);
         }
 
-        // 11. Option C: Advance return ID counter (FIFO order)
-        m_returnIdCounter.fetch_add(1);
+        LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx);
 
-        sprintf_s(debug_buf, "[DecodeToSurface] Option C: Slot %zu released, returnIdCounter advanced to %llu\n", slot_idx, m_returnIdCounter.load());
-        OutputDebugStringA(debug_buf);
+        // 10. Advance return counter (FIFO order)
+        m_returnCounter.fetch_add(1);
 
         // Update statistics
         auto end_time = std::chrono::high_resolution_clock::now();
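Note: both decode paths above bracket the copy with cuvidMapVideoFrame/cuvidUnmapVideoFrame around several early returns. A small RAII wrapper (our own sketch, not an NVDEC API) would keep the pair balanced on every exit path:

  struct MappedFrame {
      CUvideodecoder dec;
      CUdeviceptr ptr = 0;
      unsigned int pitch = 0;
      bool ok = false;

      MappedFrame(CUvideodecoder d, int pic_idx, CUVIDPROCPARAMS* vpp) : dec(d) {
          ok = (cuvidMapVideoFrame(d, pic_idx, &ptr, &pitch, vpp) == CUDA_SUCCESS);
      }
      ~MappedFrame() { if (ok) cuvidUnmapVideoFrame(dec, ptr); }  // always unmapped
  };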
@@ -1537,77 +1478,51 @@ bool NVDECAV1Decoder::SetupCUDAD3D12Interop(void* d3d_device) {
     return true;
 }
 
+// Simplified Polling Thread: Check all 8 slots for decode completion
 void NVDECAV1Decoder::PollingThreadFunc() {
-    OutputDebugStringA("[PollingThread] Started\n");
-    printf("[PollingThread] Started\n");
+    LOGF_INFO("[PollingThread] Started");
 
     // Set CUDA context once at thread start
     if (m_cuContext) {
         std::lock_guard<std::mutex> lock(m_cudaContextMutex);
         cuCtxSetCurrent(m_cuContext);
-        OutputDebugStringA("[PollingThread] CUDA context set\n");
+        LOGF_DEBUG("[PollingThread] CUDA context set");
     }
 
-    int poll_count = 0;
     while (m_pollingRunning) {
-        // Option C: Get current return ID and calculate slot index
-        uint64_t current_return_id = m_returnIdCounter.load();
-        size_t slot_idx = current_return_id % RING_BUFFER_SIZE;
-        DecodeSlot& slot = m_ringBuffer[slot_idx];
+        // 1. Check all slots for pending decodes
+        for (int slot_idx = 0; slot_idx < RING_BUFFER_SIZE; ++slot_idx) {
+            DecodeSlot& slot = m_ringBuffer[slot_idx];
 
-        // Debug logging every 100 iterations
-        if (poll_count % 100 == 0) {
-            char debug_buf[256];
-            sprintf_s(debug_buf, "[PollingThread] Option C: Poll #%d: returnId=%llu, slot=%zu, in_use=%d, is_ready=%d, frames=%zu\n",
-                      poll_count, current_return_id, slot_idx, slot.in_use, slot.is_ready, slot.picture_indices.size());
-            OutputDebugStringA(debug_buf);
-        }
-        poll_count++;
-
-        // Check if slot is in use and not yet ready
-        if (slot.in_use && !slot.is_ready) {
-            // Option C Multi-Frame Support: Get copy of picture indices
-            std::vector<int> picture_indices_copy;
-            {
-                std::lock_guard<std::mutex> lock(slot.slot_mutex);
-                picture_indices_copy = slot.picture_indices;
+            // 2. Skip slots not in use or already ready
+            if (!slot.in_use.load() || slot.is_ready.load()) {
+                continue;
             }
 
-            // Check if ALL frames are decoded
-            bool all_complete = true;
-            for (int pic_idx : picture_indices_copy) {
-                CUVIDGETDECODESTATUS decodeStatus = {};
-                CUresult result = cuvidGetDecodeStatus(m_decoder, pic_idx, &decodeStatus);
+            // 3. Query NVDEC decode status for this slot's picture
+            CUVIDGETDECODESTATUS decodeStatus = {};
+            CUresult result = cuvidGetDecodeStatus(m_decoder, slot.picture_index, &decodeStatus);
 
-                if (result != CUDA_SUCCESS ||
-                    decodeStatus.decodeStatus != cuvidDecodeStatus_Success) {
-                    all_complete = false;
-                    break;
-                }
-            }
+            // 4. If decode complete, mark slot ready
+            if (result == CUDA_SUCCESS &&
+                decodeStatus.decodeStatus == cuvidDecodeStatus_Success) {
 
-            // If all frames complete, signal ready
-            if (all_complete && !picture_indices_copy.empty()) {
                 {
                     std::lock_guard<std::mutex> lock(slot.slot_mutex);
-                    slot.is_ready = true;
+                    slot.is_ready.store(true);
                 }
                 slot.frame_ready.notify_one();
 
-                char debug_buf[256];
-                sprintf_s(debug_buf, "[PollingThread] Option C: Slot %zu ALL frames complete (%zu frames)\n",
-                          slot_idx, picture_indices_copy.size());
-                OutputDebugStringA(debug_buf);
-                printf("%s", debug_buf);
+                LOGF_DEBUG("[PollingThread] Slot %d ready (submission_id=%llu)",
+                           slot_idx, slot.submission_id);
             }
         }
 
-        // Sleep to avoid busy-wait (~1% CPU usage)
+        // 5. Sleep to avoid busy-wait (~1% CPU usage)
         std::this_thread::sleep_for(std::chrono::microseconds(100));
     }
 
-    OutputDebugStringA("[PollingThread] Stopped\n");
-    printf("[PollingThread] Stopped\n");
+    LOGF_INFO("[PollingThread] Stopped");
 }
 
 } // namespace VavCore
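Note: the slot lifecycle implied by the two files, recorded here for the design doc (the ordering relies on the atomics declared below: HandlePictureDecode writes picture_index before in_use.store(true), and the polling thread checks in_use first, so it never reads a stale picture_index):

  // free     : in_use == false, is_ready == false
  // pending  : DecodeToSurface published a PendingSubmission entry
  // decoding : HandlePictureDecode copied it into m_ringBuffer[CurrPicIdx], in_use = true
  // ready    : PollingThreadFunc saw cuvidDecodeStatus_Success -> is_ready = true + notify
  // free     : the FIFO-head DecodeToSurface mapped/copied the frame, in_use = false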
diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
index c44d3cf..8da4cab 100644
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.h
@@ -20,6 +20,8 @@ typedef struct CUexternalSemaphore_st* cudaExternalSemaphore_t;
 
 namespace VavCore {
 
+// Removed ParseContext - not needed with CurrPicIdx direct usage
+
 // NVIDIA NVDEC-based AV1 decoder for hardware acceleration
 class NVDECAV1Decoder : public IVideoDecoder {
 public:
@@ -145,33 +147,40 @@ private:
     // RingBuffer for asynchronous decoding (ulNumOutputSurfaces = 8)
     static constexpr size_t RING_BUFFER_SIZE = 8;
 
+    // Simplified DecodeSlot structure (no picture_indices vector)
     struct DecodeSlot {
         // Slot state
-        bool in_use = false;                  // Is this slot allocated?
+        std::atomic<bool> in_use{false};      // Is this NVDEC slot currently decoding?
 
-        // Submitted information
+        // Submitted information (set by DecodeToSurface)
         void* target_surface = nullptr;       // Destination D3D12 resource
         VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
+        uint64_t submission_id = 0;           // For FIFO ordering (monotonic counter)
 
-        // NVDEC information (from HandlePictureDecode callback)
-        // Multi-frame support: One packet can decode to multiple frames
-        std::vector<int> picture_indices;     // All NVDEC frame indices from this packet
+        // NVDEC information (set by HandlePictureDecode callback)
+        int picture_index = -1;               // CurrPicIdx from NVDEC (same as slot index)
 
         // Synchronization primitives
-        std::condition_variable frame_ready;  // Signaled when ALL frames are decoded
+        std::condition_variable frame_ready;  // Signaled when decode complete
         std::mutex slot_mutex;                // Protects this slot's state
-        bool is_ready = false;                // All frames decoded flag
+        std::atomic<bool> is_ready{false};    // Decode completed flag
     };
 
     DecodeSlot m_ringBuffer[RING_BUFFER_SIZE];
 
-    // Option C: Unified slot allocation counters (no mapping needed!)
-    std::atomic<uint64_t> m_slotIdCounter{0};    // Monotonically increasing slot ID (submission order)
-    std::atomic<uint64_t> m_returnIdCounter{0};  // Return order enforcement (FIFO)
+    // Simplified: Only submission ID for FIFO ordering
+    std::atomic<uint64_t> m_submissionCounter{0};  // Monotonic submission ID
+    std::atomic<uint64_t> m_returnCounter{0};      // Return order enforcement (FIFO)
 
-    // Mapping: picture_index → slot_id (NVDEC ignores our timestamp and uses WebM PTS)
-    std::unordered_map<int, uint64_t> m_pictureIndexToSlotId;
-    std::mutex m_mappingMutex;
+    // CRITICAL: Ring buffer for pending submissions (multi-thread safe)
+    struct PendingSubmission {
+        void* target_surface = nullptr;
+        VavCoreSurfaceType surface_type = VAVCORE_SURFACE_CPU;
+        uint64_t submission_id = 0;
+        std::atomic<bool> in_use{false};
+    };
+    PendingSubmission m_pendingSubmissions[RING_BUFFER_SIZE];  // Ring buffer for contexts
+    std::mutex m_submissionMutex;  // Protects pending submission allocation
 
     // Polling thread for cuvidGetDecodeStatus
     std::thread m_pollingThread;