Fix NVDEC AV1 decoder: pending-submission lookup, display-only packets, adaptive decode timeout

2025-10-06 14:47:55 +09:00
parent 77b6246c67
commit b37cd1ded0
6 changed files with 118 additions and 35 deletions
--- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp
@@ -904,34 +904,41 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS

    DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];

-    // Find pending submission context using most recent submission_id
-    // cuvidParseVideoData is SYNCHRONOUS - the callback is for the packet we just submitted
-    // Therefore, m_submissionCounter - 1 is the submission_id for THIS packet
+    // Find pending submission context for this decode slot
+    // CRITICAL: Search for the HIGHEST submission_id (most recent packet being processed)
+    // This handles decode callbacks that fire while a later packet is being parsed (e.g. after frame reordering)
    uint64_t submission_id = 0;
    size_t pending_idx = 0;
    bool found = false;
    {
        std::lock_guard<std::mutex> lock(decoder->m_submissionMutex);

-        // Get the most recent submission (the one that triggered this callback)
-        uint64_t current_submission_id = decoder->m_submissionCounter.load() - 1;
-        pending_idx = current_submission_id % RING_BUFFER_SIZE;
+        // Scan every pending entry and keep the active one with the highest submission_id
+        uint64_t current_counter = decoder->m_submissionCounter.load();
+        uint64_t max_submission_id = 0;
+        bool found_any = false;

-        auto& pending = decoder->m_pendingSubmissions[pending_idx];
+        for (size_t i = 0; i < RING_BUFFER_SIZE; i++) {
+            auto& pending = decoder->m_pendingSubmissions[i];

-        // Verify this pending submission is in use and matches the slot
-        if (pending.in_use.load()) {
-            // Copy pending submission context to decode slot
-            slot.target_surface = pending.target_surface;
-            slot.surface_type = pending.surface_type;
-            slot.submission_id = pending.submission_id;
-            submission_id = pending.submission_id;
+            if (pending.in_use.load()) {
+                // Found an active pending submission
+                if (!found_any || pending.submission_id > max_submission_id) {
+                    // This is the newest one so far
+                    slot.target_surface = pending.target_surface;
+                    slot.surface_type = pending.surface_type;
+                    slot.submission_id = pending.submission_id;
+                    submission_id = pending.submission_id;
+                    pending_idx = i;
+                    max_submission_id = pending.submission_id;
+                    found_any = true;
+                    found = true;
+                }
+            }
+        }

-            // Release pending slot for reuse
-            pending.in_use.store(false);
-            found = true;
-        } else {
-            LOGF_ERROR("[HandlePictureDecode] Pending submission slot %zu not in use!", pending_idx);
+        if (!found) {
+            LOGF_ERROR("[HandlePictureDecode] No active pending submission found for slot %d", slot_idx);
        }
    }

@@ -1181,12 +1188,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
        LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu",
                   my_submission_id, pending_idx);

-        // 2. Wait if pending slot is still in use (overflow protection)
-        while (m_pendingSubmissions[pending_idx].in_use.load()) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(1));
-        }
-
-        // 3. Store submission context in ring buffer slot
+        // 2. Store submission context in ring buffer slot (overwrite old data)
+        // No need to wait - ring buffer naturally cycles after 16 submissions
+        // Old pending submissions will be overwritten, which is safe because:
+        // - Decode slots already have their copy of pending data
+        // - 16 slots is enough buffer for B-frame reordering
        {
            std::lock_guard<std::mutex> lock(m_submissionMutex);
            auto& pending = m_pendingSubmissions[pending_idx];
@@ -1194,7 +1200,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
            pending.target_surface = target_surface;
            pending.surface_type = target_type;
            pending.submission_id = my_submission_id;
-            pending.in_use.store(true);
+            pending.in_use.store(true);  // Mark as active for HandlePictureDecode search
        }

        LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
@@ -1212,7 +1218,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
                  my_submission_id);

        CUresult result = cuvidParseVideoData(m_parser, &packet);
-        // cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return
+        // cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return

        if (result != CUDA_SUCCESS) {
            LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result);
@@ -1227,7 +1233,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
            return false;
        }

-        LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
+        LOGF_DEBUG("[DecodeToSurface] Packet submitted, synchronous callbacks completed");
+
+        // IMPORTANT: Do NOT release pending submission here!
+        // Even though cuvidParseVideoData is documented as synchronous, NVDEC's B-frame
+        // reordering means callbacks from THIS packet may execute during FUTURE packets.
+        // Pending submissions will naturally be overwritten when ring buffer wraps (16 slots).
+        LOGF_DEBUG("[DecodeToSurface] Keeping pending_idx=%zu active (will be reused after %d submissions)",
+                   pending_idx, RING_BUFFER_SIZE);

        // ===== Component 4: Wait and Retrieve =====
        // 5. Find which slot NVDEC used (check all slots for our submission_id)
@@ -1240,8 +1253,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
        }

        if (my_slot_idx == -1) {
-            LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id);
-            return false;
+            // Display-only packet: HandlePictureDisplay was called without HandlePictureDecode
+            // This happens when a packet only triggers display of a previously decoded frame
+            // No new frame was decoded, so we return false to indicate no frame is available
+            LOGF_DEBUG("[DecodeToSurface] Display-only packet (no decode) for submission_id=%llu - returning false", my_submission_id);
+
+            m_returnCounter.fetch_add(1);  // Advance counter to unblock FIFO queue
+            return false;  // No frame decoded - caller should use previous frame
        }

        DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
@@ -1254,14 +1272,27 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_

        LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id);

-        // 7. Wait for decode to complete
+        // 7. Wait for decode to complete with adaptive timeout based on resolution
        {
            std::unique_lock<std::mutex> lock(my_slot.slot_mutex);

-            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
+            // Adaptive timeout: base 500ms for 720p, scale by pixel count
+            // 720p (1280x720 = 921,600 pixels) -> 500ms
+            // 1080p (1920x1080 = 2,073,600 pixels) -> 1,125ms
+            // 4K (3840x2160 = 8,294,400 pixels) -> 4,500ms
+            const uint64_t base_pixels = 1280 * 720;  // 720p reference
+            const uint64_t base_timeout_ms = 500;
+            const uint64_t current_pixels = static_cast<uint64_t>(m_width) * m_height;
+            const uint64_t timeout_ms = std::max<uint64_t>(base_timeout_ms,
+                (current_pixels * base_timeout_ms) / base_pixels);
+
+            LOGF_DEBUG("[DecodeToSurface] Adaptive timeout: %llums for %dx%d (%llu pixels)",
+                      timeout_ms, m_width, m_height, current_pixels);
+
+            if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                [&my_slot]() { return my_slot.is_ready.load(); })) {
                // Timeout - decode took too long
-                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx);
+                LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d after %llums", my_slot_idx, timeout_ms);
                my_slot.in_use.store(false);
                m_returnCounter.fetch_add(1);  // Skip to avoid deadlock
                return false;
@@ -1443,6 +1474,8 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_

        LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx);

+        // Note: the pending submission is intentionally NOT released after cuvidParseVideoData; it stays active until its ring-buffer entry is overwritten
+
        // 10. Advance return counter (FIFO order)
        m_returnCounter.fetch_add(1);