This commit is contained in:
2025-10-06 14:47:55 +09:00
parent 77b6246c67
commit b37cd1ded0
6 changed files with 118 additions and 35 deletions

View File

@@ -319,6 +319,11 @@ namespace winrt::Vav2Player::implementation
LogMgr::GetInstance().LogInfo(L"VideoPlayerControl2",
L"Video loaded: " + std::to_wstring(videoWidth) + L"x" + std::to_wstring(videoHeight));
// Prepare video texture before decoding (via FrameProcessor)
if (m_frameProcessor) {
m_frameProcessor->PrepareVideoTexture(videoWidth, videoHeight);
}
// Update AspectFit
UpdateVideoImageAspectFit(videoWidth, videoHeight);

View File

@@ -28,6 +28,13 @@ void FrameProcessor::SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::Dispa
m_dispatcherQueue = queue;
}
// Forwards the video dimensions to the attached renderer so it can prepare its
// video texture before decoding begins. Does nothing when no renderer is set.
// Whatever the renderer call yields is discarded; this function reports nothing.
void FrameProcessor::PrepareVideoTexture(uint32_t width, uint32_t height)
{
    // Nothing to prepare without a renderer.
    if (!m_renderer) {
        return;
    }

    m_renderer->PrepareVideoTexture(width, height);
}
bool FrameProcessor::ProcessFrame(VavCorePlayer* player,
std::function<void(bool success)> onComplete)
{

View File

@@ -26,6 +26,9 @@ public:
// Set dispatcher queue for UI thread callbacks
void SetDispatcherQueue(winrt::Microsoft::UI::Dispatching::DispatcherQueue const& queue);
// Prepare video texture before first decode.
// Passes width/height on to the renderer when one is attached; otherwise a no-op.
// No status is reported back to the caller.
void PrepareVideoTexture(uint32_t width, uint32_t height);
// Process single frame (called from PlaybackController timing thread)
// Returns: true if frame processing started, false if skipped (previous frame still rendering)
// onComplete: Callback invoked on UI thread after render completes (success flag)

View File

@@ -62,7 +62,10 @@ HRESULT D3D12VideoRenderer::InitializeWithSwapChain(
}
void D3D12VideoRenderer::Shutdown() {
WaitForGPU();
// Only wait for GPU if we have a valid command queue
if (m_commandQueue && m_fence && m_fenceEvent) {
WaitForGPU();
}
// Shutdown backends
if (m_rgbaSurfaceBackend) {
@@ -224,6 +227,24 @@ void D3D12VideoRenderer::SetSwapChainPanel(winrt::Microsoft::UI::Xaml::Controls:
m_swapChainPanel = panel;
}
// Creates the RGBA video texture ahead of decoding.
//
// Returns:
//   E_NOT_VALID_STATE - the renderer has not been initialized.
//   failure HRESULT   - propagated unchanged from the RGBA backend's CreateVideoTexture.
//   S_OK              - texture created, or no RGBA backend exists (nothing to do).
HRESULT D3D12VideoRenderer::PrepareVideoTexture(uint32_t width, uint32_t height) {
    if (!m_initialized) {
        return E_NOT_VALID_STATE;
    }

    // Without an RGBA backend there is no texture to create; treated as success.
    if (!m_rgbaSurfaceBackend) {
        return S_OK;
    }

    const HRESULT createResult = m_rgbaSurfaceBackend->CreateVideoTexture(width, height);
    if (FAILED(createResult)) {
        return createResult;
    }

    // Record the dimensions only once the texture has actually been created.
    m_videoWidth = width;
    m_videoHeight = height;
    return S_OK;
}
ID3D12Resource* D3D12VideoRenderer::GetRGBATextureForCUDAInterop() const {
if (m_rgbaSurfaceBackend) {
return m_rgbaSurfaceBackend->GetVideoTexture();
@@ -383,13 +404,24 @@ HRESULT D3D12VideoRenderer::CreateCommandObjects() {
}
}
return m_device->CreateCommandList(
HRESULT hr = m_device->CreateCommandList(
0,
D3D12_COMMAND_LIST_TYPE_DIRECT,
m_commandAllocators[0].Get(),
nullptr,
IID_PPV_ARGS(&m_commandList)
);
if (FAILED(hr)) {
return hr;
}
// Close command list after creation (it starts in recording state)
hr = m_commandList->Close();
if (FAILED(hr)) {
return hr;
}
return S_OK;
}
HRESULT D3D12VideoRenderer::CreateSynchronizationObjects() {

View File

@@ -55,6 +55,9 @@ public:
ID3D12Device* GetD3D12Device() const { return m_device.Get(); }
ID3D12CommandQueue* GetCommandQueue() const { return m_commandQueue.Get(); }
// Prepare video texture before decoding.
// Returns E_NOT_VALID_STATE when the renderer is not initialized, the backend's
// failure HRESULT when texture creation fails, and S_OK otherwise.
HRESULT PrepareVideoTexture(uint32_t width, uint32_t height);
// Backend-specific texture access for CUDA interop
ID3D12Resource* GetRGBATextureForCUDAInterop() const;
ID3D12Resource* GetNV12TextureForCUDAInterop() const { return nullptr; } // Future: NV12DirectBackend

View File

@@ -904,34 +904,41 @@ int CUDAAPI NVDECAV1Decoder::HandlePictureDecode(void* user_data, CUVIDPICPARAMS
DecodeSlot& slot = decoder->m_ringBuffer[slot_idx];
// Find pending submission context using most recent submission_id
// cuvidParseVideoData is SYNCHRONOUS - the callback is for the packet we just submitted
// Therefore, m_submissionCounter - 1 is the submission_id for THIS packet
// Find pending submission context for this decode slot
// CRITICAL: Search for the HIGHEST submission_id (most recent packet being processed)
// This handles the case where cuvidParseVideoData callbacks may execute asynchronously
uint64_t submission_id = 0;
size_t pending_idx = 0;
bool found = false;
{
std::lock_guard<std::mutex> lock(decoder->m_submissionMutex);
// Get the most recent submission (the one that triggered this callback)
uint64_t current_submission_id = decoder->m_submissionCounter.load() - 1;
pending_idx = current_submission_id % RING_BUFFER_SIZE;
// Search backwards from current submission counter to find most recent active pending
uint64_t current_counter = decoder->m_submissionCounter.load();
uint64_t max_submission_id = 0;
bool found_any = false;
auto& pending = decoder->m_pendingSubmissions[pending_idx];
for (size_t i = 0; i < RING_BUFFER_SIZE; i++) {
auto& pending = decoder->m_pendingSubmissions[i];
// Verify this pending submission is in use and matches the slot
if (pending.in_use.load()) {
// Copy pending submission context to decode slot
slot.target_surface = pending.target_surface;
slot.surface_type = pending.surface_type;
slot.submission_id = pending.submission_id;
submission_id = pending.submission_id;
if (pending.in_use.load()) {
// Found an active pending submission
if (!found_any || pending.submission_id > max_submission_id) {
// This is the newest one so far
slot.target_surface = pending.target_surface;
slot.surface_type = pending.surface_type;
slot.submission_id = pending.submission_id;
submission_id = pending.submission_id;
pending_idx = i;
max_submission_id = pending.submission_id;
found_any = true;
found = true;
}
}
}
// Release pending slot for reuse
pending.in_use.store(false);
found = true;
} else {
LOGF_ERROR("[HandlePictureDecode] Pending submission slot %zu not in use!", pending_idx);
if (!found) {
LOGF_ERROR("[HandlePictureDecode] No active pending submission found for slot %d", slot_idx);
}
}
@@ -1181,12 +1188,11 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
LOGF_DEBUG("[DecodeToSurface] Allocated submission_id=%llu, pending_idx=%zu",
my_submission_id, pending_idx);
// 2. Wait if pending slot is still in use (overflow protection)
while (m_pendingSubmissions[pending_idx].in_use.load()) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
// 3. Store submission context in ring buffer slot
// 2. Store submission context in ring buffer slot (overwrite old data)
// No need to wait - ring buffer naturally cycles after 16 submissions
// Old pending submissions will be overwritten, which is safe because:
// - Decode slots already have their copy of pending data
// - 16 slots is enough buffer for B-frame reordering
{
std::lock_guard<std::mutex> lock(m_submissionMutex);
auto& pending = m_pendingSubmissions[pending_idx];
@@ -1194,7 +1200,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
pending.target_surface = target_surface;
pending.surface_type = target_type;
pending.submission_id = my_submission_id;
pending.in_use.store(true);
pending.in_use.store(true); // Mark as active for HandlePictureDecode search
}
LOGF_DEBUG("[DecodeToSurface] Prepared submission_id=%llu, pending_idx=%zu",
@@ -1212,7 +1218,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
my_submission_id);
CUresult result = cuvidParseVideoData(m_parser, &packet);
// cuvidParseVideoData is SYNCHRONOUS - HandlePictureDecode called before return
// cuvidParseVideoData is SYNCHRONOUS - all callbacks execute before return
if (result != CUDA_SUCCESS) {
LOGF_ERROR("[DecodeToSurface] cuvidParseVideoData failed with code %d", result);
@@ -1227,7 +1233,14 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
return false;
}
LOGF_DEBUG("[DecodeToSurface] Packet submitted, callback completed");
LOGF_DEBUG("[DecodeToSurface] Packet submitted, synchronous callbacks completed");
// IMPORTANT: Do NOT release pending submission here!
// Even though cuvidParseVideoData is documented as synchronous, NVDEC's B-frame
// reordering means callbacks from THIS packet may execute during FUTURE packets.
// Pending submissions will naturally be overwritten when ring buffer wraps (16 slots).
LOGF_DEBUG("[DecodeToSurface] Keeping pending_idx=%zu active (will be reused after %d submissions)",
pending_idx, RING_BUFFER_SIZE);
// ===== Component 4: Wait and Retrieve =====
// 5. Find which slot NVDEC used (check all slots for our submission_id)
@@ -1240,8 +1253,13 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
}
if (my_slot_idx == -1) {
LOGF_ERROR("[DecodeToSurface] Failed to find slot for submission_id=%llu", my_submission_id);
return false;
// Display-only packet: HandlePictureDisplay was called without HandlePictureDecode
// This happens when a packet only triggers display of a previously decoded frame
// No new frame was decoded, so we return false to indicate no frame is available
LOGF_DEBUG("[DecodeToSurface] Display-only packet (no decode) for submission_id=%llu - returning false", my_submission_id);
m_returnCounter.fetch_add(1); // Advance counter to unblock FIFO queue
return false; // No frame decoded - caller should use previous frame
}
DecodeSlot& my_slot = m_ringBuffer[my_slot_idx];
@@ -1254,14 +1272,27 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
LOGF_DEBUG("[DecodeToSurface] My turn! submission_id=%llu", my_submission_id);
// 7. Wait for decode to complete
// 7. Wait for decode to complete with adaptive timeout based on resolution
{
std::unique_lock<std::mutex> lock(my_slot.slot_mutex);
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(500),
// Adaptive timeout: base 500ms for 720p, scale by pixel count
// 720p (1280x720 = 921,600 pixels) -> 500ms
// 1080p (1920x1080 = 2,073,600 pixels) -> 1,125ms
// 4K (3840x2160 = 8,294,400 pixels) -> 4,500ms
const uint64_t base_pixels = 1280 * 720; // 720p reference
const uint64_t base_timeout_ms = 500;
const uint64_t current_pixels = static_cast<uint64_t>(m_width) * m_height;
const uint64_t timeout_ms = std::max<uint64_t>(base_timeout_ms,
(current_pixels * base_timeout_ms) / base_pixels);
LOGF_DEBUG("[DecodeToSurface] Adaptive timeout: %llums for %dx%d (%llu pixels)",
timeout_ms, m_width, m_height, current_pixels);
if (!my_slot.frame_ready.wait_for(lock, std::chrono::milliseconds(timeout_ms),
[&my_slot]() { return my_slot.is_ready.load(); })) {
// Timeout - decode took too long
LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d", my_slot_idx);
LOGF_ERROR("[DecodeToSurface] Decode timeout for slot %d after %llums", my_slot_idx, timeout_ms);
my_slot.in_use.store(false);
m_returnCounter.fetch_add(1); // Skip to avoid deadlock
return false;
@@ -1443,6 +1474,8 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_
LOGF_DEBUG("[DecodeToSurface] Released slot %d", my_slot_idx);
// Note: the pending submission is intentionally NOT released here (nor right after cuvidParseVideoData); its slot stays active until the pending ring buffer wraps and overwrites it.
// 10. Advance return counter (FIFO order)
m_returnCounter.fetch_add(1);