From e75d565ba2f803a5939d0e56b445268f52f8402a Mon Sep 17 00:00:00 2001 From: ened Date: Mon, 6 Oct 2025 07:53:14 +0900 Subject: [PATCH] WIP --- .claude/settings.local.json | 8 +- .../src/Decoder/D3D12SurfaceHandler.cpp | 43 ++++++++ .../src/Decoder/NV12ToRGBAConverter.cpp | 103 ++++++++++++++++++ .../vavcore/src/Decoder/NVDECAV1Decoder.cpp | 1 + 4 files changed, 154 insertions(+), 1 deletion(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 9c659a1..ca28cc2 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -108,7 +108,13 @@ "Bash(\"./compile_validation_kernel.bat\")", "Bash(./compile_validation_kernel.bat)", "Bash(\"./bin/Debug/RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_720p_stripe.webm\")", - "Bash(\"./bin/Debug/RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_1080p_stripe.webm\")" + "Bash(\"./bin/Debug/RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_1080p_stripe.webm\")", + "Bash(\"./RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_720p_stripe.webm\")", + "Bash(ffmpeg:*)", + "Bash(\"/c/Program Files/Microsoft Visual Studio/2022/Community/MSBuild/Current/Bin/MSBuild.exe\" RedSurfaceNVDECTest.vcxproj //p:Configuration=Debug //p:Platform=x64 //v:minimal)", + "Bash(\"./bin/Debug/RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_4px_stripe_av1.webm\")", + "Bash(\"./bin/Debug/RedSurfaceNVDECTest.exe\" \"D:/Project/video-av1/sample/test_4px_stripe_720p_av1.webm\")", + "Bash(ffprobe:*)" ], "deny": [], "ask": [] diff --git a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp index d5e79f5..76200ca 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp @@ -253,6 +253,14 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba, UINT dst_pitch = layout.Footprint.RowPitch; uint32_t src_pitch = width * 4; // RGBA: 4 bytes per pixel + LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame DEBUG: width=%u, height=%u", width, height); + LOGF_DEBUG(" src_pitch=%u, dst_pitch=%u", src_pitch, dst_pitch); + LOGF_DEBUG(" D3D12 texture desc: Width=%llu, Height=%u, Format=%d", + desc.Width, desc.Height, desc.Format); + LOGF_DEBUG(" Footprint: Width=%u, Height=%u, Depth=%u, RowPitch=%u", + layout.Footprint.Width, layout.Footprint.Height, + layout.Footprint.Depth, layout.Footprint.RowPitch); + // Direct RGBA copy using cudaMemcpy2D (zero extra conversion needed) cudaError_t err = cudaMemcpy2D( (void*)dst_ptr, dst_pitch, // Destination: D3D12 texture with pitch @@ -268,6 +276,41 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba, } LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture (%ux%u)", width, height); + + // DEBUG: Read back D3D12 texture via CUDA to verify orientation + static int d3d12_debug_count = 0; + if (d3d12_debug_count < 1) { + // Sample first 32x32 pixels from D3D12 texture + const int sample_size = 32; + uint8_t* cpu_sample = new uint8_t[sample_size * sample_size * 4]; + + cudaError_t err = cudaMemcpy2D( + cpu_sample, sample_size * 4, + (void*)dst_ptr, dst_pitch, + sample_size * 4, sample_size, + cudaMemcpyDeviceToHost + ); + + if (err == cudaSuccess) { + LOGF_DEBUG("[D3D12SurfaceHandler] D3D12 texture sample (32x32) via CUDA:"); + for (int y = 0; y < 4; y++) { + char line[1024]; + int pos = 0; + pos += snprintf(line + pos, sizeof(line) - pos, " y=%d: ", y); + for (int x = 0; x < 16; x++) { + uint8_t r = cpu_sample[(y * sample_size + x) * 4 + 0]; + pos += snprintf(line + pos, sizeof(line) - pos, "%3d ", r); + } + LOGF_DEBUG("%s", line); + } + } else { + LOGF_ERROR("[D3D12SurfaceHandler] Failed to sample D3D12 texture: %s", cudaGetErrorString(err)); + } + + delete[] cpu_sample; + d3d12_debug_count++; + } + return true; } diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NV12ToRGBAConverter.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NV12ToRGBAConverter.cpp index 2abcb89..d1c18d0 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NV12ToRGBAConverter.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NV12ToRGBAConverter.cpp @@ -115,6 +115,7 @@ $L__BB0_2: ret; } + )PTX"; namespace VavCore { @@ -223,6 +224,43 @@ bool NV12ToRGBAConverter::ConvertNV12ToRGBA(CUdeviceptr nv12_ptr, uint32_t nv12_ dim3 gridDim((m_width + blockDim.x - 1) / blockDim.x, (m_height + blockDim.y - 1) / blockDim.y); + // DEBUG: Sample Y plane data to check orientation + static int debug_frame_count = 0; + if (debug_frame_count < 3) { + // Copy first 32x32 pixels of Y plane to CPU + const int sample_size = 32; + uint8_t* cpu_buffer = new uint8_t[sample_size * sample_size]; + + cudaError_t err = cudaMemcpy2D( + cpu_buffer, sample_size, + (void*)y_plane, nv12_pitch, + sample_size, sample_size, + cudaMemcpyDeviceToHost + ); + + if (err == cudaSuccess) { + LOGF_DEBUG("[NV12ToRGBAConverter] Frame %d - Y plane sample (32x32):", debug_frame_count); + LOGF_DEBUG(" m_width=%u, m_height=%u, nv12_pitch=%u", m_width, m_height, nv12_pitch); + + // Print first 32 pixels of first 4 rows + for (int y = 0; y < 4; y++) { + char line[1024]; + int pos = 0; + pos += snprintf(line + pos, sizeof(line) - pos, " y=%d: ", y); + for (int x = 0; x < 32; x++) { + uint8_t val = cpu_buffer[y * sample_size + x]; + pos += snprintf(line + pos, sizeof(line) - pos, "%3d ", val); + } + LOGF_DEBUG("%s", line); + } + } else { + LOGF_ERROR("[NV12ToRGBAConverter] Failed to copy Y plane sample: %s", cudaGetErrorString(err)); + } + + delete[] cpu_buffer; + debug_frame_count++; + } + // Setup kernel parameters void* args[] = { &y_plane, @@ -251,6 +289,71 @@ bool NV12ToRGBAConverter::ConvertNV12ToRGBA(CUdeviceptr nv12_ptr, uint32_t nv12_ return false; } + // DEBUG: Save RGBA buffer directly to BMP to verify CUDA kernel output + static int rgba_debug_count = 0; + if (rgba_debug_count < 1) { + // Copy entire RGBA buffer to CPU + size_t full_size = m_width * m_height * 4; + uint8_t* cpu_rgba = new uint8_t[full_size]; + + cudaError_t err = cudaMemcpy2D( + cpu_rgba, m_width * 4, + (void*)m_rgbaBuffer, rgba_pitch, + m_width * 4, m_height, + cudaMemcpyDeviceToHost + ); + + if (err == cudaSuccess) { + // Save directly to BMP + FILE* f = nullptr; + fopen_s(&f, "cuda_kernel_output.bmp", "wb"); + if (f) { + // BMP header + uint32_t file_size = 54 + full_size; + uint8_t header[54] = { + 'B', 'M', + (uint8_t)(file_size), (uint8_t)(file_size >> 8), (uint8_t)(file_size >> 16), (uint8_t)(file_size >> 24), + 0, 0, 0, 0, + 54, 0, 0, 0, + 40, 0, 0, 0, + (uint8_t)(m_width), (uint8_t)(m_width >> 8), (uint8_t)(m_width >> 16), (uint8_t)(m_width >> 24), + (uint8_t)(m_height), (uint8_t)(m_height >> 8), (uint8_t)(m_height >> 16), (uint8_t)(m_height >> 24), + 1, 0, + 32, 0, + 0, 0, 0, 0, + (uint8_t)(full_size), (uint8_t)(full_size >> 8), (uint8_t)(full_size >> 16), (uint8_t)(full_size >> 24), + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + }; + fwrite(header, 1, 54, f); + + // Write pixel data (bottom-up, BMP format is BGRA) + for (int y = m_height - 1; y >= 0; y--) { + for (uint32_t x = 0; x < m_width; x++) { + uint32_t idx = (y * m_width + x) * 4; + // CUDA kernel outputs RGBA, BMP expects BGRA + uint8_t bgra[4] = { + cpu_rgba[idx + 2], // B from RGBA[2] + cpu_rgba[idx + 1], // G from RGBA[1] + cpu_rgba[idx + 0], // R from RGBA[0] + cpu_rgba[idx + 3] // A from RGBA[3] + }; + fwrite(bgra, 1, 4, f); + } + } + fclose(f); + LOGF_DEBUG("[NV12ToRGBAConverter] Saved CUDA kernel output to cuda_kernel_output.bmp"); + } + } else { + LOGF_ERROR("[NV12ToRGBAConverter] Failed to copy RGBA for BMP: %s", cudaGetErrorString(err)); + } + + delete[] cpu_rgba; + rgba_debug_count++; + } + // Return pointer to RGBA buffer *out_rgba_ptr = m_rgbaBuffer; diff --git a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp index bad041a..a4bcf88 100644 --- a/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp +++ b/vav2/platforms/windows/vavcore/src/Decoder/NVDECAV1Decoder.cpp @@ -1391,6 +1391,7 @@ bool NVDECAV1Decoder::DecodeToSurface(const uint8_t* packet_data, size_t packet_ } // Copy RGBA to D3D12 texture + LOGF_DEBUG("[DecodeToSurface] Calling CopyRGBAFrame with m_width=%u, m_height=%u", m_width, m_height); copySuccess = m_d3d12Handler->CopyRGBAFrame( rgbaPtr, d3d12Resource,