# CUDA Surface Object Refactoring - COMPLETED ✅
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#include "pch.h"
|
||||
#include "D3D12SurfaceHandler.h"
|
||||
#include "ExternalMemoryCache.h"
|
||||
#include "rgba_surface_write_kernel_ptx.h"
|
||||
#include "../Common/VavCoreLogger.h"
|
||||
|
||||
namespace VavCore {
|
||||
@@ -9,12 +10,25 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
|
||||
: m_device(device)
|
||||
, m_cudaContext(cuda_context)
|
||||
, m_cache(std::make_unique<ExternalMemoryCache>(device, cuda_context))
|
||||
, m_surfaceWriteModule(nullptr)
|
||||
, m_surfaceWriteKernel(nullptr)
|
||||
{
|
||||
// Load surface write kernel
|
||||
if (!LoadSurfaceWriteKernel()) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Created");
|
||||
}
|
||||
|
||||
D3D12SurfaceHandler::~D3D12SurfaceHandler()
|
||||
{
|
||||
// Unload surface write kernel
|
||||
if (m_surfaceWriteModule) {
|
||||
cuModuleUnload(m_surfaceWriteModule);
|
||||
m_surfaceWriteModule = nullptr;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Destroyed");
|
||||
}
|
||||
|
||||
@@ -236,82 +250,106 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get CUDA device pointer for D3D12 texture
|
||||
CUdeviceptr dst_ptr;
|
||||
if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to get CUDA pointer for RGBA texture");
|
||||
// Get CUDA surface object for D3D12 texture (NEW: using surface instead of linear buffer)
|
||||
cudaSurfaceObject_t dst_surface;
|
||||
if (!m_cache->GetOrCreateSurfaceObject(dst_texture, &dst_surface)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to get surface object for RGBA texture");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get D3D12 texture layout (ROW_MAJOR should have predictable pitch)
|
||||
D3D12_RESOURCE_DESC desc = dst_texture->GetDesc();
|
||||
D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout;
|
||||
UINT num_rows;
|
||||
UINT64 row_size, total_size;
|
||||
m_device->GetCopyableFootprints(&desc, 0, 1, 0, &layout, &num_rows, &row_size, &total_size);
|
||||
|
||||
UINT dst_pitch = layout.Footprint.RowPitch;
|
||||
uint32_t src_pitch = width * 4; // RGBA: 4 bytes per pixel
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame DEBUG: width=%u, height=%u", width, height);
|
||||
LOGF_DEBUG(" src_pitch=%u, dst_pitch=%u", src_pitch, dst_pitch);
|
||||
LOGF_DEBUG(" D3D12 texture desc: Width=%llu, Height=%u, Format=%d",
|
||||
desc.Width, desc.Height, desc.Format);
|
||||
LOGF_DEBUG(" Footprint: Width=%u, Height=%u, Depth=%u, RowPitch=%u",
|
||||
layout.Footprint.Width, layout.Footprint.Height,
|
||||
layout.Footprint.Depth, layout.Footprint.RowPitch);
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame via surface: width=%u, height=%u, surface=0x%llX",
|
||||
width, height, (unsigned long long)dst_surface);
|
||||
|
||||
// Direct RGBA copy using cudaMemcpy2D (zero extra conversion needed)
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
(void*)dst_ptr, dst_pitch, // Destination: D3D12 texture with pitch
|
||||
(void*)src_rgba, src_pitch, // Source: CUDA RGBA buffer
|
||||
width * 4, // Width in bytes (4 bytes per pixel)
|
||||
height, // Height in rows
|
||||
cudaMemcpyDeviceToDevice
|
||||
);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] RGBA copy failed: %s", cudaGetErrorString(err));
|
||||
// Use surface write kernel to handle tiled layout automatically
|
||||
if (!CopyRGBAToSurfaceViaKernel(dst_surface, src_rgba, width, height, src_pitch)) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to copy RGBA via surface kernel");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture (%ux%u)", width, height);
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture via surface (%ux%u)", width, height);
|
||||
|
||||
// DEBUG: Read back D3D12 texture via CUDA to verify orientation
|
||||
static int d3d12_debug_count = 0;
|
||||
if (d3d12_debug_count < 1) {
|
||||
// Sample first 32x32 pixels from D3D12 texture
|
||||
const int sample_size = 32;
|
||||
uint8_t* cpu_sample = new uint8_t[sample_size * sample_size * 4];
|
||||
|
||||
cudaError_t err = cudaMemcpy2D(
|
||||
cpu_sample, sample_size * 4,
|
||||
(void*)dst_ptr, dst_pitch,
|
||||
sample_size * 4, sample_size,
|
||||
cudaMemcpyDeviceToHost
|
||||
);
|
||||
|
||||
if (err == cudaSuccess) {
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] D3D12 texture sample (32x32) via CUDA:");
|
||||
for (int y = 0; y < 4; y++) {
|
||||
char line[1024];
|
||||
int pos = 0;
|
||||
pos += snprintf(line + pos, sizeof(line) - pos, " y=%d: ", y);
|
||||
for (int x = 0; x < 16; x++) {
|
||||
uint8_t r = cpu_sample[(y * sample_size + x) * 4 + 0];
|
||||
pos += snprintf(line + pos, sizeof(line) - pos, "%3d ", r);
|
||||
}
|
||||
LOGF_DEBUG("%s", line);
|
||||
}
|
||||
} else {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to sample D3D12 texture: %s", cudaGetErrorString(err));
|
||||
}
|
||||
|
||||
delete[] cpu_sample;
|
||||
d3d12_debug_count++;
|
||||
}
|
||||
// NOTE: Debug sampling removed because surface objects don't expose linear pointers
|
||||
// To verify output, save BMP files in RedSurfaceNVDECTest
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
|
||||
{
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");
|
||||
|
||||
// Load PTX module from embedded string (no file I/O needed!)
|
||||
CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get kernel function
|
||||
result = cuModuleGetFunction(&m_surfaceWriteKernel, m_surfaceWriteModule, "WriteSurfaceFromBuffer_Kernel");
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Failed to get kernel function: %d", result);
|
||||
cuModuleUnload(m_surfaceWriteModule);
|
||||
m_surfaceWriteModule = nullptr;
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully from embedded PTX");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool D3D12SurfaceHandler::CopyRGBAToSurfaceViaKernel(cudaSurfaceObject_t dst_surface,
|
||||
CUdeviceptr src_rgba,
|
||||
uint32_t width, uint32_t height, uint32_t src_pitch)
|
||||
{
|
||||
if (!m_surfaceWriteKernel) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Surface write kernel not loaded");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Launch parameters: 16x16 thread blocks
|
||||
dim3 block_size(16, 16);
|
||||
dim3 grid_size((width + 15) / 16, (height + 15) / 16);
|
||||
|
||||
// Kernel arguments
|
||||
void* kernel_args[] = {
|
||||
&src_rgba,
|
||||
&dst_surface,
|
||||
&width,
|
||||
&height,
|
||||
&src_pitch
|
||||
};
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Launching kernel: grid=(%u,%u), block=(%u,%u)",
|
||||
grid_size.x, grid_size.y, block_size.x, block_size.y);
|
||||
|
||||
// Launch kernel
|
||||
CUresult result = cuLaunchKernel(
|
||||
m_surfaceWriteKernel,
|
||||
grid_size.x, grid_size.y, 1,
|
||||
block_size.x, block_size.y, 1,
|
||||
0, // Shared memory
|
||||
0, // Stream
|
||||
kernel_args,
|
||||
nullptr
|
||||
);
|
||||
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Kernel launch failed: %d", result);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Synchronize to ensure kernel completes
|
||||
cudaError_t err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[D3D12SurfaceHandler] Kernel synchronization failed: %s", cudaGetErrorString(err));
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel completed successfully");
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace VavCore
|
||||
|
||||
@@ -63,10 +63,22 @@ private:
|
||||
CUdeviceptr dst, uint32_t dst_pitch,
|
||||
uint32_t width, uint32_t height);
|
||||
|
||||
// Load surface write kernel from PTX
|
||||
bool LoadSurfaceWriteKernel();
|
||||
|
||||
// Copy RGBA buffer to D3D12 surface using CUDA kernel
|
||||
bool CopyRGBAToSurfaceViaKernel(cudaSurfaceObject_t dst_surface,
|
||||
CUdeviceptr src_rgba,
|
||||
uint32_t width, uint32_t height, uint32_t src_pitch);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
std::unique_ptr<ExternalMemoryCache> m_cache;
|
||||
|
||||
// Surface write kernel
|
||||
CUmodule m_surfaceWriteModule;
|
||||
CUfunction m_surfaceWriteKernel;
|
||||
};
|
||||
|
||||
} // namespace VavCore
|
||||
|
||||
@@ -39,15 +39,56 @@ bool ExternalMemoryCache::GetOrCreateExternalMemory(ID3D12Resource* resource, CU
|
||||
}
|
||||
|
||||
// Add to cache
|
||||
CachedEntry entry;
|
||||
CachedEntry entry = {};
|
||||
entry.external_memory = external_memory;
|
||||
entry.device_ptr = device_ptr;
|
||||
entry.size = 0; // Size is stored but not used currently
|
||||
entry.is_texture = false; // This is a buffer resource
|
||||
|
||||
m_cache[resource] = entry;
|
||||
*out_ptr = device_ptr;
|
||||
|
||||
LOGF_DEBUG("[ExternalMemoryCache] New resource cached");
|
||||
LOGF_DEBUG("[ExternalMemoryCache] New buffer resource cached");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ExternalMemoryCache::GetOrCreateSurfaceObject(ID3D12Resource* resource, cudaSurfaceObject_t* out_surface)
|
||||
{
|
||||
if (!resource || !out_surface) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check cache first
|
||||
auto it = m_cache.find(resource);
|
||||
if (it != m_cache.end()) {
|
||||
if (!it->second.is_texture) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] Resource was cached as buffer, not texture");
|
||||
return false;
|
||||
}
|
||||
*out_surface = it->second.surface;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Not in cache, import texture resource
|
||||
cudaExternalMemory_t external_memory;
|
||||
cudaMipmappedArray_t mipmapped_array;
|
||||
cudaSurfaceObject_t surface;
|
||||
|
||||
if (!ImportD3D12TextureAsSurface(resource, &external_memory, &mipmapped_array, &surface)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Add to cache
|
||||
CachedEntry entry = {};
|
||||
entry.external_memory = external_memory;
|
||||
entry.mipmapped_array = mipmapped_array;
|
||||
entry.surface = surface;
|
||||
entry.is_texture = true;
|
||||
|
||||
m_cache[resource] = entry;
|
||||
*out_surface = surface;
|
||||
|
||||
LOGF_DEBUG("[ExternalMemoryCache] New texture resource cached");
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -58,6 +99,16 @@ void ExternalMemoryCache::Release(ID3D12Resource* resource)
|
||||
return;
|
||||
}
|
||||
|
||||
// Clean up based on resource type
|
||||
if (it->second.is_texture) {
|
||||
if (it->second.surface) {
|
||||
cudaDestroySurfaceObject(it->second.surface);
|
||||
}
|
||||
if (it->second.mipmapped_array) {
|
||||
cudaFreeMipmappedArray(it->second.mipmapped_array);
|
||||
}
|
||||
}
|
||||
|
||||
// Destroy external memory
|
||||
cudaDestroyExternalMemory(it->second.external_memory);
|
||||
m_cache.erase(it);
|
||||
@@ -68,6 +119,15 @@ void ExternalMemoryCache::Release(ID3D12Resource* resource)
|
||||
void ExternalMemoryCache::ReleaseAll()
|
||||
{
|
||||
for (auto& pair : m_cache) {
|
||||
// Clean up based on resource type
|
||||
if (pair.second.is_texture) {
|
||||
if (pair.second.surface) {
|
||||
cudaDestroySurfaceObject(pair.second.surface);
|
||||
}
|
||||
if (pair.second.mipmapped_array) {
|
||||
cudaFreeMipmappedArray(pair.second.mipmapped_array);
|
||||
}
|
||||
}
|
||||
cudaDestroyExternalMemory(pair.second.external_memory);
|
||||
}
|
||||
m_cache.clear();
|
||||
@@ -138,4 +198,94 @@ bool ExternalMemoryCache::ImportD3D12Resource(ID3D12Resource* resource,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ExternalMemoryCache::ImportD3D12TextureAsSurface(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
cudaMipmappedArray_t* out_mipmap,
|
||||
cudaSurfaceObject_t* out_surface)
|
||||
{
|
||||
// Step 1: Create shared handle for D3D12 resource
|
||||
HANDLE shared_handle = nullptr;
|
||||
HRESULT hr = m_device->CreateSharedHandle(resource, nullptr, GENERIC_ALL, nullptr, &shared_handle);
|
||||
|
||||
if (FAILED(hr)) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] CreateSharedHandle failed: 0x%08X", hr);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 2: Get resource allocation info
|
||||
D3D12_RESOURCE_DESC desc = resource->GetDesc();
|
||||
D3D12_RESOURCE_ALLOCATION_INFO alloc_info = m_device->GetResourceAllocationInfo(0, 1, &desc);
|
||||
|
||||
LOGF_DEBUG("[ExternalMemoryCache] Texture resource: %llux%u, format=%d",
|
||||
desc.Width, desc.Height, desc.Format);
|
||||
|
||||
// Step 3: Import as external memory
|
||||
cudaExternalMemoryHandleDesc mem_desc = {};
|
||||
mem_desc.type = cudaExternalMemoryHandleTypeD3D12Resource;
|
||||
mem_desc.handle.win32.handle = shared_handle;
|
||||
mem_desc.handle.win32.name = nullptr;
|
||||
mem_desc.size = alloc_info.SizeInBytes;
|
||||
mem_desc.flags = cudaExternalMemoryDedicated;
|
||||
|
||||
cudaExternalMemory_t external_memory;
|
||||
cudaError_t err = cudaImportExternalMemory(&external_memory, &mem_desc);
|
||||
|
||||
CloseHandle(shared_handle);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] cudaImportExternalMemory failed: %s", cudaGetErrorString(err));
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 4: Get mipmapped array from external memory
|
||||
cudaExternalMemoryMipmappedArrayDesc mipmap_desc = {};
|
||||
mipmap_desc.extent = make_cudaExtent(desc.Width, desc.Height, 0);
|
||||
mipmap_desc.formatDesc = cudaCreateChannelDesc<uchar4>(); // RGBA8 (R8G8B8A8_UNORM)
|
||||
mipmap_desc.numLevels = 1;
|
||||
mipmap_desc.flags = cudaArraySurfaceLoadStore; // Enable surface writes
|
||||
|
||||
cudaMipmappedArray_t mipmapped_array;
|
||||
err = cudaExternalMemoryGetMappedMipmappedArray(&mipmapped_array, external_memory, &mipmap_desc);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] cudaExternalMemoryGetMappedMipmappedArray failed: %s", cudaGetErrorString(err));
|
||||
cudaDestroyExternalMemory(external_memory);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 5: Get level 0 array
|
||||
cudaArray_t array;
|
||||
err = cudaGetMipmappedArrayLevel(&array, mipmapped_array, 0);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] cudaGetMipmappedArrayLevel failed: %s", cudaGetErrorString(err));
|
||||
cudaFreeMipmappedArray(mipmapped_array);
|
||||
cudaDestroyExternalMemory(external_memory);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Step 6: Create surface object
|
||||
cudaResourceDesc res_desc = {};
|
||||
res_desc.resType = cudaResourceTypeArray;
|
||||
res_desc.res.array.array = array;
|
||||
|
||||
cudaSurfaceObject_t surface;
|
||||
err = cudaCreateSurfaceObject(&surface, &res_desc);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
LOGF_ERROR("[ExternalMemoryCache] cudaCreateSurfaceObject failed: %s", cudaGetErrorString(err));
|
||||
cudaFreeMipmappedArray(mipmapped_array);
|
||||
cudaDestroyExternalMemory(external_memory);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOGF_DEBUG("[ExternalMemoryCache] Successfully created surface object for D3D12 texture (surface=0x%llX)", (unsigned long long)surface);
|
||||
|
||||
*out_ext_mem = external_memory;
|
||||
*out_mipmap = mipmapped_array;
|
||||
*out_surface = surface;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace VavCore
|
||||
|
||||
@@ -14,10 +14,14 @@ public:
|
||||
ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context);
|
||||
~ExternalMemoryCache();
|
||||
|
||||
// Get or create CUDA device pointer for D3D12 resource
|
||||
// Get or create CUDA device pointer for D3D12 buffer resource
|
||||
// Returns true on success, false on failure
|
||||
bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr);
|
||||
|
||||
// Get or create CUDA surface object for D3D12 texture resource
|
||||
// Returns true on success, false on failure
|
||||
bool GetOrCreateSurfaceObject(ID3D12Resource* resource, cudaSurfaceObject_t* out_surface);
|
||||
|
||||
// Release specific resource from cache
|
||||
void Release(ID3D12Resource* resource);
|
||||
|
||||
@@ -30,15 +34,25 @@ public:
|
||||
private:
|
||||
struct CachedEntry {
|
||||
cudaExternalMemory_t external_memory;
|
||||
CUdeviceptr device_ptr;
|
||||
CUdeviceptr device_ptr; // For buffer resources
|
||||
cudaMipmappedArray_t mipmapped_array; // For texture resources
|
||||
cudaArray_t array; // Level 0 of mipmapped array
|
||||
cudaSurfaceObject_t surface; // Surface object for texture writes
|
||||
size_t size;
|
||||
bool is_texture; // True if texture, false if buffer
|
||||
};
|
||||
|
||||
// Import D3D12 resource as CUDA external memory
|
||||
// Import D3D12 buffer resource as CUDA external memory (linear buffer)
|
||||
bool ImportD3D12Resource(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
CUdeviceptr* out_ptr);
|
||||
|
||||
// Import D3D12 texture resource as CUDA surface object (tiled texture)
|
||||
bool ImportD3D12TextureAsSurface(ID3D12Resource* resource,
|
||||
cudaExternalMemory_t* out_ext_mem,
|
||||
cudaMipmappedArray_t* out_mipmap,
|
||||
cudaSurfaceObject_t* out_surface);
|
||||
|
||||
private:
|
||||
ID3D12Device* m_device;
|
||||
CUcontext m_cudaContext;
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
// CUDA kernel to write RGBA buffer to surface object
|
||||
// This handles the tiled texture layout automatically
|
||||
|
||||
extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
|
||||
const unsigned char* __restrict__ rgba_buffer,
|
||||
cudaSurfaceObject_t rgba_surface,
|
||||
unsigned int width,
|
||||
unsigned int height,
|
||||
unsigned int pitch
|
||||
)
|
||||
{
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x >= width || y >= height) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Read from linear buffer
|
||||
int idx = y * pitch + x * 4;
|
||||
uchar4 rgba_pixel = make_uchar4(
|
||||
rgba_buffer[idx + 0], // R
|
||||
rgba_buffer[idx + 1], // G
|
||||
rgba_buffer[idx + 2], // B
|
||||
rgba_buffer[idx + 3] // A
|
||||
);
|
||||
|
||||
// Write to tiled surface (handles layout automatically)
|
||||
surf2Dwrite(rgba_pixel, rgba_surface, x * sizeof(uchar4), y);
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
#pragma once
|
||||
|
||||
// Embedded PTX for RGBA surface write kernel
|
||||
// Generated from rgba_surface_write_kernel.cu
|
||||
|
||||
namespace VavCore {
|
||||
|
||||
const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
|
||||
//
|
||||
// Generated by NVIDIA NVVM Compiler
|
||||
//
|
||||
// Compiler Build ID: CL-36424714
|
||||
// Cuda compilation tools, release 13.0, V13.0.88
|
||||
// Based on NVVM 7.0.1
|
||||
//
|
||||
|
||||
.version 9.0
|
||||
.target sm_75
|
||||
.address_size 64
|
||||
|
||||
// .globl WriteSurfaceFromBuffer_Kernel
|
||||
|
||||
.visible .entry WriteSurfaceFromBuffer_Kernel(
|
||||
.param .u64 WriteSurfaceFromBuffer_Kernel_param_0,
|
||||
.param .u64 WriteSurfaceFromBuffer_Kernel_param_1,
|
||||
.param .u32 WriteSurfaceFromBuffer_Kernel_param_2,
|
||||
.param .u32 WriteSurfaceFromBuffer_Kernel_param_3,
|
||||
.param .u32 WriteSurfaceFromBuffer_Kernel_param_4
|
||||
)
|
||||
{
|
||||
.reg .pred %p<4>;
|
||||
.reg .b16 %rs<9>;
|
||||
.reg .b32 %r<14>;
|
||||
.reg .b64 %rd<6>;
|
||||
|
||||
|
||||
ld.param.u64 %rd1, [WriteSurfaceFromBuffer_Kernel_param_0];
|
||||
ld.param.u64 %rd2, [WriteSurfaceFromBuffer_Kernel_param_1];
|
||||
ld.param.u32 %r4, [WriteSurfaceFromBuffer_Kernel_param_2];
|
||||
ld.param.u32 %r5, [WriteSurfaceFromBuffer_Kernel_param_3];
|
||||
ld.param.u32 %r3, [WriteSurfaceFromBuffer_Kernel_param_4];
|
||||
mov.u32 %r6, %ntid.x;
|
||||
mov.u32 %r7, %ctaid.x;
|
||||
mov.u32 %r8, %tid.x;
|
||||
mad.lo.s32 %r1, %r7, %r6, %r8;
|
||||
mov.u32 %r9, %ntid.y;
|
||||
mov.u32 %r10, %ctaid.y;
|
||||
mov.u32 %r11, %tid.y;
|
||||
mad.lo.s32 %r2, %r10, %r9, %r11;
|
||||
setp.ge.u32 %p1, %r1, %r4;
|
||||
setp.ge.u32 %p2, %r2, %r5;
|
||||
or.pred %p3, %p1, %p2;
|
||||
@%p3 bra $L__BB0_2;
|
||||
|
||||
shl.b32 %r12, %r1, 2;
|
||||
mad.lo.s32 %r13, %r2, %r3, %r12;
|
||||
cvt.s64.s32 %rd3, %r13;
|
||||
cvta.to.global.u64 %rd4, %rd1;
|
||||
add.s64 %rd5, %rd4, %rd3;
|
||||
ld.global.nc.u8 %rs1, [%rd5];
|
||||
ld.global.nc.u8 %rs3, [%rd5+1];
|
||||
ld.global.nc.u8 %rs5, [%rd5+2];
|
||||
ld.global.nc.u8 %rs7, [%rd5+3];
|
||||
sust.b.2d.v4.b8.trap [%rd2, {%r12, %r2}], {%rs1, %rs3, %rs5, %rs7};
|
||||
|
||||
$L__BB0_2:
|
||||
ret;
|
||||
|
||||
}
|
||||
)PTX";
|
||||
|
||||
} // namespace VavCore
|
||||
Reference in New Issue
Block a user