# CUDA Surface Object Refactoring - COMPLETED ✅

2025-10-06 09:16:01 +09:00
parent e63bd48731
commit 73d9d8d9a6
14 changed files with 2459 additions and 1048 deletions
--- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.cpp
@@ -1,6 +1,7 @@
 #include "pch.h"
 #include "D3D12SurfaceHandler.h"
 #include "ExternalMemoryCache.h"
+#include "rgba_surface_write_kernel_ptx.h"
 #include "../Common/VavCoreLogger.h"

 namespace VavCore {
@@ -9,12 +10,25 @@ D3D12SurfaceHandler::D3D12SurfaceHandler(ID3D12Device* device, CUcontext cuda_co
    : m_device(device)
    , m_cudaContext(cuda_context)
    , m_cache(std::make_unique<ExternalMemoryCache>(device, cuda_context))
+    , m_surfaceWriteModule(nullptr)
+    , m_surfaceWriteKernel(nullptr)
 {
+    // Load surface write kernel
+    if (!LoadSurfaceWriteKernel()) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write kernel");
+    }
+
    LOGF_DEBUG("[D3D12SurfaceHandler] Created");
 }

 D3D12SurfaceHandler::~D3D12SurfaceHandler()
 {
+    // Unload surface write kernel
+    if (m_surfaceWriteModule) {
+        cuModuleUnload(m_surfaceWriteModule);
+        m_surfaceWriteModule = nullptr;
+    }
+
    LOGF_DEBUG("[D3D12SurfaceHandler] Destroyed");
 }

@@ -236,82 +250,106 @@ bool D3D12SurfaceHandler::CopyRGBAFrame(CUdeviceptr src_rgba,
        return false;
    }

-    // Get CUDA device pointer for D3D12 texture
-    CUdeviceptr dst_ptr;
-    if (!GetD3D12CUDAPointer(dst_texture, &dst_ptr)) {
-        LOGF_ERROR("[D3D12SurfaceHandler] Failed to get CUDA pointer for RGBA texture");
+    // Get CUDA surface object for D3D12 texture (NEW: using surface instead of linear buffer)
+    cudaSurfaceObject_t dst_surface;
+    if (!m_cache->GetOrCreateSurfaceObject(dst_texture, &dst_surface)) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to get surface object for RGBA texture");
        return false;
    }

-    // Get D3D12 texture layout (ROW_MAJOR should have predictable pitch)
-    D3D12_RESOURCE_DESC desc = dst_texture->GetDesc();
-    D3D12_PLACED_SUBRESOURCE_FOOTPRINT layout;
-    UINT num_rows;
-    UINT64 row_size, total_size;
-    m_device->GetCopyableFootprints(&desc, 0, 1, 0, &layout, &num_rows, &row_size, &total_size);
-
-    UINT dst_pitch = layout.Footprint.RowPitch;
    uint32_t src_pitch = width * 4;  // RGBA: 4 bytes per pixel

-    LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame DEBUG: width=%u, height=%u", width, height);
-    LOGF_DEBUG("  src_pitch=%u, dst_pitch=%u", src_pitch, dst_pitch);
-    LOGF_DEBUG("  D3D12 texture desc: Width=%llu, Height=%u, Format=%d",
-               desc.Width, desc.Height, desc.Format);
-    LOGF_DEBUG("  Footprint: Width=%u, Height=%u, Depth=%u, RowPitch=%u",
-               layout.Footprint.Width, layout.Footprint.Height,
-               layout.Footprint.Depth, layout.Footprint.RowPitch);
+    LOGF_DEBUG("[D3D12SurfaceHandler] CopyRGBAFrame via surface: width=%u, height=%u, surface=0x%llX",
+               width, height, (unsigned long long)dst_surface);

-    // Direct RGBA copy using cudaMemcpy2D (zero extra conversion needed)
-    cudaError_t err = cudaMemcpy2D(
-        (void*)dst_ptr, dst_pitch,           // Destination: D3D12 texture with pitch
-        (void*)src_rgba, src_pitch,          // Source: CUDA RGBA buffer
-        width * 4,                           // Width in bytes (4 bytes per pixel)
-        height,                              // Height in rows
-        cudaMemcpyDeviceToDevice
-    );
-
-    if (err != cudaSuccess) {
-        LOGF_ERROR("[D3D12SurfaceHandler] RGBA copy failed: %s", cudaGetErrorString(err));
+    // Use surface write kernel to handle tiled layout automatically
+    if (!CopyRGBAToSurfaceViaKernel(dst_surface, src_rgba, width, height, src_pitch)) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to copy RGBA via surface kernel");
        return false;
    }

-    LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture (%ux%u)", width, height);
+    LOGF_DEBUG("[D3D12SurfaceHandler] RGBA frame copied to D3D12 texture via surface (%ux%u)", width, height);

-    // DEBUG: Read back D3D12 texture via CUDA to verify orientation
-    static int d3d12_debug_count = 0;
-    if (d3d12_debug_count < 1) {
-        // Sample first 32x32 pixels from D3D12 texture
-        const int sample_size = 32;
-        uint8_t* cpu_sample = new uint8_t[sample_size * sample_size * 4];
-
-        cudaError_t err = cudaMemcpy2D(
-            cpu_sample, sample_size * 4,
-            (void*)dst_ptr, dst_pitch,
-            sample_size * 4, sample_size,
-            cudaMemcpyDeviceToHost
-        );
-
-        if (err == cudaSuccess) {
-            LOGF_DEBUG("[D3D12SurfaceHandler] D3D12 texture sample (32x32) via CUDA:");
-            for (int y = 0; y < 4; y++) {
-                char line[1024];
-                int pos = 0;
-                pos += snprintf(line + pos, sizeof(line) - pos, "  y=%d: ", y);
-                for (int x = 0; x < 16; x++) {
-                    uint8_t r = cpu_sample[(y * sample_size + x) * 4 + 0];
-                    pos += snprintf(line + pos, sizeof(line) - pos, "%3d ", r);
-                }
-                LOGF_DEBUG("%s", line);
-            }
-        } else {
-            LOGF_ERROR("[D3D12SurfaceHandler] Failed to sample D3D12 texture: %s", cudaGetErrorString(err));
-        }
-
-        delete[] cpu_sample;
-        d3d12_debug_count++;
-    }
+    // NOTE: Debug sampling removed because surface objects don't expose linear pointers
+    // To verify output, save BMP files in RedSurfaceNVDECTest

    return true;
 }

+bool D3D12SurfaceHandler::LoadSurfaceWriteKernel()
+{
+    LOGF_DEBUG("[D3D12SurfaceHandler] Loading PTX from embedded string");
+
+    // Load PTX module from embedded string (no file I/O needed!)
+    CUresult result = cuModuleLoadData(&m_surfaceWriteModule, RGBA_SURFACE_WRITE_KERNEL_PTX);
+    if (result != CUDA_SUCCESS) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to load surface write PTX from memory: %d", result);
+        return false;
+    }
+
+    // Get kernel function
+    result = cuModuleGetFunction(&m_surfaceWriteKernel, m_surfaceWriteModule, "WriteSurfaceFromBuffer_Kernel");
+    if (result != CUDA_SUCCESS) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Failed to get kernel function: %d", result);
+        cuModuleUnload(m_surfaceWriteModule);
+        m_surfaceWriteModule = nullptr;
+        return false;
+    }
+
+    LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel loaded successfully from embedded PTX");
+    return true;
+}
+
+bool D3D12SurfaceHandler::CopyRGBAToSurfaceViaKernel(cudaSurfaceObject_t dst_surface,
+                                                       CUdeviceptr src_rgba,
+                                                       uint32_t width, uint32_t height, uint32_t src_pitch)
+{
+    if (!m_surfaceWriteKernel) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Surface write kernel not loaded");
+        return false;
+    }
+
+    // Launch parameters: 16x16 thread blocks
+    dim3 block_size(16, 16);
+    dim3 grid_size((width + 15) / 16, (height + 15) / 16);
+
+    // Kernel arguments
+    void* kernel_args[] = {
+        &src_rgba,
+        &dst_surface,
+        &width,
+        &height,
+        &src_pitch
+    };
+
+    LOGF_DEBUG("[D3D12SurfaceHandler] Launching kernel: grid=(%u,%u), block=(%u,%u)",
+               grid_size.x, grid_size.y, block_size.x, block_size.y);
+
+    // Launch kernel
+    CUresult result = cuLaunchKernel(
+        m_surfaceWriteKernel,
+        grid_size.x, grid_size.y, 1,
+        block_size.x, block_size.y, 1,
+        0,       // Shared memory
+        0,       // Stream
+        kernel_args,
+        nullptr
+    );
+
+    if (result != CUDA_SUCCESS) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Kernel launch failed: %d", result);
+        return false;
+    }
+
+    // Synchronize to ensure kernel completes
+    cudaError_t err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        LOGF_ERROR("[D3D12SurfaceHandler] Kernel synchronization failed: %s", cudaGetErrorString(err));
+        return false;
+    }
+
+    LOGF_DEBUG("[D3D12SurfaceHandler] Surface write kernel completed successfully");
+    return true;
+}
+
 } // namespace VavCore
--- a/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/D3D12SurfaceHandler.h
@@ -63,10 +63,22 @@ private:
                    CUdeviceptr dst, uint32_t dst_pitch,
                    uint32_t width, uint32_t height);

+    // Load surface write kernel from PTX
+    bool LoadSurfaceWriteKernel();
+
+    // Copy RGBA buffer to D3D12 surface using CUDA kernel
+    bool CopyRGBAToSurfaceViaKernel(cudaSurfaceObject_t dst_surface,
+                                     CUdeviceptr src_rgba,
+                                     uint32_t width, uint32_t height, uint32_t src_pitch);
+
 private:
    ID3D12Device* m_device;
    CUcontext m_cudaContext;
    std::unique_ptr<ExternalMemoryCache> m_cache;
+
+    // Surface write kernel
+    CUmodule m_surfaceWriteModule;
+    CUfunction m_surfaceWriteKernel;
 };

 } // namespace VavCore
--- a/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.cpp
+++ b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.cpp
@@ -39,15 +39,56 @@ bool ExternalMemoryCache::GetOrCreateExternalMemory(ID3D12Resource* resource, CU
    }

    // Add to cache
-    CachedEntry entry;
+    CachedEntry entry = {};
    entry.external_memory = external_memory;
    entry.device_ptr = device_ptr;
    entry.size = 0; // Size is stored but not used currently
+    entry.is_texture = false; // This is a buffer resource

    m_cache[resource] = entry;
    *out_ptr = device_ptr;

-    LOGF_DEBUG("[ExternalMemoryCache] New resource cached");
+    LOGF_DEBUG("[ExternalMemoryCache] New buffer resource cached");
+    return true;
+}
+
+bool ExternalMemoryCache::GetOrCreateSurfaceObject(ID3D12Resource* resource, cudaSurfaceObject_t* out_surface)
+{
+    if (!resource || !out_surface) {
+        return false;
+    }
+
+    // Check cache first
+    auto it = m_cache.find(resource);
+    if (it != m_cache.end()) {
+        if (!it->second.is_texture) {
+            LOGF_ERROR("[ExternalMemoryCache] Resource was cached as buffer, not texture");
+            return false;
+        }
+        *out_surface = it->second.surface;
+        return true;
+    }
+
+    // Not in cache, import texture resource
+    cudaExternalMemory_t external_memory;
+    cudaMipmappedArray_t mipmapped_array;
+    cudaSurfaceObject_t surface;
+
+    if (!ImportD3D12TextureAsSurface(resource, &external_memory, &mipmapped_array, &surface)) {
+        return false;
+    }
+
+    // Add to cache
+    CachedEntry entry = {};
+    entry.external_memory = external_memory;
+    entry.mipmapped_array = mipmapped_array;
+    entry.surface = surface;
+    entry.is_texture = true;
+
+    m_cache[resource] = entry;
+    *out_surface = surface;
+
+    LOGF_DEBUG("[ExternalMemoryCache] New texture resource cached");
    return true;
 }

@@ -58,6 +99,16 @@ void ExternalMemoryCache::Release(ID3D12Resource* resource)
        return;
    }

+    // Clean up based on resource type
+    if (it->second.is_texture) {
+        if (it->second.surface) {
+            cudaDestroySurfaceObject(it->second.surface);
+        }
+        if (it->second.mipmapped_array) {
+            cudaFreeMipmappedArray(it->second.mipmapped_array);
+        }
+    }
+
    // Destroy external memory
    cudaDestroyExternalMemory(it->second.external_memory);
    m_cache.erase(it);
@@ -68,6 +119,15 @@ void ExternalMemoryCache::Release(ID3D12Resource* resource)
 void ExternalMemoryCache::ReleaseAll()
 {
    for (auto& pair : m_cache) {
+        // Clean up based on resource type
+        if (pair.second.is_texture) {
+            if (pair.second.surface) {
+                cudaDestroySurfaceObject(pair.second.surface);
+            }
+            if (pair.second.mipmapped_array) {
+                cudaFreeMipmappedArray(pair.second.mipmapped_array);
+            }
+        }
        cudaDestroyExternalMemory(pair.second.external_memory);
    }
    m_cache.clear();
@@ -138,4 +198,94 @@ bool ExternalMemoryCache::ImportD3D12Resource(ID3D12Resource* resource,
    return true;
 }

+bool ExternalMemoryCache::ImportD3D12TextureAsSurface(ID3D12Resource* resource,
+                                                       cudaExternalMemory_t* out_ext_mem,
+                                                       cudaMipmappedArray_t* out_mipmap,
+                                                       cudaSurfaceObject_t* out_surface)
+{
+    // Step 1: Create shared handle for D3D12 resource
+    HANDLE shared_handle = nullptr;
+    HRESULT hr = m_device->CreateSharedHandle(resource, nullptr, GENERIC_ALL, nullptr, &shared_handle);
+
+    if (FAILED(hr)) {
+        LOGF_ERROR("[ExternalMemoryCache] CreateSharedHandle failed: 0x%08X", hr);
+        return false;
+    }
+
+    // Step 2: Get resource allocation info
+    D3D12_RESOURCE_DESC desc = resource->GetDesc();
+    D3D12_RESOURCE_ALLOCATION_INFO alloc_info = m_device->GetResourceAllocationInfo(0, 1, &desc);
+
+    LOGF_DEBUG("[ExternalMemoryCache] Texture resource: %llux%u, format=%d",
+              desc.Width, desc.Height, desc.Format);
+
+    // Step 3: Import as external memory
+    cudaExternalMemoryHandleDesc mem_desc = {};
+    mem_desc.type = cudaExternalMemoryHandleTypeD3D12Resource;
+    mem_desc.handle.win32.handle = shared_handle;
+    mem_desc.handle.win32.name = nullptr;
+    mem_desc.size = alloc_info.SizeInBytes;
+    mem_desc.flags = cudaExternalMemoryDedicated;
+
+    cudaExternalMemory_t external_memory;
+    cudaError_t err = cudaImportExternalMemory(&external_memory, &mem_desc);
+
+    CloseHandle(shared_handle);
+
+    if (err != cudaSuccess) {
+        LOGF_ERROR("[ExternalMemoryCache] cudaImportExternalMemory failed: %s", cudaGetErrorString(err));
+        return false;
+    }
+
+    // Step 4: Get mipmapped array from external memory
+    cudaExternalMemoryMipmappedArrayDesc mipmap_desc = {};
+    mipmap_desc.extent = make_cudaExtent(desc.Width, desc.Height, 0);
+    mipmap_desc.formatDesc = cudaCreateChannelDesc<uchar4>();  // RGBA8 (R8G8B8A8_UNORM)
+    mipmap_desc.numLevels = 1;
+    mipmap_desc.flags = cudaArraySurfaceLoadStore;  // Enable surface writes
+
+    cudaMipmappedArray_t mipmapped_array;
+    err = cudaExternalMemoryGetMappedMipmappedArray(&mipmapped_array, external_memory, &mipmap_desc);
+
+    if (err != cudaSuccess) {
+        LOGF_ERROR("[ExternalMemoryCache] cudaExternalMemoryGetMappedMipmappedArray failed: %s", cudaGetErrorString(err));
+        cudaDestroyExternalMemory(external_memory);
+        return false;
+    }
+
+    // Step 5: Get level 0 array
+    cudaArray_t array;
+    err = cudaGetMipmappedArrayLevel(&array, mipmapped_array, 0);
+
+    if (err != cudaSuccess) {
+        LOGF_ERROR("[ExternalMemoryCache] cudaGetMipmappedArrayLevel failed: %s", cudaGetErrorString(err));
+        cudaFreeMipmappedArray(mipmapped_array);
+        cudaDestroyExternalMemory(external_memory);
+        return false;
+    }
+
+    // Step 6: Create surface object
+    cudaResourceDesc res_desc = {};
+    res_desc.resType = cudaResourceTypeArray;
+    res_desc.res.array.array = array;
+
+    cudaSurfaceObject_t surface;
+    err = cudaCreateSurfaceObject(&surface, &res_desc);
+
+    if (err != cudaSuccess) {
+        LOGF_ERROR("[ExternalMemoryCache] cudaCreateSurfaceObject failed: %s", cudaGetErrorString(err));
+        cudaFreeMipmappedArray(mipmapped_array);
+        cudaDestroyExternalMemory(external_memory);
+        return false;
+    }
+
+    LOGF_DEBUG("[ExternalMemoryCache] Successfully created surface object for D3D12 texture (surface=0x%llX)", (unsigned long long)surface);
+
+    *out_ext_mem = external_memory;
+    *out_mipmap = mipmapped_array;
+    *out_surface = surface;
+
+    return true;
+}
+
 } // namespace VavCore
--- a/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/ExternalMemoryCache.h
@@ -14,10 +14,14 @@ public:
    ExternalMemoryCache(ID3D12Device* device, CUcontext cuda_context);
    ~ExternalMemoryCache();

-    // Get or create CUDA device pointer for D3D12 resource
+    // Get or create CUDA device pointer for D3D12 buffer resource
    // Returns true on success, false on failure
    bool GetOrCreateExternalMemory(ID3D12Resource* resource, CUdeviceptr* out_ptr);

+    // Get or create CUDA surface object for D3D12 texture resource
+    // Returns true on success, false on failure
+    bool GetOrCreateSurfaceObject(ID3D12Resource* resource, cudaSurfaceObject_t* out_surface);
+
    // Release specific resource from cache
    void Release(ID3D12Resource* resource);

@@ -30,15 +34,25 @@ public:
 private:
    struct CachedEntry {
        cudaExternalMemory_t external_memory;
-        CUdeviceptr device_ptr;
+        CUdeviceptr device_ptr;              // For buffer resources
+        cudaMipmappedArray_t mipmapped_array; // For texture resources
+        cudaArray_t array;                   // Level 0 of mipmapped array
+        cudaSurfaceObject_t surface;         // Surface object for texture writes
        size_t size;
+        bool is_texture;                     // True if texture, false if buffer
    };

-    // Import D3D12 resource as CUDA external memory
+    // Import D3D12 buffer resource as CUDA external memory (linear buffer)
    bool ImportD3D12Resource(ID3D12Resource* resource,
                            cudaExternalMemory_t* out_ext_mem,
                            CUdeviceptr* out_ptr);

+    // Import D3D12 texture resource as CUDA surface object (tiled texture)
+    bool ImportD3D12TextureAsSurface(ID3D12Resource* resource,
+                                      cudaExternalMemory_t* out_ext_mem,
+                                      cudaMipmappedArray_t* out_mipmap,
+                                      cudaSurfaceObject_t* out_surface);
+
 private:
    ID3D12Device* m_device;
    CUcontext m_cudaContext;
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel.cu
@@ -0,0 +1,30 @@
+// CUDA kernel to write RGBA buffer to surface object
+// This handles the tiled texture layout automatically
+
+extern "C" __global__ void WriteSurfaceFromBuffer_Kernel(
+    const unsigned char* __restrict__ rgba_buffer,
+    cudaSurfaceObject_t rgba_surface,
+    unsigned int width,
+    unsigned int height,
+    unsigned int pitch
+)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height) {
+        return;
+    }
+
+    // Read from linear buffer
+    int idx = y * pitch + x * 4;
+    uchar4 rgba_pixel = make_uchar4(
+        rgba_buffer[idx + 0],  // R
+        rgba_buffer[idx + 1],  // G
+        rgba_buffer[idx + 2],  // B
+        rgba_buffer[idx + 3]   // A
+    );
+
+    // Write to tiled surface (handles layout automatically)
+    surf2Dwrite(rgba_pixel, rgba_surface, x * sizeof(uchar4), y);
+}
--- a/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
+++ b/vav2/platforms/windows/vavcore/src/Decoder/rgba_surface_write_kernel_ptx.h
@@ -0,0 +1,72 @@
+#pragma once
+
+// Embedded PTX for RGBA surface write kernel
+// Generated from rgba_surface_write_kernel.cu
+
+namespace VavCore {
+
+const char* RGBA_SURFACE_WRITE_KERNEL_PTX = R"PTX(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36424714
+// Cuda compilation tools, release 13.0, V13.0.88
+// Based on NVVM 7.0.1
+//
+
+.version 9.0
+.target sm_75
+.address_size 64
+
+	// .globl	WriteSurfaceFromBuffer_Kernel
+
+.visible .entry WriteSurfaceFromBuffer_Kernel(
+	.param .u64 WriteSurfaceFromBuffer_Kernel_param_0,
+	.param .u64 WriteSurfaceFromBuffer_Kernel_param_1,
+	.param .u32 WriteSurfaceFromBuffer_Kernel_param_2,
+	.param .u32 WriteSurfaceFromBuffer_Kernel_param_3,
+	.param .u32 WriteSurfaceFromBuffer_Kernel_param_4
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<14>;
+	.reg .b64 	%rd<6>;
+
+
+	ld.param.u64 	%rd1, [WriteSurfaceFromBuffer_Kernel_param_0];
+	ld.param.u64 	%rd2, [WriteSurfaceFromBuffer_Kernel_param_1];
+	ld.param.u32 	%r4, [WriteSurfaceFromBuffer_Kernel_param_2];
+	ld.param.u32 	%r5, [WriteSurfaceFromBuffer_Kernel_param_3];
+	ld.param.u32 	%r3, [WriteSurfaceFromBuffer_Kernel_param_4];
+	mov.u32 	%r6, %ntid.x;
+	mov.u32 	%r7, %ctaid.x;
+	mov.u32 	%r8, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r6, %r8;
+	mov.u32 	%r9, %ntid.y;
+	mov.u32 	%r10, %ctaid.y;
+	mov.u32 	%r11, %tid.y;
+	mad.lo.s32 	%r2, %r10, %r9, %r11;
+	setp.ge.u32 	%p1, %r1, %r4;
+	setp.ge.u32 	%p2, %r2, %r5;
+	or.pred  	%p3, %p1, %p2;
+	@%p3 bra 	$L__BB0_2;
+
+	shl.b32 	%r12, %r1, 2;
+	mad.lo.s32 	%r13, %r2, %r3, %r12;
+	cvt.s64.s32 	%rd3, %r13;
+	cvta.to.global.u64 	%rd4, %rd1;
+	add.s64 	%rd5, %rd4, %rd3;
+	ld.global.nc.u8 	%rs1, [%rd5];
+	ld.global.nc.u8 	%rs3, [%rd5+1];
+	ld.global.nc.u8 	%rs5, [%rd5+2];
+	ld.global.nc.u8 	%rs7, [%rd5+3];
+	sust.b.2d.v4.b8.trap 	[%rd2, {%r12, %r2}], {%rs1, %rs3, %rs5, %rs7};
+
+$L__BB0_2:
+	ret;
+
+}
+)PTX";
+
+} // namespace VavCore