From 33cc48f5912db78542485105c91f7fe8209e95c3 Mon Sep 17 00:00:00 2001
From: Stephen <elfarto@elfarto.com>
Date: Sat, 7 Dec 2024 10:27:40 +0000
Subject: [PATCH] reverted single plane export

---
 src/direct/direct-export-buf.c | 140 ++++++++++----------
 src/direct/nv-driver.c         | 229 ++++++++++++++++-----------------
 src/direct/nv-driver.h         |  10 +-
 3 files changed, 181 insertions(+), 198 deletions(-)

diff --git a/src/direct/direct-export-buf.c b/src/direct/direct-export-buf.c
index e006bf1..7618508 100644
--- a/src/direct/direct-export-buf.c
+++ b/src/direct/direct-export-buf.c
@@ -42,6 +42,46 @@ static void findGPUIndexFromFd(NVDriver *drv) {
     drv->cudaGpuId = 0;
 }
 
+//Imports the memory behind image->nvFd into CUDA and hands back mip level 0 of it in *array.
+//bpc is bits per channel: 8 selects UNSIGNED_INT8, any other value UNSIGNED_INT16.
+static bool import_to_cuda(NVDriver *drv, NVDriverImage *image, int bpc, int channels, NVCudaImage *cudaImage, CUarray *array) {
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {
+        .type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+        .handle.fd = image->nvFd,
+        .flags     = 0,
+        .size      = image->memorySize
+    };
+
+    CHECK_CUDA_RESULT_RETURN(drv->cu->cuImportExternalMemory(&cudaImage->extMem, &extMemDesc), false);
+
+    //A successful import hands nvFd over to the CUDA driver, so only nvFd2 is still ours to close.
+    //For some reason, this close *must* be *here*, otherwise we will get random visual glitches.
+    close(image->nvFd2);
+    image->nvFd = 0; //0 tells the caller's cleanup that there is nothing left to close
+    image->nvFd2 = 0;
+
+    CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = {
+        .arrayDesc = {
+            .Width = image->width,
+            .Height = image->height,
+            .Depth = 0,
+            .Format = bpc == 8 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16,
+            .NumChannels = channels,
+            .Flags = 0
+        },
+        .numLevels = 1,
+        .offset = 0
+    };
+    //create a mipmap array from the imported memory
+    CHECK_CUDA_RESULT_RETURN(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&cudaImage->mipmapArray, cudaImage->extMem, &mipmapArrayDesc), false);
+
+    //create an array from the mipmap array
+    CHECK_CUDA_RESULT_RETURN(drv->cu->cuMipmappedArrayGetLevel(array, cudaImage->mipmapArray, 0), false);
+
+    return true;
+}
+
+
 static void debug(EGLenum error,const char *command,EGLint messageType,EGLLabelKHR threadLabel,EGLLabelKHR objectLabel,const char* message) {
     LOG("[EGL] %s: %s", command, message);
 }
@@ -146,92 +186,49 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, NVSurface *surfa
     }
 
     const NVFormatInfo *fmtInfo = &formatsInfo[backingImage->format];
+    const NVFormatPlane *p = fmtInfo->plane;
 
-    backingImage->totalSize = calculate_image_size(&drv->driverContext, driverImages, surface->width, surface->height, fmtInfo->bppc, fmtInfo->numPlanes, fmtInfo->plane);
-    LOG("Allocating BackingImage: %p %ux%u = %u bytes", backingImage, surface->width, surface->height, backingImage->totalSize);
-
-    //alloc memory - Note this requires that all the planes have the same widthInBytes
-    //otherwise the value passed to the kernel driver won't be correct, luckily all the formats
-    //we currently support are all the same width
-    int memFd = 0, memFd2 = 0, drmFd = 0;
-    if (!alloc_buffer(&drv->driverContext, backingImage->totalSize, driverImages, &memFd, &memFd2, &drmFd)) {
-        goto import_fail;
-    }
-    LOG("Allocate Buffer: %d %d %d", memFd, memFd2, drmFd);
-
-    //import the memory to CUDA
-    const CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {
-        .type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
-        .handle.fd = memFd,
-        .flags     = 0,
-        .size      = backingImage->totalSize
-    };
-
-    LOG("Importing memory to CUDA")
-    if (CHECK_CUDA_RESULT(drv->cu->cuImportExternalMemory(&backingImage->extMem, &extMemDesc))) {
-        goto import_fail;
+    LOG("Allocating BackingImages: %p %dx%d", backingImage, surface->width, surface->height);
+    for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
+        alloc_image(&drv->driverContext, surface->width >> p[i].ss.x, surface->height >> p[i].ss.y,
+                    p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]);
     }
 
-    close(memFd2);
-    memFd2 = -1;
-    // memFd file descriptor is closed by CUDA after importing
-    memFd = -1;
-
-
-    //now map the arrays
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
-        CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = {
-            .arrayDesc = {
-                .Width = driverImages[i].width,
-                .Height = driverImages[i].height,
-                .Depth = 0,
-                .Format = fmtInfo->bppc == 1 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16,
-                .NumChannels = fmtInfo->plane[i].channelCount,
-                .Flags = 0
-            },
-            .numLevels = 1,
-            .offset = driverImages[i].offset
-        };
-
-        //create a mimap array from the imported memory
-        if (CHECK_CUDA_RESULT(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&backingImage->cudaImages[i].mipmapArray, backingImage->extMem, &mipmapArrayDesc))) {
+        if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i]))
             goto bail;
-        }
-
-        //create an array from the mipmap array
-        if (CHECK_CUDA_RESULT(drv->cu->cuMipmappedArrayGetLevel(&backingImage->arrays[i], backingImage->cudaImages[i].mipmapArray, 0))) {
-            goto bail;
-        }
     }
 
     backingImage->width = surface->width;
     backingImage->height = surface->height;
-    backingImage->fds[0] = drmFd;
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
+        backingImage->fds[i] = driverImages[i].drmFd;
         backingImage->strides[i] = driverImages[i].pitch;
         backingImage->mods[i] = driverImages[i].mods;
-        backingImage->offsets[i] = driverImages[i].offset;
+        backingImage->size[i] = driverImages[i].memorySize;
     }
 
     return backingImage;
 
 bail:
-    destroyBackingImage(drv, backingImage);
     //another 'free' might occur on this pointer.
     //hence, set it to NULL to ensure no operation is performed if this really happens.
     backingImage = NULL;
-
-import_fail:
-    if (memFd >= 0) {
-        close(memFd);
-    }
-    if (memFd2 >= 0) {
-        close(memFd2);
+    for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
+        if (driverImages[i].nvFd != 0) {
+            close(driverImages[i].nvFd);
+        }
+        if (driverImages[i].nvFd2 != 0) {
+            close(driverImages[i].nvFd2);
+        }
+        if (driverImages[i].drmFd != 0) {
+            close(driverImages[i].drmFd);
+        }
     }
-    if (drmFd >= 0) {
-        close(drmFd);
+
+    if (backingImage !=  NULL) {
+        destroyBackingImage(drv, backingImage);
     }
-    free(backingImage);
     return NULL;
 }
 
@@ -365,19 +362,16 @@ static bool direct_fillExportDescriptor(NVDriver *drv, NVSurface *surface, VADRM
     desc->height = surface->height;
 
     desc->num_layers = fmtInfo->numPlanes;
-    desc->num_objects = 1;
-    //desc->num_objects = 2;
-    desc->objects[0].fd = dup(img->fds[0]);
-    desc->objects[0].size = img->totalSize;
-    desc->objects[0].drm_format_modifier = img->mods[0];
-    //desc->objects[1].fd = dup(img->fds[0]);
-    //desc->objects[1].size = img->totalSize;
-    //desc->objects[1].drm_format_modifier = img->mods[1];
+    desc->num_objects = fmtInfo->numPlanes;
 
     for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) {
+        desc->objects[i].fd = dup(img->fds[i]);
+        desc->objects[i].size = img->size[i];
+        desc->objects[i].drm_format_modifier = img->mods[i];
+
         desc->layers[i].drm_format = fmtInfo->plane[i].fourcc;
         desc->layers[i].num_planes = 1;
-        desc->layers[i].object_index[0] = 0;
+        desc->layers[i].object_index[0] = i;
         desc->layers[i].offset[0] = img->offsets[i];
         desc->layers[i].pitch[0] = img->strides[i];
     }
diff --git a/src/direct/nv-driver.c b/src/direct/nv-driver.c
index d137998..f7f3aac 100644
--- a/src/direct/nv-driver.c
+++ b/src/direct/nv-driver.c
@@ -477,126 +477,115 @@ bool alloc_memory(const NVDriverContext *context, const uint32_t size, int *fd)
     return false;
 }
 
-uint32_t calculate_image_size(const NVDriverContext *context, NVDriverImage images[], const uint32_t width, const uint32_t height,
-                              const uint32_t bppc, const uint32_t numPlanes, const NVFormatPlane planes[]) {
-    //first figure out the gob layout
-    const uint32_t log2GobsPerBlockX = 0;
-    const uint32_t log2GobsPerBlockZ = 0;
-
-    uint32_t offset = 0;
-    for (uint32_t i = 0; i < numPlanes; i++) {
-        //calculate each planes dimensions and bpp
-        const uint32_t planeWidth = width >> planes[i].ss.x;
-        const uint32_t planeHeight = height >> planes[i].ss.y;
-        const uint32_t bytesPerPixel = planes[i].channelCount * bppc;
-
-        //Depending on the height of the allocated image, the modifiers
-        //needed for the exported image to work correctly change. However, this can cause problems if the Y surface
-        //needs one modifier, and UV need another when attempting to use a single surface export (as only one modifier
-        //is possible). So for now we're just going to limit the minimum height to 88 pixels so we can use a single
-        //modifier.
-        //Update: with the single buffer export this no longer works, as we're only allowed one mod per fd when exporting
-        //so different memory layouts for different planes can't work. Luckily this only seems to effect videos <= 128 pixels high.
-        uint32_t log2GobsPerBlockY = 4;
-        //uint32_t log2GobsPerBlockY = (planeHeight < 88) ? 3 : 4;
-        //LOG("Calculated log2GobsPerBlockY: %dx%d == %d", planeWidth, planeHeight, log2GobsPerBlockY);
-
-        //These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure
-        const uint32_t widthInBytes = ROUND_UP(planeWidth * bytesPerPixel, GOB_WIDTH_IN_BYTES << log2GobsPerBlockX);
-        const uint32_t alignedHeight = ROUND_UP(planeHeight, GOB_HEIGHT_IN_BYTES << log2GobsPerBlockY);
-
-        images[i].width = planeWidth;
-        images[i].height = alignedHeight;
-        images[i].offset = offset;
-        images[i].memorySize = widthInBytes * alignedHeight;
-        images[i].pitch = widthInBytes;
-        images[i].mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY);
-        images[i].fourcc = planes[i].fourcc;
-        images[i].log2GobsPerBlockX = log2GobsPerBlockX;
-        images[i].log2GobsPerBlockY = log2GobsPerBlockY;
-        images[i].log2GobsPerBlockZ = log2GobsPerBlockZ;
-
-        offset += images[i].memorySize;
-        offset = ROUND_UP(offset, 64);
-    }
-
-    return offset;
-}
-
-bool alloc_buffer(NVDriverContext *context, const uint32_t size, const NVDriverImage images[], int *fd1, int *fd2, int *drmFd) {
-    int memFd = -1;
-    const bool ret = alloc_memory(context, size, &memFd);
-    if (!ret) {
-        LOG("alloc_memory failed")
-        return false;
-    }
-
-    //now export the dma-buf
-    //duplicate the fd so we don't invalidate it by importing it
-    const int memFd2 = dup(memFd);
-    if (memFd2 == -1) {
-        LOG("dup failed")
-        goto err;
-    }
-
-    struct NvKmsKapiPrivImportMemoryParams nvkmsParams = {
-        .memFd = memFd2,
-        .surfaceParams = {
-            .layout = NvKmsSurfaceMemoryLayoutBlockLinear,
-            .blockLinear = {
-                .genericMemory = 0,
-                .pitchInBlocks = images[0].pitch / GOB_WIDTH_IN_BYTES,
-                .log2GobsPerBlock.x = images[0].log2GobsPerBlockX,
-                .log2GobsPerBlock.y = images[0].log2GobsPerBlockY,
-                .log2GobsPerBlock.z = images[0].log2GobsPerBlockZ,
-            }
-        }
-    };
-
-    struct drm_nvidia_gem_import_nvkms_memory_params params = {
-        .mem_size = size,
-        .nvkms_params_ptr = (uint64_t) &nvkmsParams,
-        .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
-    };
-    int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params);
-    if (drmret != 0) {
-        LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno)
-        goto err;
-    }
+//Allocates one block-linear image plane and describes it in *image: nvFd (the fd to import
+//into CUDA), nvFd2 (its duplicate), drmFd (the exported dma-buf), modifier, pitch and size.
+//Returns true on success; on failure returns false with every fd opened here closed again.
+bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) {
+    uint32_t gobWidthInBytes = 64;
+    uint32_t gobHeightInBytes = 8;
+
+    uint32_t bytesPerChannel = bitsPerChannel/8;
+    uint32_t bytesPerPixel = channels * bytesPerChannel;
+
+    //first figure out the gob layout
+    uint32_t log2GobsPerBlockX = 0; //TODO not sure if these are the correct numbers to start with, but they're the largest ones i've seen used
+    uint32_t log2GobsPerBlockY = height < 88 ? 3 : 4; //TODO 88 is a guess, 80px high needs 3, 112px needs 4, 96px needs 4, 88px needs 4
+    uint32_t log2GobsPerBlockZ = 0;
+
+    //These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure
+    uint32_t widthInBytes = ROUND_UP(width * bytesPerPixel, gobWidthInBytes << log2GobsPerBlockX);
+    uint32_t alignedHeight = ROUND_UP(height, gobHeightInBytes << log2GobsPerBlockY);
+
+    uint32_t imageSizeInBytes = widthInBytes * alignedHeight;
+
+    //this gets us some memory, and the fd to import into cuda
+    int memFd = -1;
+    if (!alloc_memory(context, imageSizeInBytes, &memFd)) {
+        LOG("alloc_memory failed");
+        return false;
+    }
+
+    //now export the dma-buf
+    uint32_t pitchInBlocks = widthInBytes / (gobWidthInBytes << log2GobsPerBlockX);
+
+    //duplicate the fd so we don't invalidate it by importing it
+    int memFd2 = dup(memFd);
+    if (memFd2 == -1) {
+        LOG("dup failed");
+        goto err;
+    }
+
+    struct NvKmsKapiPrivImportMemoryParams nvkmsParams = {
+        .memFd = memFd2,
+        .surfaceParams = {
+            .layout = NvKmsSurfaceMemoryLayoutBlockLinear,
+            .blockLinear = {
+                .genericMemory = 0,
+                .pitchInBlocks = pitchInBlocks,
+                .log2GobsPerBlock.x = log2GobsPerBlockX,
+                .log2GobsPerBlock.y = log2GobsPerBlockY,
+                .log2GobsPerBlock.z = log2GobsPerBlockZ,
+            }
+        }
+    };
+
+    struct drm_nvidia_gem_import_nvkms_memory_params params = {
+        .mem_size = imageSizeInBytes,
+        .nvkms_params_ptr = (uint64_t) &nvkmsParams,
+        .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver
+    };
+    int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno);
+        goto err;
+    }
+
+    //export dma-buf
+    struct drm_prime_handle prime_handle = {
+        .handle = params.handle,
+        .fd = -1
+    };
+    struct drm_gem_close gem_close = { .handle = params.handle };
+    drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno);
+        //best effort: release the imported GEM handle, nothing else closes it on this path
+        ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+        goto err;
+    }
+
+    drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+    if (drmret != 0) {
+        LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno);
+        goto prime_err;
+    }
+
+    image->width = width;
+    image->height = height;
+    image->nvFd = memFd;
+    image->nvFd2 = memFd2; //not sure why we can't close this one, we shouldn't need it after importing the image
+    image->drmFd = prime_handle.fd;
+    image->mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY);
+    image->offset = 0;
+    image->pitch = widthInBytes;
+    image->memorySize = imageSizeInBytes;
+    image->fourcc = fourcc;
+
+    return true;
+
+prime_err:
+    if (prime_handle.fd >= 0) {
+        close(prime_handle.fd);
+    }
 
-    //export dma-buf
-    struct drm_prime_handle prime_handle = {
-        .handle = params.handle
-    };
-    drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle);
-    if (drmret != 0) {
-        LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno)
-        goto err;
-    }
-
-    struct drm_gem_close gem_close = {
-        .handle = params.handle
-    };
-    drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close);
-    if (drmret != 0) {
-        LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno)
-        goto prime_err;
-    }
-
-    *fd1 = memFd;
-    *fd2 = memFd2;
-    *drmFd = prime_handle.fd;
-    return true;
+err:
+    if (memFd2 >= 0) {
+        close(memFd2);
+    }
+    if (memFd >= 0) {
+        close(memFd);
+    }
 
-prime_err:
-    if (prime_handle.fd > 0) {
-        close(prime_handle.fd);
-    }
+    return false;
+}
 
-err:
-    if (memFd > 0) {
-        close(memFd);
-    }
-
-    return false;
-}
diff --git a/src/direct/nv-driver.h b/src/direct/nv-driver.h
index 1f15566..b87efcd 100644
--- a/src/direct/nv-driver.h
+++ b/src/direct/nv-driver.h
@@ -26,6 +26,9 @@ typedef struct {
 } NVDriverContext;
 
 typedef struct {
+    int nvFd;
+    int nvFd2;
+    int drmFd;
     uint32_t width;
     uint32_t height;
     uint64_t mods;
@@ -33,15 +36,12 @@ typedef struct {
     uint32_t offset;
     uint32_t pitch;
     uint32_t fourcc;
-    uint32_t log2GobsPerBlockX;
-    uint32_t log2GobsPerBlockY;
-    uint32_t log2GobsPerBlockZ;
 } NVDriverImage;
 
 bool init_nvdriver(NVDriverContext *context, int drmFd);
 bool free_nvdriver(NVDriverContext *context);
 bool get_device_uuid(const NVDriverContext *context, uint8_t uuid[16]);
 bool alloc_memory(const NVDriverContext *context, uint32_t size, int *fd);
-bool alloc_buffer(NVDriverContext *context, uint32_t size, const NVDriverImage images[], int *fd1, int *fd2, int *drmFd);
-uint32_t calculate_image_size(const NVDriverContext *context, NVDriverImage images[], uint32_t width, uint32_t height, uint32_t bppc, uint32_t numPlanes, const NVFormatPlane planes[]);
+//bitsPerChannel is a bit count, not a byte count: the caller passes 8 * fmtInfo->bppc.
+bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image);
 #endif