diff --git a/src/direct/direct-export-buf.c b/src/direct/direct-export-buf.c index e006bf1..7618508 100644 --- a/src/direct/direct-export-buf.c +++ b/src/direct/direct-export-buf.c @@ -42,6 +42,46 @@ static void findGPUIndexFromFd(NVDriver *drv) { drv->cudaGpuId = 0; } +static bool import_to_cuda(NVDriver *drv, NVDriverImage *image, int bpc, int channels, NVCudaImage *cudaImage, CUarray *array) { + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = image->nvFd, + .flags = 0, + .size = image->memorySize + }; + + //LOG("importing memory size: %dx%d = %x", image->width, image->height, image->memorySize); + + CHECK_CUDA_RESULT_RETURN(drv->cu->cuImportExternalMemory(&cudaImage->extMem, &extMemDesc), false); + + //For some reason, this close *must* be *here*, otherwise we will get random visual glitches. + close(image->nvFd); + close(image->nvFd2); + image->nvFd = 0; + image->nvFd2 = 0; + + CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = { + .arrayDesc = { + .Width = image->width, + .Height = image->height, + .Depth = 0, + .Format = bpc == 8 ? 
CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16, + .NumChannels = channels, + .Flags = 0 + }, + .numLevels = 1, + .offset = 0 + }; + //create a mipmap array from the imported memory + CHECK_CUDA_RESULT_RETURN(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&cudaImage->mipmapArray, cudaImage->extMem, &mipmapArrayDesc), false); + + //create an array from the mipmap array + CHECK_CUDA_RESULT_RETURN(drv->cu->cuMipmappedArrayGetLevel(array, cudaImage->mipmapArray, 0), false); + + return true; +} + + static void debug(EGLenum error,const char *command,EGLint messageType,EGLLabelKHR threadLabel,EGLLabelKHR objectLabel,const char* message) { LOG("[EGL] %s: %s", command, message); } @@ -146,92 +186,49 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, NVSurface *surfa } const NVFormatInfo *fmtInfo = &formatsInfo[backingImage->format]; + const NVFormatPlane *p = fmtInfo->plane; - backingImage->totalSize = calculate_image_size(&drv->driverContext, driverImages, surface->width, surface->height, fmtInfo->bppc, fmtInfo->numPlanes, fmtInfo->plane); - LOG("Allocating BackingImage: %p %ux%u = %u bytes", backingImage, surface->width, surface->height, backingImage->totalSize); - - //alloc memory - Note this requires that all the planes have the same widthInBytes - //otherwise the value passed to the kernel driver won't be correct, luckily all the formats - //we currently support are all the same width - int memFd = 0, memFd2 = 0, drmFd = 0; - if (!alloc_buffer(&drv->driverContext, backingImage->totalSize, driverImages, &memFd, &memFd2, &drmFd)) { - goto import_fail; - } - LOG("Allocate Buffer: %d %d %d", memFd, memFd2, drmFd); - - //import the memory to CUDA - const CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = { - .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, - .handle.fd = memFd, - .flags = 0, - .size = backingImage->totalSize - }; - - LOG("Importing memory to CUDA") - if (CHECK_CUDA_RESULT(drv->cu->cuImportExternalMemory(&backingImage->extMem, 
&extMemDesc))) { - goto import_fail; + LOG("Allocating BackingImages: %p %dx%d", backingImage, surface->width, surface->height); + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + alloc_image(&drv->driverContext, surface->width >> p[i].ss.x, surface->height >> p[i].ss.y, + p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]); } - close(memFd2); - memFd2 = -1; - // memFd file descriptor is closed by CUDA after importing - memFd = -1; - - - //now map the arrays for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { - CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapArrayDesc = { - .arrayDesc = { - .Width = driverImages[i].width, - .Height = driverImages[i].height, - .Depth = 0, - .Format = fmtInfo->bppc == 1 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16, - .NumChannels = fmtInfo->plane[i].channelCount, - .Flags = 0 - }, - .numLevels = 1, - .offset = driverImages[i].offset - }; - - //create a mimap array from the imported memory - if (CHECK_CUDA_RESULT(drv->cu->cuExternalMemoryGetMappedMipmappedArray(&backingImage->cudaImages[i].mipmapArray, backingImage->extMem, &mipmapArrayDesc))) { + if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i])) goto bail; - } - - //create an array from the mipmap array - if (CHECK_CUDA_RESULT(drv->cu->cuMipmappedArrayGetLevel(&backingImage->arrays[i], backingImage->cudaImages[i].mipmapArray, 0))) { - goto bail; - } } backingImage->width = surface->width; backingImage->height = surface->height; - backingImage->fds[0] = drmFd; for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + backingImage->fds[i] = driverImages[i].drmFd; backingImage->strides[i] = driverImages[i].pitch; backingImage->mods[i] = driverImages[i].mods; - backingImage->offsets[i] = driverImages[i].offset; + backingImage->size[i] = driverImages[i].memorySize; } return backingImage; bail: - destroyBackingImage(drv, backingImage); //another 'free' might occur on this 
pointer. //hence, set it to NULL to ensure no operation is performed if this really happens. backingImage = NULL; - -import_fail: - if (memFd >= 0) { - close(memFd); - } - if (memFd2 >= 0) { - close(memFd2); + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + if (driverImages[i].nvFd != 0) { + close(driverImages[i].nvFd); + } + if (driverImages[i].nvFd2 != 0) { + close(driverImages[i].nvFd2); + } + if (driverImages[i].drmFd != 0) { + close(driverImages[i].drmFd); + } } - if (drmFd >= 0) { - close(drmFd); + + if (backingImage != NULL) { + destroyBackingImage(drv, backingImage); } - free(backingImage); return NULL; } @@ -365,19 +362,16 @@ static bool direct_fillExportDescriptor(NVDriver *drv, NVSurface *surface, VADRM desc->height = surface->height; desc->num_layers = fmtInfo->numPlanes; - desc->num_objects = 1; - //desc->num_objects = 2; - desc->objects[0].fd = dup(img->fds[0]); - desc->objects[0].size = img->totalSize; - desc->objects[0].drm_format_modifier = img->mods[0]; - //desc->objects[1].fd = dup(img->fds[0]); - //desc->objects[1].size = img->totalSize; - //desc->objects[1].drm_format_modifier = img->mods[1]; + desc->num_objects = fmtInfo->numPlanes; for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + desc->objects[i].fd = dup(img->fds[i]); + desc->objects[i].size = img->size[i]; + desc->objects[i].drm_format_modifier = img->mods[i]; + desc->layers[i].drm_format = fmtInfo->plane[i].fourcc; desc->layers[i].num_planes = 1; - desc->layers[i].object_index[0] = 0; + desc->layers[i].object_index[0] = i; desc->layers[i].offset[0] = img->offsets[i]; desc->layers[i].pitch[0] = img->strides[i]; } diff --git a/src/direct/nv-driver.c b/src/direct/nv-driver.c index d137998..f7f3aac 100644 --- a/src/direct/nv-driver.c +++ b/src/direct/nv-driver.c @@ -477,126 +477,115 @@ bool alloc_memory(const NVDriverContext *context, const uint32_t size, int *fd) return false; } -uint32_t calculate_image_size(const NVDriverContext *context, NVDriverImage images[], const uint32_t 
width, const uint32_t height, - const uint32_t bppc, const uint32_t numPlanes, const NVFormatPlane planes[]) { - //first figure out the gob layout - const uint32_t log2GobsPerBlockX = 0; - const uint32_t log2GobsPerBlockZ = 0; - - uint32_t offset = 0; - for (uint32_t i = 0; i < numPlanes; i++) { - //calculate each planes dimensions and bpp - const uint32_t planeWidth = width >> planes[i].ss.x; - const uint32_t planeHeight = height >> planes[i].ss.y; - const uint32_t bytesPerPixel = planes[i].channelCount * bppc; - - //Depending on the height of the allocated image, the modifiers - //needed for the exported image to work correctly change. However, this can cause problems if the Y surface - //needs one modifier, and UV need another when attempting to use a single surface export (as only one modifier - //is possible). So for now we're just going to limit the minimum height to 88 pixels so we can use a single - //modifier. - //Update: with the single buffer export this no longer works, as we're only allowed one mod per fd when exporting - //so different memory layouts for different planes can't work. Luckily this only seems to effect videos <= 128 pixels high. - uint32_t log2GobsPerBlockY = 4; - //uint32_t log2GobsPerBlockY = (planeHeight < 88) ? 
3 : 4; - //LOG("Calculated log2GobsPerBlockY: %dx%d == %d", planeWidth, planeHeight, log2GobsPerBlockY); - - //These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure - const uint32_t widthInBytes = ROUND_UP(planeWidth * bytesPerPixel, GOB_WIDTH_IN_BYTES << log2GobsPerBlockX); - const uint32_t alignedHeight = ROUND_UP(planeHeight, GOB_HEIGHT_IN_BYTES << log2GobsPerBlockY); - - images[i].width = planeWidth; - images[i].height = alignedHeight; - images[i].offset = offset; - images[i].memorySize = widthInBytes * alignedHeight; - images[i].pitch = widthInBytes; - images[i].mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY); - images[i].fourcc = planes[i].fourcc; - images[i].log2GobsPerBlockX = log2GobsPerBlockX; - images[i].log2GobsPerBlockY = log2GobsPerBlockY; - images[i].log2GobsPerBlockZ = log2GobsPerBlockZ; - - offset += images[i].memorySize; - offset = ROUND_UP(offset, 64); - } - - return offset; -} - -bool alloc_buffer(NVDriverContext *context, const uint32_t size, const NVDriverImage images[], int *fd1, int *fd2, int *drmFd) { - int memFd = -1; - const bool ret = alloc_memory(context, size, &memFd); - if (!ret) { - LOG("alloc_memory failed") - return false; - } - - //now export the dma-buf - //duplicate the fd so we don't invalidate it by importing it - const int memFd2 = dup(memFd); - if (memFd2 == -1) { - LOG("dup failed") - goto err; - } - - struct NvKmsKapiPrivImportMemoryParams nvkmsParams = { - .memFd = memFd2, - .surfaceParams = { - .layout = NvKmsSurfaceMemoryLayoutBlockLinear, - .blockLinear = { - .genericMemory = 0, - .pitchInBlocks = images[0].pitch / GOB_WIDTH_IN_BYTES, - .log2GobsPerBlock.x = images[0].log2GobsPerBlockX, - .log2GobsPerBlock.y = images[0].log2GobsPerBlockY, - .log2GobsPerBlock.z = images[0].log2GobsPerBlockZ, - } - } - }; - - struct drm_nvidia_gem_import_nvkms_memory_params params = { - .mem_size = 
size, - .nvkms_params_ptr = (uint64_t) &nvkmsParams, - .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver - }; - int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params); - if (drmret != 0) { - LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno) - goto err; - } + bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bitsPerChannel, uint32_t fourcc, NVDriverImage *image) { + uint32_t gobWidthInBytes = 64; + uint32_t gobHeightInBytes = 8; + + uint32_t bytesPerChannel = bitsPerChannel/8; + uint32_t bytesPerPixel = channels * bytesPerChannel; + + //first figure out the gob layout + uint32_t log2GobsPerBlockX = 0; //TODO not sure if these are the correct numbers to start with, but they're the largest ones i've seen used + uint32_t log2GobsPerBlockY = height < 88 ? 3 : 4; //TODO 88 is a guess, 80px high needs 3, 112px needs 4, 96px needs 4, 88px needs 4 + uint32_t log2GobsPerBlockZ = 0; + + //LOG("Calculated GOB size: %dx%d (%dx%d)", gobWidthInBytes << log2GobsPerBlockX, gobHeightInBytes << log2GobsPerBlockY, log2GobsPerBlockX, log2GobsPerBlockY); + + //These two seem to be correct, but it was discovered by trial and error so I'm not 100% sure + uint32_t widthInBytes = ROUND_UP(width * bytesPerPixel, gobWidthInBytes << log2GobsPerBlockX); + uint32_t alignedHeight = ROUND_UP(height, gobHeightInBytes << log2GobsPerBlockY); + + uint32_t imageSizeInBytes = widthInBytes * alignedHeight; + uint32_t size = imageSizeInBytes; + + //this gets us some memory, and the fd to import into cuda + int memFd = -1; + bool ret = alloc_memory(context, size, &memFd); + if (!ret) { + LOG("alloc_memory failed"); + return false; + } + + //now export the dma-buf + uint32_t pitchInBlocks = widthInBytes / (gobWidthInBytes << log2GobsPerBlockX); + + //printf("got gobsPerBlock: %ux%u %u %u %u %d\n", width, height, 
log2GobsPerBlockX, log2GobsPerBlockY, log2GobsPerBlockZ, pitchInBlocks); + //duplicate the fd so we don't invalidate it by importing it + int memFd2 = dup(memFd); + if (memFd2 == -1) { + LOG("dup failed"); + goto err; + } + + struct NvKmsKapiPrivImportMemoryParams nvkmsParams = { + .memFd = memFd2, + .surfaceParams = { + .layout = NvKmsSurfaceMemoryLayoutBlockLinear, + .blockLinear = { + .genericMemory = 0, + .pitchInBlocks = pitchInBlocks, + .log2GobsPerBlock.x = log2GobsPerBlockX, + .log2GobsPerBlock.y = log2GobsPerBlockY, + .log2GobsPerBlock.z = log2GobsPerBlockZ, + } + } + }; + + struct drm_nvidia_gem_import_nvkms_memory_params params = { + .mem_size = imageSizeInBytes, + .nvkms_params_ptr = (uint64_t) &nvkmsParams, + .nvkms_params_size = context->driverMajorVersion == 470 ? 0x20 : sizeof(nvkmsParams) //needs to be 0x20 in the 470 series driver + }; + int drmret = ioctl(context->drmFd, DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY, &params); + if (drmret != 0) { + LOG("DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY failed: %d %d", drmret, errno); + goto err; + } + + //export dma-buf + struct drm_prime_handle prime_handle = { + .handle = params.handle + }; + drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle); + if (drmret != 0) { + LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno); + goto err; + } + + struct drm_gem_close gem_close = { + .handle = params.handle + }; + drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close); + if (drmret != 0) { + LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno); + goto prime_err; + } + + image->width = width; + image->height = height; + image->nvFd = memFd; + image->nvFd2 = memFd2; //not sure why we can't close this one, we shouldn't need it after importing the image + image->drmFd = prime_handle.fd; + image->mods = DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, context->sector_layout, context->page_kind_generation, context->generic_page_kind, log2GobsPerBlockY); + image->offset = 0; + 
image->pitch = widthInBytes; + image->memorySize = imageSizeInBytes; + image->fourcc = fourcc; + + //LOG("created image: %dx%d %lx %d %x", width, height, image->mods, widthInBytes, imageSizeInBytes); + + return true; + + prime_err: + if (prime_handle.fd > 0) { + close(prime_handle.fd); + } - //export dma-buf - struct drm_prime_handle prime_handle = { - .handle = params.handle - }; - drmret = ioctl(context->drmFd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &prime_handle); - if (drmret != 0) { - LOG("DRM_IOCTL_PRIME_HANDLE_TO_FD failed: %d %d", drmret, errno) - goto err; - } - - struct drm_gem_close gem_close = { - .handle = params.handle - }; - drmret = ioctl(context->drmFd, DRM_IOCTL_GEM_CLOSE, &gem_close); - if (drmret != 0) { - LOG("DRM_IOCTL_GEM_CLOSE failed: %d %d", drmret, errno) - goto prime_err; - } - - *fd1 = memFd; - *fd2 = memFd2; - *drmFd = prime_handle.fd; - return true; + err: + if (memFd > 0) { + close(memFd); + } -prime_err: - if (prime_handle.fd > 0) { - close(prime_handle.fd); - } + return false; + } -err: - if (memFd > 0) { - close(memFd); - } - - return false; -} diff --git a/src/direct/nv-driver.h b/src/direct/nv-driver.h index 1f15566..b87efcd 100644 --- a/src/direct/nv-driver.h +++ b/src/direct/nv-driver.h @@ -26,6 +26,9 @@ typedef struct { } NVDriverContext; typedef struct { + int nvFd; + int nvFd2; + int drmFd; uint32_t width; uint32_t height; uint64_t mods; @@ -33,15 +36,12 @@ typedef struct { uint32_t offset; uint32_t pitch; uint32_t fourcc; - uint32_t log2GobsPerBlockX; - uint32_t log2GobsPerBlockY; - uint32_t log2GobsPerBlockZ; } NVDriverImage; bool init_nvdriver(NVDriverContext *context, int drmFd); bool free_nvdriver(NVDriverContext *context); bool get_device_uuid(const NVDriverContext *context, uint8_t uuid[16]); bool alloc_memory(const NVDriverContext *context, uint32_t size, int *fd); -bool alloc_buffer(NVDriverContext *context, uint32_t size, const NVDriverImage images[], int *fd1, int *fd2, int *drmFd); -uint32_t calculate_image_size(const 
NVDriverContext *context, NVDriverImage images[], uint32_t width, uint32_t height, uint32_t bppc, uint32_t numPlanes, const NVFormatPlane planes[]); +bool alloc_image(NVDriverContext *context, uint32_t width, uint32_t height, uint8_t channels, uint8_t bytesPerChannel, uint32_t fourcc, NVDriverImage *image); + #endif