/* GStreamer
 * Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "gstnvcompvideodec.h"

#ifdef HAVE_GST_GL
#include <gst/gl/gl.h>
#include <gst/gl/gstglfuncs.h>
#endif

#include <gst/cuda/gstcuda.h>
#include <nvcomp.hpp>
#include <nvcomp/ans.h>
#include <nvcomp/bitcomp.h>
#include <nvcomp/cascaded.h>
#include <nvcomp/deflate.h>
#include <nvcomp/gdeflate.h>
#include <nvcomp/lz4.h>
#include <nvcomp/snappy.h>
#include <nvcomp/zstd.h>
#include <memory>
#include <string>
#include <vector>

GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_dec_debug);
#define GST_CAT_DEFAULT gst_nv_comp_video_dec_debug

#ifdef HAVE_GST_GL
#define SRC_CAPS \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#else
#define SRC_CAPS \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#endif

static GstStaticPadTemplate sink_template = GST_STATIC_PAD_TEMPLATE ("sink",
    GST_PAD_SINK, GST_PAD_ALWAYS,
    GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; "
        "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; "
        "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; "
        "video/x-nvcomp-bitcomp; video/x-nvcomp-ans"));

static GstStaticPadTemplate src_template = GST_STATIC_PAD_TEMPLATE ("src",
    GST_PAD_SRC, GST_PAD_ALWAYS, GST_STATIC_CAPS (SRC_CAPS));

/* *INDENT-OFF* */
using namespace nvcomp;

struct DecoderTask
{
  ~DecoderTask ()
  {
    if (ctx) {
      gst_cuda_context_push (ctx);
      clear_resource ();
      gst_cuda_context_pop (nullptr);
      gst_object_unref (ctx);
    }
  }

  void clear_resource ()
  {
    if (!ctx)
      return;

    if (device_compressed)
      CuMemFree ((CUdeviceptr) device_compressed);
    device_compressed = nullptr;

    if (host_compressed)
      CuMemFreeHost (host_compressed);
    host_compressed = nullptr;

    if (device_compressed_bytes)
      CuMemFree ((CUdeviceptr) device_compressed_bytes);
    device_compressed_bytes = nullptr;

    if (device_compressed_ptrs)
      CuMemFree ((CUdeviceptr) device_compressed_ptrs);
    device_compressed_ptrs = nullptr;

    if (host_compressed_bytes)
      CuMemFreeHost (host_compressed_bytes);
    host_compressed_bytes = nullptr;

    if (host_compressed_ptrs)
      CuMemFreeHost (host_compressed_ptrs);
    host_compressed_ptrs = nullptr;

    if (device_uncompressed)
      CuMemFree ((CUdeviceptr) device_uncompressed);
    device_uncompressed = nullptr;

    if (device_uncompressed_temp)
      CuMemFree ((CUdeviceptr) device_uncompressed_temp);
    device_uncompressed_temp = nullptr;

    if (host_uncompressed)
      CuMemFreeHost (host_uncompressed);
    host_uncompressed = nullptr;

    if (device_uncompressed_bytes)
      CuMemFree ((CUdeviceptr) device_uncompressed_bytes);
    device_uncompressed_bytes = nullptr;

    if (device_uncompressed_ptrs)
      CuMemFree ((CUdeviceptr) device_uncompressed_ptrs);
    device_uncompressed_ptrs = nullptr;

    if (host_uncompressed_bytes)
      CuMemFreeHost (host_uncompressed_bytes);
    host_uncompressed_bytes = nullptr;

    if (host_uncompressed_ptrs)
      CuMemFreeHost (host_uncompressed_ptrs);
    host_uncompressed_ptrs = nullptr;

    if (device_actual_uncompressed_bytes)
      CuMemFree ((CUdeviceptr) device_actual_uncompressed_bytes);
    device_actual_uncompressed_bytes = nullptr;

    if (temp_ptr)
      CuMemFree ((CUdeviceptr) temp_ptr);
    temp_ptr = nullptr;

    if (device_statuses)
      CuMemFree ((CUdeviceptr) device_statuses);
    device_statuses = nullptr;

    batch_size = 0;
    max_compressed_chunk_size = 0;
    max_uncompressed_chunk_size = 0;
  }
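  /* Allocates all per-batch buffers up front: contiguous compressed and
   * uncompressed staging areas (each chunk slot rounded up to 8 bytes), plus
   * host/device tables holding per-chunk sizes and pointers. The pointer
   * tables are filled on the host and copied once to the device, since the
   * nvCOMP batched API expects device-resident arrays of chunk pointers. */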
  bool allocate_batched (size_t num_chunks, size_t compressed_chunk_size,
      size_t uncompressed_chunk_size, size_t temp_bytes)
  {
    size_t compressed_alloc;
    size_t uncompressed_alloc;
    size_t alloc_size = num_chunks * sizeof (size_t);
    uint8_t *src;

    compressed_chunk_size = GST_ROUND_UP_8 (compressed_chunk_size);
    uncompressed_chunk_size = GST_ROUND_UP_8 (uncompressed_chunk_size);

    compressed_alloc = num_chunks * compressed_chunk_size;
    uncompressed_alloc = num_chunks * uncompressed_chunk_size;

    auto ret = CuMemAlloc ((CUdeviceptr *) & device_compressed,
        compressed_alloc);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_compressed, compressed_alloc);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_compressed_bytes, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_compressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_compressed_bytes, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_compressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    src = device_compressed;
    for (size_t i = 0; i < num_chunks; i++) {
      host_compressed_ptrs[i] = src;
      src += compressed_chunk_size;
    }

    ret = CuMemcpyHtoD ((CUdeviceptr) device_compressed_ptrs,
        host_compressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_uncompressed_temp,
        uncompressed_alloc);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_uncompressed,
        uncompressed_alloc);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_uncompressed, uncompressed_alloc);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_uncompressed_bytes, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_uncompressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_uncompressed_bytes, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAllocHost ((void **) &host_uncompressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    src = device_uncompressed_temp;
    for (size_t i = 0; i < num_chunks; i++) {
      host_uncompressed_bytes[i] = uncompressed_chunk_size;
      host_uncompressed_ptrs[i] = src;
      src += uncompressed_chunk_size;
    }

    ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_bytes,
        host_uncompressed_bytes, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_ptrs,
        host_uncompressed_ptrs, alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    ret = CuMemAlloc ((CUdeviceptr *) & device_actual_uncompressed_bytes,
        alloc_size);
    if (!gst_cuda_result (ret))
      return false;

    if (temp_bytes > 0) {
      ret = CuMemAlloc ((CUdeviceptr *) & temp_ptr, temp_bytes);
      if (!gst_cuda_result (ret))
        return false;
    }

    ret = CuMemAlloc ((CUdeviceptr *) & device_statuses,
        sizeof (nvcompStatus_t) * num_chunks);
    if (!gst_cuda_result (ret))
      return false;

    batched = TRUE;
    batch_size = num_chunks;
    temp_size = temp_bytes;
    max_compressed_chunk_size = compressed_chunk_size;
    max_uncompressed_chunk_size = uncompressed_chunk_size;
    compressed_alloc_size = compressed_alloc;
    uncompressed_alloc_size = uncompressed_alloc;

    return true;
  }

  GstCudaContext *ctx = nullptr;

  uint8_t *device_compressed = nullptr;
  uint8_t *host_compressed = nullptr;
  size_t *device_compressed_bytes = nullptr;
  void **device_compressed_ptrs = nullptr;
  size_t *host_compressed_bytes = nullptr;
  void **host_compressed_ptrs = nullptr;

  uint8_t *device_uncompressed = nullptr;
  uint8_t *device_uncompressed_temp = nullptr;
  uint8_t *host_uncompressed = nullptr;
  size_t *device_uncompressed_bytes = nullptr;
  void **device_uncompressed_ptrs = nullptr;
  size_t *host_uncompressed_bytes = nullptr;
  void **host_uncompressed_ptrs = nullptr;

  size_t *device_actual_uncompressed_bytes = nullptr;

  void *temp_ptr = nullptr;
  size_t temp_size = 0;

  nvcompStatus_t *device_statuses = nullptr;

  gboolean batched = FALSE;
  size_t batch_size = 0;
  size_t max_uncompressed_chunk_size = 0;
  size_t max_compressed_chunk_size = 0;
  size_t uncompressed_alloc_size = 0;
  size_t compressed_alloc_size = 0;
};
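/* nvCOMP exposes one C entry point pair per codec
 * (nvcompBatched<Codec>DecompressGetTempSize /
 * nvcompBatched<Codec>DecompressAsync). The virtual interface below
 * type-erases those pairs so set_format() can pick a codec at runtime
 * from the sink caps name. */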
struct BatchedDecompBase
{
  virtual ~BatchedDecompBase () = default;

  virtual nvcompStatus_t get_temp_size (size_t num_chunks,
      size_t max_uncompressed_chunk_bytes, size_t * temp_bytes) = 0;

  virtual nvcompStatus_t decompress (void **device_compressed_ptrs,
      size_t *device_compressed_bytes, size_t *device_uncompressed_bytes,
      size_t *device_actual_uncompressed_bytes, size_t batch_size,
      void *device_temp_ptr, size_t temp_bytes,
      void **device_uncompressed_ptrs, nvcompStatus_t *device_statuses,
      cudaStream_t stream) = 0;
};

template <auto T, auto D>
class BatchedDecomp : public BatchedDecompBase
{
public:
  BatchedDecomp () {}

  nvcompStatus_t get_temp_size (size_t num_chunks,
      size_t max_uncompressed_chunk_bytes, size_t * temp_bytes) override
  {
    return T (num_chunks, max_uncompressed_chunk_bytes, temp_bytes);
  }

  nvcompStatus_t decompress (void **device_compressed_ptrs,
      size_t *device_compressed_bytes, size_t *device_uncompressed_bytes,
      size_t *device_actual_uncompressed_bytes, size_t batch_size,
      void *device_temp_ptr, size_t temp_bytes,
      void **device_uncompressed_ptrs, nvcompStatus_t *device_statuses,
      cudaStream_t stream) override
  {
    return D (device_compressed_ptrs, device_compressed_bytes,
        device_uncompressed_bytes, device_actual_uncompressed_bytes,
        batch_size, device_temp_ptr, temp_bytes, device_uncompressed_ptrs,
        device_statuses, stream);
  }
};

struct GstNvCompVideoDecPrivate
{
  GstNvCompVideoDecPrivate ()
  {
    gst_video_info_init (&info);
  }

  GstCudaContext *ctx = nullptr;
  GstCudaStream *stream = nullptr;

#ifdef HAVE_GST_GL
  GstGLDisplay *gl_display = nullptr;
  GstGLContext *gl_context = nullptr;
  GstGLContext *other_gl_context = nullptr;
#endif

  GstVideoCodecState *state = nullptr;

  std::shared_ptr<nvcompManagerBase> manager;
  std::shared_ptr<BatchedDecompBase> batched_decomp;
  std::shared_ptr<DecoderTask> task;

  gboolean gl_interop = FALSE;
  GstVideoInfo info;
  gboolean batched = FALSE;
  GstNvCompMethod method;
};
/* *INDENT-ON* */

struct _GstNvCompVideoDec
{
  GstVideoDecoder parent;

  GstNvCompVideoDecPrivate *priv;
};

static void gst_nv_comp_video_dec_finalize (GObject * object);
static void gst_nv_comp_video_dec_set_context (GstElement * element,
    GstContext * context);
static gboolean gst_nv_comp_video_dec_open (GstVideoDecoder * decoder);
static gboolean gst_nv_comp_video_dec_close (GstVideoDecoder * decoder);
static gboolean gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder,
    GstQuery * query);
static gboolean gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder,
    GstQuery * query);
static gboolean gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder *
    decoder, GstQuery * query);
static gboolean gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder,
    GstVideoCodecState * state);
static gboolean gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder);
static GstFlowReturn gst_nv_comp_video_dec_handle_frame (GstVideoDecoder *
    decoder, GstVideoCodecFrame * frame);
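/* Illustrative launch line ("nvcompvideoenc" stands in for whatever element
 * produced the video/x-nvcomp* stream; the name is assumed here):
 *
 *   gst-launch-1.0 videotestsrc ! nvcompvideoenc ! nvcompvideodec ! \
 *       videoconvert ! autovideosink
 */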
#define gst_nv_comp_video_dec_parent_class parent_class
G_DEFINE_TYPE (GstNvCompVideoDec,
    gst_nv_comp_video_dec, GST_TYPE_VIDEO_DECODER);

static void
gst_nv_comp_video_dec_class_init (GstNvCompVideoDecClass * klass)
{
  auto object_class = G_OBJECT_CLASS (klass);
  auto element_class = GST_ELEMENT_CLASS (klass);
  auto decoder_class = GST_VIDEO_DECODER_CLASS (klass);

  object_class->finalize = gst_nv_comp_video_dec_finalize;

  gst_element_class_add_static_pad_template (element_class, &sink_template);
  gst_element_class_add_static_pad_template (element_class, &src_template);

  gst_element_class_set_static_metadata (element_class,
      "nvCOMP Video Decoder", "Decoder/Video/Hardware",
      "Decompress a video stream using nvCOMP library",
      "Seungha Yang <seungha@centricular.com>");

  element_class->set_context =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_context);

  decoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_open);
  decoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_close);
  decoder_class->sink_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_sink_query);
  decoder_class->src_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_src_query);
  decoder_class->decide_allocation =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_decide_allocation);
  decoder_class->set_format =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_format);
  decoder_class->negotiate =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_negotiate);
  decoder_class->handle_frame =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_handle_frame);

  GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_dec_debug, "nvcompvideodec",
      0, "nvcompvideodec");
}

static void
gst_nv_comp_video_dec_init (GstNvCompVideoDec * self)
{
  self->priv = new GstNvCompVideoDecPrivate ();
}

static void
gst_nv_comp_video_dec_finalize (GObject * object)
{
  auto self = GST_NV_COMP_VIDEO_DEC (object);

  delete self->priv;

  G_OBJECT_CLASS (parent_class)->finalize (object);
}

static void
gst_nv_comp_video_dec_set_context (GstElement * element, GstContext * context)
{
  auto self = GST_NV_COMP_VIDEO_DEC (element);
  auto priv = self->priv;

  gst_cuda_handle_set_context (element, context, -1, &priv->ctx);

#ifdef HAVE_GST_GL
  if (gst_gl_handle_set_context (element, context, &priv->gl_display,
          &priv->other_gl_context)) {
    if (priv->gl_display)
      gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
  }
#endif

  GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
}

static gboolean
gst_nv_comp_video_dec_open (GstVideoDecoder * decoder)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;

  if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (decoder), -1,
          &priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't get cuda context");
    return FALSE;
  }

  priv->stream = gst_cuda_stream_new (priv->ctx);

  return TRUE;
}

static gboolean
gst_nv_comp_video_dec_close (GstVideoDecoder * decoder)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;

  if (priv->ctx) {
    gst_cuda_context_push (priv->ctx);
    priv->manager = nullptr;
    priv->task = nullptr;
    gst_cuda_context_pop (nullptr);
  }

  gst_clear_cuda_stream (&priv->stream);
  gst_clear_object (&priv->ctx);

#ifdef HAVE_GST_GL
  gst_clear_object (&priv->other_gl_context);
  gst_clear_object (&priv->gl_context);
  gst_clear_object (&priv->gl_display);
#endif

  return TRUE;
}
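/* Answers both GL and CUDA context queries so that neighbouring elements
 * can share this element's GstGLDisplay/GstGLContext and GstCudaContext
 * instead of creating their own. */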
static gboolean
gst_nv_comp_video_dec_handle_context_query (GstNvCompVideoDec * self,
    GstQuery * query)
{
  auto priv = self->priv;

#ifdef HAVE_GST_GL
  {
    GstGLDisplay *display = nullptr;
    GstGLContext *other = nullptr;
    GstGLContext *local = nullptr;

    if (priv->gl_display)
      display = (GstGLDisplay *) gst_object_ref (priv->gl_display);
    if (priv->gl_context)
      local = (GstGLContext *) gst_object_ref (priv->gl_context);
    if (priv->other_gl_context)
      other = (GstGLContext *) gst_object_ref (priv->other_gl_context);

    auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query,
        display, local, other);
    gst_clear_object (&display);
    gst_clear_object (&other);
    gst_clear_object (&local);

    if (ret)
      return TRUE;
  }
#endif

  if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx))
    return TRUE;

  return FALSE;
}

static gboolean
gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);

  switch (GST_QUERY_TYPE (query)) {
    case GST_QUERY_CONTEXT:
      if (gst_nv_comp_video_dec_handle_context_query (self, query))
        return TRUE;
      break;
    default:
      break;
  }

  return GST_VIDEO_DECODER_CLASS (parent_class)->sink_query (decoder, query);
}

static gboolean
gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);

  switch (GST_QUERY_TYPE (query)) {
    case GST_QUERY_CONTEXT:
      if (gst_nv_comp_video_dec_handle_context_query (self, query))
        return TRUE;
      break;
    default:
      break;
  }

  return GST_VIDEO_DECODER_CLASS (parent_class)->src_query (decoder, query);
}

#ifdef HAVE_GST_GL
static void
check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret)
{
  guint device_count = 0;
  CUdevice device_list[1] = { 0, };
  CUresult cuda_ret;

  *ret = FALSE;

  cuda_ret = CuGLGetDevices (&device_count,
      device_list, 1, CU_GL_DEVICE_LIST_ALL);
  if (!gst_cuda_result (cuda_ret) || device_count == 0)
    return;

  *ret = TRUE;
}

static gboolean
gst_nv_comp_video_dec_ensure_gl_context (GstNvCompVideoDec * self)
{
  auto priv = self->priv;
  gboolean ret = FALSE;

  if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display,
          &priv->other_gl_context)) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL display");
    return FALSE;
  }

  gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);

  if (!gst_gl_display_ensure_context (priv->gl_display,
          priv->other_gl_context, &priv->gl_context, nullptr)) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL context");
    return FALSE;
  }

  gst_gl_context_thread_add (priv->gl_context,
      (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret);

  return ret;
}
#endif
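/* Buffer pool selection, in order of preference: a CUDA buffer pool when
 * downstream advertises CUDA memory caps, a GL buffer pool when downstream
 * advertises GL memory and CUDA/GL interop is usable, otherwise a plain
 * system-memory video buffer pool. */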
static gboolean
gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder,
    GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;
  GstBufferPool *pool = nullptr;
  guint size;
  guint min = 0;
  guint max = 0;
  GstCaps *caps;

  gst_query_parse_allocation (query, &caps, nullptr);
  if (!caps) {
    GST_WARNING_OBJECT (self, "null caps in query");
    return FALSE;
  }

  GstVideoInfo info;
  if (!gst_video_info_from_caps (&info, caps)) {
    GST_WARNING_OBJECT (self, "Failed to convert caps into info");
    return FALSE;
  }

  gboolean update_pool = FALSE;
  if (gst_query_get_n_allocation_pools (query) > 0) {
    gst_query_parse_nth_allocation_pool (query, 0, &pool, &size, &min, &max);
    update_pool = TRUE;
  }

  auto features = gst_caps_get_features (caps, 0);
  gboolean use_cuda_pool = FALSE;
  if (gst_caps_features_contains (features,
          GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
    GST_DEBUG_OBJECT (self, "Downstream support CUDA memory");
    if (pool) {
      if (!GST_IS_CUDA_BUFFER_POOL (pool)) {
        gst_clear_object (&pool);
      } else {
        auto cuda_pool = GST_CUDA_BUFFER_POOL (pool);
        if (cuda_pool->context != priv->ctx)
          gst_clear_object (&pool);
      }
    }

    if (!pool)
      pool = gst_cuda_buffer_pool_new (priv->ctx);

    use_cuda_pool = TRUE;
  }
#ifdef HAVE_GST_GL
  else if (gst_caps_features_contains (features,
          GST_CAPS_FEATURE_MEMORY_GL_MEMORY) && priv->gl_interop) {
    GST_DEBUG_OBJECT (self, "Downstream support GL memory");

    if (!gst_nv_comp_video_dec_ensure_gl_context (self)) {
      priv->gl_interop = FALSE;
    } else {
      if (pool && !GST_IS_GL_BUFFER_POOL (pool))
        gst_clear_object (&pool);

      if (!pool)
        pool = gst_gl_buffer_pool_new (priv->gl_context);
    }
  }
#endif

  if (!pool)
    pool = gst_video_buffer_pool_new ();

  auto config = gst_buffer_pool_get_config (pool);
  size = GST_VIDEO_INFO_SIZE (&info);
  gst_buffer_pool_config_set_params (config, caps, size, 0, 0);

  if (use_cuda_pool && priv->stream) {
    /* Set our stream on buffer pool config so that CUstream can be shared */
    gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
  }

  if (!gst_buffer_pool_set_config (pool, config)) {
    GST_WARNING_OBJECT (self, "Failed to set pool config");
    gst_object_unref (pool);
    return FALSE;
  }

  config = gst_buffer_pool_get_config (pool);
  gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr);
  gst_structure_free (config);

  if (update_pool)
    gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
  else
    gst_query_add_allocation_pool (query, pool, size, min, max);

  gst_object_unref (pool);

  return TRUE;
}

static gboolean
gst_nv_comp_video_dec_alloc_task (GstNvCompVideoDec * self, DecoderTask * task,
    gboolean batched, gsize size)
{
  if (batched)
    return TRUE;

  task->uncompressed_alloc_size = size;
  auto cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed,
      size);
  if (!gst_cuda_result (cuda_ret))
    return FALSE;

  cuda_ret = CuMemAllocHost ((void **) &task->host_uncompressed, size);
  if (!gst_cuda_result (cuda_ret))
    return FALSE;

  task->compressed_alloc_size = size;
  cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, size);
  if (!gst_cuda_result (cuda_ret))
    return FALSE;

  cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, size);
  if (!gst_cuda_result (cuda_ret))
    return FALSE;

  return TRUE;
}
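/* The sink caps structure name selects the decompression path:
 * "video/x-nvcomp" uses the high-level nvCOMP manager interface (the codec
 * is described by the nvCOMP frame header itself), while the
 * "video/x-nvcomp-<codec>" variants map to the corresponding batched
 * low-level API. The "format" field carries the raw video format that the
 * payload decompresses to. */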
static gboolean
gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder,
    GstVideoCodecState * state)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;

  if (!priv->ctx) {
    GST_ERROR_OBJECT (self, "CUDA context was not configured");
    return FALSE;
  }

  GST_DEBUG_OBJECT (self, "Set format with caps %" GST_PTR_FORMAT,
      state->caps);

  g_clear_pointer (&priv->state, gst_video_codec_state_unref);
  priv->state = gst_video_codec_state_ref (state);

  auto s = gst_caps_get_structure (state->caps, 0);
  std::string mime_type = gst_structure_get_name (s);
  auto format_str = gst_structure_get_string (s, "format");
  if (!format_str) {
    GST_ERROR_OBJECT (self, "Unknown video format");
    return FALSE;
  }

  GstVideoFormat format = gst_video_format_from_string (format_str);
  if (format == GST_VIDEO_FORMAT_UNKNOWN || format == GST_VIDEO_FORMAT_ENCODED) {
    GST_ERROR_OBJECT (self, "Invalid format string %s", format_str);
    return FALSE;
  }

  s = gst_structure_copy (s);
  gst_structure_set_name (s, "video/x-raw");
  auto video_caps = gst_caps_new_empty ();
  gst_caps_append_structure (video_caps, s);

  auto ret = gst_video_info_from_caps (&priv->info, video_caps);
  gst_caps_unref (video_caps);

  if (!ret) {
    GST_ERROR_OBJECT (self, "Couldn't build output caps");
    return FALSE;
  }

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return FALSE;
  }

  priv->manager = nullptr;
  priv->batched_decomp = nullptr;
  priv->task = nullptr;
  priv->batched = TRUE;

  if (mime_type == "video/x-nvcomp") {
    priv->batched = FALSE;
  } else if (mime_type == "video/x-nvcomp-lz4") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedLZ4DecompressGetTempSize,
        nvcompBatchedLZ4DecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-snappy") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedSnappyDecompressGetTempSize,
        nvcompBatchedSnappyDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-gdeflate") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedGdeflateDecompressGetTempSize,
        nvcompBatchedGdeflateDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-deflate") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedDeflateDecompressGetTempSize,
        nvcompBatchedDeflateDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-zstd") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedZstdDecompressGetTempSize,
        nvcompBatchedZstdDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-cascaded") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedCascadedDecompressGetTempSize,
        nvcompBatchedCascadedDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-bitcomp") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedBitcompDecompressGetTempSize,
        nvcompBatchedBitcompDecompressAsync>> ();
  } else if (mime_type == "video/x-nvcomp-ans") {
    priv->batched_decomp = std::make_shared<BatchedDecomp
        <nvcompBatchedANSDecompressGetTempSize,
        nvcompBatchedANSDecompressAsync>> ();
  } else {
    gst_cuda_context_pop (nullptr);
    g_assert_not_reached ();
    return FALSE;
  }

  auto task = std::make_shared<DecoderTask> ();
  task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx);
  if (!gst_nv_comp_video_dec_alloc_task (self, task.get (), priv->batched,
          priv->info.size)) {
    task = nullptr;
    gst_cuda_context_pop (nullptr);
    return FALSE;
  }

  priv->task = task;
  gst_cuda_context_pop (nullptr);

  return gst_video_decoder_negotiate (decoder);
}
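/* Raw video formats for which CUDA memory output is offered; other formats
 * fall back to system memory even when downstream supports CUDA memory. */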
static gboolean
is_supported_cuda_format (GstVideoFormat format)
{
  switch (format) {
    case GST_VIDEO_FORMAT_I420:
    case GST_VIDEO_FORMAT_YV12:
    case GST_VIDEO_FORMAT_NV12:
    case GST_VIDEO_FORMAT_NV21:
    case GST_VIDEO_FORMAT_P010_10LE:
    case GST_VIDEO_FORMAT_P012_LE:
    case GST_VIDEO_FORMAT_P016_LE:
    case GST_VIDEO_FORMAT_I420_10LE:
    case GST_VIDEO_FORMAT_I420_12LE:
    case GST_VIDEO_FORMAT_Y444:
    case GST_VIDEO_FORMAT_Y444_10LE:
    case GST_VIDEO_FORMAT_Y444_12LE:
    case GST_VIDEO_FORMAT_Y444_16LE:
    case GST_VIDEO_FORMAT_BGRA:
    case GST_VIDEO_FORMAT_RGBA:
    case GST_VIDEO_FORMAT_RGBx:
    case GST_VIDEO_FORMAT_BGRx:
    case GST_VIDEO_FORMAT_ARGB:
    case GST_VIDEO_FORMAT_ABGR:
    case GST_VIDEO_FORMAT_RGB:
    case GST_VIDEO_FORMAT_BGR:
    case GST_VIDEO_FORMAT_BGR10A2_LE:
    case GST_VIDEO_FORMAT_RGB10A2_LE:
    case GST_VIDEO_FORMAT_Y42B:
    case GST_VIDEO_FORMAT_I422_10LE:
    case GST_VIDEO_FORMAT_I422_12LE:
    case GST_VIDEO_FORMAT_YUY2:
    case GST_VIDEO_FORMAT_UYVY:
    case GST_VIDEO_FORMAT_RGBP:
    case GST_VIDEO_FORMAT_BGRP:
    case GST_VIDEO_FORMAT_GBR:
    case GST_VIDEO_FORMAT_GBR_10LE:
    case GST_VIDEO_FORMAT_GBR_12LE:
    case GST_VIDEO_FORMAT_GBR_16LE:
    case GST_VIDEO_FORMAT_GBRA:
    case GST_VIDEO_FORMAT_VUYA:
      return TRUE;
    default:
      break;
  }

  return FALSE;
}

#ifdef HAVE_GST_GL
static gboolean
is_supported_gl_format (GstVideoFormat format)
{
  auto gl_caps = gst_caps_from_string ("video/x-raw, format = (string) "
      GST_GL_COLOR_CONVERT_FORMATS);
  auto our_caps = gst_caps_new_empty_simple ("video/x-raw");
  gst_caps_set_simple (our_caps, "format", G_TYPE_STRING,
      gst_video_format_to_string (format), nullptr);

  auto ret = gst_caps_is_subset (our_caps, gl_caps);
  gst_caps_unref (gl_caps);
  gst_caps_unref (our_caps);

  return ret;
}
#endif

static gboolean
gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;
  gboolean is_cuda = FALSE;
#ifdef HAVE_GST_GL
  gboolean is_gl = FALSE;
#endif

  auto peer_caps = gst_pad_get_allowed_caps (decoder->srcpad);
  GST_DEBUG_OBJECT (self, "Allowed caps %" GST_PTR_FORMAT, peer_caps);

  if (!peer_caps || gst_caps_is_any (peer_caps)) {
    GST_DEBUG_OBJECT (self,
        "cannot determine output format, use system memory");
  } else {
    GstCapsFeatures *features;
    guint size = gst_caps_get_size (peer_caps);
    guint i;

    for (i = 0; i < size; i++) {
      features = gst_caps_get_features (peer_caps, i);
      if (!features)
        continue;

      if (gst_caps_features_contains (features,
              GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
        is_cuda = TRUE;
      }
#ifdef HAVE_GST_GL
      if (gst_caps_features_contains (features,
              GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
        is_gl = TRUE;
      }
#endif
    }
  }
  gst_clear_caps (&peer_caps);

  auto state = gst_video_decoder_set_interlaced_output_state (decoder,
      GST_VIDEO_INFO_FORMAT (&priv->info),
      GST_VIDEO_INFO_INTERLACE_MODE (&priv->info),
      priv->info.width, priv->info.height, priv->state);

  if (!state) {
    GST_ERROR_OBJECT (self, "Couldn't set output state");
    return FALSE;
  }

  priv->gl_interop = FALSE;
  state->caps = gst_video_info_to_caps (&state->info);

  auto format = GST_VIDEO_INFO_FORMAT (&priv->info);
  if (is_cuda && is_supported_cuda_format (format)) {
    gst_caps_set_features_simple (state->caps,
        gst_caps_features_new_static_str (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY,
            nullptr));
  }
#ifdef HAVE_GST_GL
  else if (is_gl && is_supported_gl_format (format)) {
    gst_caps_set_features_simple (state->caps,
        gst_caps_features_new_static_str (GST_CAPS_FEATURE_MEMORY_GL_MEMORY,
            nullptr));
    priv->gl_interop = TRUE;
  }
#endif

  return GST_VIDEO_DECODER_CLASS (parent_class)->negotiate (decoder);
}
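/* Copies the decompressed image from the task's flat buffer into the mapped
 * output frame, plane by plane. Handles three layouts: paletted formats
 * (fixed 256-entry, 4-byte palette on the second plane), tiled formats
 * (copied tile by tile via the tile index), and plain pitched planes (2D
 * copy on device, row-by-row memcpy on host). */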
static gboolean
gst_nv_comp_video_dec_download (GstNvCompVideoDec * self,
    GstVideoFrame * frame, CUstream stream, gboolean is_device_copy)
{
  auto priv = self->priv;
  auto info = &priv->info;
  auto finfo = info->finfo;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  CUresult ret = CUDA_SUCCESS;
  auto task = priv->task;

  for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
    guint8 *sp;
    if (is_device_copy)
      sp = task->device_uncompressed + info->offset[i];
    else
      sp = task->host_uncompressed + info->offset[i];
    guint8 *dp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
    guint ss, ds;
    guint w, h;

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      if (is_device_copy) {
        ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
            256 * 4, stream);
      } else {
        memcpy (dp, sp, 256 * 4);
      }

      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        return FALSE;
      }

      return TRUE;
    }

    ds = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
    ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          if (is_device_copy) {
            ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp,
                (CUdeviceptr) cur_sp, w, stream);
          } else {
            memcpy (cur_dp, cur_sp, w);
          }

          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            return FALSE;
          }
        }
      }
    } else {
      if (is_device_copy) {
        CUDA_MEMCPY2D params = { };
        params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        params.srcDevice = (CUdeviceptr) sp;
        params.srcPitch = ss;
        params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        params.dstDevice = (CUdeviceptr) dp;
        params.dstPitch = ds;
        params.WidthInBytes = w;
        params.Height = h;

        ret = CuMemcpy2DAsync (&params, stream);
        if (!gst_cuda_result (ret)) {
          GST_ERROR_OBJECT (self, "CUDA memcpy failed");
          return FALSE;
        }
      } else {
        for (guint j = 0; j < h; j++) {
          memcpy (dp, sp, w);
          dp += ds;
          sp += ss;
        }
      }
    }
  }

  return TRUE;
}

#ifdef HAVE_GST_GL
struct GLInteropData
{
  GstNvCompVideoDec *self = nullptr;
  GstBuffer *buffer = nullptr;
  gboolean ret = FALSE;
};

static GstCudaGraphicsResource *
ensure_gl_cuda_resource (GstNvCompVideoDec * self, GstMemory * mem)
{
  auto priv = self->priv;
  GstCudaGraphicsResource *resource;
  GQuark quark;

  if (!gst_is_gl_memory_pbo (mem)) {
    GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s",
        mem->allocator->mem_type);
    return nullptr;
  }

  quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);
  resource = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark);
  if (!resource) {
    GstMapInfo map_info;
    GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
    GstGLBuffer *gl_buf = pbo->pbo;
    gboolean ret;

    if (!gst_memory_map (mem, &map_info,
            (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
      GST_ERROR_OBJECT (self, "Couldn't map gl memory");
      return nullptr;
    }

    resource = gst_cuda_graphics_resource_new (priv->ctx,
        GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
        GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);

    GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id);
    ret = gst_cuda_graphics_resource_register_gl_buffer (resource,
        gl_buf->id, CU_GRAPHICS_REGISTER_FLAGS_NONE);
    gst_memory_unmap (mem, &map_info);

    if (!ret) {
      GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id);
      gst_cuda_graphics_resource_free (resource);
      return nullptr;
    }

    gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource,
        (GDestroyNotify) gst_cuda_graphics_resource_free);
  }

  return resource;
}
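/* Runs on the GL thread: maps each plane's PBO as a CUDA graphics resource,
 * copies the decompressed planes device-to-device into the mapped pointers,
 * then flags the memories so GL re-uploads the PBO contents to the texture
 * on next use. */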
static void
gst_nv_comp_video_dec_download_gl (GstGLContext * context,
    GLInteropData * data)
{
  auto self = data->self;
  auto priv = self->priv;
  auto info = &priv->info;
  auto finfo = info->finfo;
  GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, };
  CUstream stream = gst_cuda_stream_get_handle (priv->stream);
  CUresult ret;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  auto task = priv->task;

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return;
  }

  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    GstMemory *mem = gst_buffer_peek_memory (data->buffer, i);
    gsize src_size;

    if (!gst_is_gl_memory_pbo (mem)) {
      GST_ERROR_OBJECT (self, "Not a GL PBO memory");
      goto out;
    }

    gst_res[i] = ensure_gl_cuda_resource (self, mem);
    if (!gst_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't get resource %d", i);
      goto out;
    }

    cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD);
    if (!cuda_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't map resource");
      goto out;
    }

    ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], &src_size,
        cuda_res[i]);
    if (!gst_cuda_result (ret)) {
      GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer");
      goto out;
    }

    /* Need PBO -> texture */
    GST_MINI_OBJECT_FLAG_SET (mem, GST_GL_BASE_MEMORY_TRANSFER_NEED_UPLOAD);
  }

  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    guint8 *sp = task->device_uncompressed + info->offset[i];
    guint8 *dp = (guint8 *) src_devptr[i];
    guint ss, ds;
    guint w, h;

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
          256 * 4, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }

      data->ret = TRUE;
      goto out;
    }

    auto meta = gst_buffer_get_video_meta (data->buffer);
    if (meta)
      ds = meta->stride[i];
    else
      ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
    ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp,
              (CUdeviceptr) cur_sp, w, stream);
          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            goto out;
          }
        }
      }
    } else {
      CUDA_MEMCPY2D params = { };
      params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      params.srcDevice = (CUdeviceptr) sp;
      params.srcPitch = ss;
      params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      params.dstDevice = (CUdeviceptr) dp;
      params.dstPitch = ds;
      params.WidthInBytes = w;
      params.Height = h;

      ret = CuMemcpy2DAsync (&params, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }
    }
  }

  data->ret = TRUE;

out:
  for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) {
    if (!gst_res[i])
      break;

    gst_cuda_graphics_resource_unmap (gst_res[i], stream);
  }

  CuStreamSynchronize (stream);
  gst_cuda_context_pop (nullptr);
}
#endif
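/* Batched bitstream layout, all fields little-endian:
 *   u32 version
 *   u32 uncompressed chunk size
 *   u32 max compressed chunk size
 *   u32 batch size (number of chunks)
 *   batch-size pairs of { u32 uncompressed size, u32 compressed size }
 *   concatenated compressed chunk payloads
 */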
struct ChunkData
{
  size_t uncomp_size = 0;
  size_t comp_size = 0;
  size_t offset = 0;
};

static gboolean
gst_nv_comp_video_dec_parse_header (GstNvCompVideoDec * self,
    const guint8 * data, gsize size, size_t & uncompressed_chunk_size,
    size_t & max_compressed_chunk_size, size_t & batch_size,
    std::vector < ChunkData > &compressed_chunks)
{
  guint32 val;
  const guint8 *ptr = data;
  gsize remaining = size;

  if (size <= GST_NV_COMP_HEADER_MIN_SIZE) {
    GST_ERROR_OBJECT (self, "Too small size");
    return FALSE;
  }

  val = GST_READ_UINT32_LE (ptr);
  if (val != GST_NV_COMP_HEADER_VERSION) {
    GST_ERROR_OBJECT (self, "Invalid version");
    return FALSE;
  }

  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  uncompressed_chunk_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  max_compressed_chunk_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  batch_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  compressed_chunks.resize (batch_size);
  size_t total_compressed_size = 0;
  for (size_t i = 0; i < batch_size; i++) {
    if (remaining < sizeof (guint32))
      return FALSE;

    compressed_chunks[i].uncomp_size = GST_READ_UINT32_LE (ptr);
    ptr += sizeof (guint32);
    remaining -= sizeof (guint32);

    if (remaining < sizeof (guint32))
      return FALSE;

    compressed_chunks[i].comp_size = GST_READ_UINT32_LE (ptr);
    total_compressed_size += compressed_chunks[i].comp_size;
    ptr += sizeof (guint32);
    remaining -= sizeof (guint32);
  }

  if (remaining != total_compressed_size) {
    GST_ERROR_OBJECT (self, "Size mismatch, remaining: %" G_GSIZE_FORMAT
        ", total compressed: %" G_GSIZE_FORMAT, remaining,
        total_compressed_size);
    return FALSE;
  }

  for (size_t i = 0; i < batch_size; i++) {
    compressed_chunks[i].offset = ptr - data;
    ptr += compressed_chunks[i].comp_size;
  }

  return TRUE;
}

static GstFlowReturn
gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder,
    GstVideoCodecFrame * frame)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;
  CUstream stream = nullptr;
  GstVideoFrame vframe;
  GstMapInfo map_info;
  CUresult cuda_ret;
  gboolean need_copy = TRUE;
  GstMemory *mem;
  nvcompStatus_t status;
  auto task = priv->task;
  GstFlowReturn ret;

  if (!priv->ctx || !priv->task) {
    GST_ERROR_OBJECT (self, "Context was not configured");
    goto error;
  }

  ret = gst_video_decoder_allocate_output_frame (decoder, frame);
  if (ret != GST_FLOW_OK) {
    gst_video_decoder_release_frame (decoder, frame);
    return ret;
  }

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    goto error;
  }

  stream = gst_cuda_stream_get_handle (priv->stream);

  if (!gst_buffer_map (frame->input_buffer, &map_info, GST_MAP_READ)) {
    GST_ERROR_OBJECT (self, "Couldn't map input buffer");
    gst_cuda_context_pop (nullptr);
    goto error;
  }

  if (priv->batched) {
    g_assert (priv->batched_decomp);

    /* Parse custom header */
    size_t uncompressed_chunk_size;
    size_t max_compressed_chunk_size;
    size_t batch_size;
    std::vector < ChunkData > compressed_chunks;
    guint8 *mapped_data = map_info.data;
    uint8_t *uncompressed;

    if (!gst_nv_comp_video_dec_parse_header (self, mapped_data, map_info.size,
            uncompressed_chunk_size, max_compressed_chunk_size, batch_size,
            compressed_chunks)) {
      gst_buffer_unmap (frame->input_buffer, &map_info);
      gst_cuda_context_pop (nullptr);
      goto error;
    }

    GST_LOG_OBJECT (self, "batch size %" G_GSIZE_FORMAT
        ", uncompressed-chunk-size %" G_GSIZE_FORMAT
        ", compressed-chunk-size %" G_GSIZE_FORMAT, batch_size,
        uncompressed_chunk_size, max_compressed_chunk_size);
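    /* Device resources are kept across frames and only re-created when the
     * batch geometry (chunk count or chunk sizes) grows beyond what was
     * allocated. */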
    if (task->batch_size < batch_size ||
        task->max_uncompressed_chunk_size < uncompressed_chunk_size ||
        task->max_compressed_chunk_size < max_compressed_chunk_size) {
      task->clear_resource ();
    }

    if (task->batch_size == 0) {
      size_t temp_size = 0;
      GST_DEBUG_OBJECT (self, "Allocating resource");
      status = priv->batched_decomp->get_temp_size (batch_size,
          uncompressed_chunk_size, &temp_size);
      if (status != nvcompSuccess) {
        GST_ERROR_OBJECT (self, "Couldn't get temp size");
        gst_buffer_unmap (frame->input_buffer, &map_info);
        gst_cuda_context_pop (nullptr);
        goto error;
      }

      if (!task->allocate_batched (batch_size, max_compressed_chunk_size,
              uncompressed_chunk_size, temp_size)) {
        GST_ERROR_OBJECT (self, "Couldn't allocate resource");
        gst_buffer_unmap (frame->input_buffer, &map_info);
        gst_cuda_context_pop (nullptr);
        goto error;
      }
    }

    for (size_t i = 0; i < batch_size; i++) {
      memcpy (task->host_compressed + (i * task->max_compressed_chunk_size),
          mapped_data + compressed_chunks[i].offset,
          compressed_chunks[i].comp_size);
      task->host_compressed_bytes[i] = compressed_chunks[i].comp_size;
    }

    gst_buffer_unmap (frame->input_buffer, &map_info);

    for (size_t i = 0; i < batch_size; i++) {
      GST_LOG_OBJECT (self, "Uploading chunk %" G_GSIZE_FORMAT
          ", size %" G_GSIZE_FORMAT, i, compressed_chunks[i].comp_size);

      auto offset = i * task->max_compressed_chunk_size;
      cuda_ret =
          CuMemcpyHtoDAsync ((CUdeviceptr) (task->device_compressed + offset),
          task->host_compressed + offset, compressed_chunks[i].comp_size,
          stream);
      if (!gst_cuda_result (cuda_ret)) {
        gst_cuda_context_pop (nullptr);
        goto error;
      }
    }

    cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed_bytes,
        task->host_compressed_bytes, sizeof (size_t) * batch_size, stream);
    if (!gst_cuda_result (cuda_ret)) {
      gst_cuda_context_pop (nullptr);
      goto error;
    }

    status = priv->batched_decomp->decompress (task->device_compressed_ptrs,
        task->device_compressed_bytes, task->device_uncompressed_bytes,
        task->device_actual_uncompressed_bytes, batch_size, task->temp_ptr,
        task->temp_size, task->device_uncompressed_ptrs,
        task->device_statuses, (cudaStream_t) stream);
    if (status != nvcompSuccess) {
      GST_ERROR_OBJECT (self, "Couldn't decompress stream, status: %d",
          status);
      gst_cuda_context_pop (nullptr);
      goto error;
    }

    uncompressed = task->device_uncompressed;
    for (size_t i = 0; i < batch_size; i++) {
      auto size = compressed_chunks[i].uncomp_size;
      cuda_ret = CuMemcpyDtoDAsync ((CUdeviceptr) uncompressed,
          (CUdeviceptr) task->host_uncompressed_ptrs[i], size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        gst_cuda_context_pop (nullptr);
        goto error;
      }

      uncompressed += size;
    }
  } else {
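    /* Non-batched path: the whole payload is an nvCOMP HLIF frame. Upload it
     * once, then let the nvCOMP manager (created on first use from the frame
     * header embedded in the device buffer) configure and run the
     * decompression. */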
    if (task->compressed_alloc_size < map_info.size) {
      if (task->device_compressed)
        CuMemFree ((CUdeviceptr) task->device_compressed);
      task->device_compressed = nullptr;

      if (task->host_compressed)
        CuMemFreeHost (task->host_compressed);
      task->host_compressed = nullptr;

      task->compressed_alloc_size = GST_ROUND_UP_128 (map_info.size);

      auto cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
          task->compressed_alloc_size);
      if (!gst_cuda_result (cuda_ret)) {
        gst_buffer_unmap (frame->input_buffer, &map_info);
        gst_cuda_context_pop (nullptr);
        goto error;
      }

      cuda_ret = CuMemAllocHost ((void **) &task->host_compressed,
          task->compressed_alloc_size);
      if (!gst_cuda_result (cuda_ret)) {
        gst_buffer_unmap (frame->input_buffer, &map_info);
        gst_cuda_context_pop (nullptr);
        goto error;
      }
    }

    memcpy (task->host_compressed, map_info.data, map_info.size);
    cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed,
        task->host_compressed, map_info.size, stream);
    gst_buffer_unmap (frame->input_buffer, &map_info);
    if (!gst_cuda_result (cuda_ret)) {
      GST_ERROR_OBJECT (self, "Couldn't copy compressed memory");
      gst_cuda_context_pop (nullptr);
      goto error;
    }

    if (!priv->manager) {
      priv->manager = create_manager (task->device_compressed,
          (cudaStream_t) stream);
    }

    {
      auto config =
          priv->manager->configure_decompression (task->device_compressed);
      if (config.decomp_data_size != priv->info.size) {
        GST_ERROR_OBJECT (self, "size mismatch, expected %" G_GSIZE_FORMAT
            ", required %" G_GSIZE_FORMAT, priv->info.size,
            config.decomp_data_size);
        gst_cuda_context_pop (nullptr);
        goto error;
      }

      priv->manager->decompress (task->device_uncompressed,
          task->device_compressed, config);
    }
  }

  mem = gst_buffer_peek_memory (frame->output_buffer, 0);
#ifdef HAVE_GST_GL
  if (priv->gl_interop &&
      gst_buffer_n_memory (frame->output_buffer) ==
      GST_VIDEO_INFO_N_PLANES (&priv->info)) {
    GLInteropData interop_data;
    interop_data.self = self;
    interop_data.buffer = frame->output_buffer;
    interop_data.ret = FALSE;

    auto gl_mem = (GstGLMemory *) mem;
    gst_gl_context_thread_add (gl_mem->mem.context,
        (GstGLContextThreadFunc) gst_nv_comp_video_dec_download_gl,
        &interop_data);
    if (interop_data.ret) {
      need_copy = FALSE;
      GST_TRACE_OBJECT (self, "CUDA -> GL copy done");
    } else {
      priv->gl_interop = FALSE;
    }
  }
#endif

  if (need_copy) {
    GstMapFlags map_flags = GST_MAP_WRITE;
    gboolean device_copy = FALSE;
    gboolean do_sync = TRUE;

    if (gst_is_cuda_memory (mem)) {
      auto cmem = GST_CUDA_MEMORY_CAST (mem);
      if (cmem->context == priv->ctx) {
        map_flags = (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA);
        device_copy = TRUE;

        auto mem_stream = gst_cuda_memory_get_stream (cmem);
        if (mem_stream && mem_stream == priv->stream)
          do_sync = FALSE;
      }
    }

    if (!device_copy) {
      cuda_ret = CuMemcpyDtoHAsync (task->host_uncompressed,
          (CUdeviceptr) task->device_uncompressed, priv->info.size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't download image");
        gst_cuda_context_pop (nullptr);
        goto error;
      }

      CuStreamSynchronize (stream);
      do_sync = FALSE;
    }

    gst_video_frame_map (&vframe, &priv->info, frame->output_buffer,
        map_flags);
    gst_nv_comp_video_dec_download (self, &vframe, stream, device_copy);
    if (do_sync)
      CuStreamSynchronize (stream);
    gst_video_frame_unmap (&vframe);
  }

  gst_cuda_context_pop (nullptr);

  return gst_video_decoder_finish_frame (decoder, frame);

error:
  gst_video_decoder_release_frame (decoder, frame);

  return GST_FLOW_ERROR;
}