diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
index 45f6ba1..afe0b30 100644
--- a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -241,7 +241,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
     // What happens to those decode surfaces that haven't yet been mapped is
     // unclear.
     flush();
-    unmapPreviousFrame();
+    // unmapPreviousFrame();
     NVDECCache::getCache(device_).returnDecoder(
         &videoFormat_, std::move(decoder_));
   }
@@ -560,7 +560,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   // color-converted (with a copy), or that's a frame that was discarded in
   // SingleStreamDecoder. Either way, the underlying output surface can be
   // safely re-used.
-  unmapPreviousFrame();
+  // unmapPreviousFrame();
   CUresult result = cuvidMapVideoFrame(
       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
   if (result != CUDA_SUCCESS) {
@@ -569,6 +569,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   previouslyMappedFrame_ = framePtr;
 
   avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
+  unmapPreviousFrame();
 
   return AVSUCCESS;
 }
 
 void BetaCudaDeviceInterface::unmapPreviousFrame() {
-  if (previouslyMappedFrame_ == 0) {
-    return;
-  }
+  // if (previouslyMappedFrame_ == 0) {
+  //   return;
+  // }
   CUresult result =
       cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
   TORCH_CHECK(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
index c8870df..2a5915c 100644
--- a/src/torchcodec/_core/SingleStreamDecoder.cpp
+++ b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -409,6 +409,17 @@ torch::Tensor SingleStreamDecoder::getKeyFrameIndices() {
   return keyFrameIndices;
 }
 
+// Returns the first of "h264", "hevc", "av1", "vp9" found as a substring of
+// `input`, or nullptr if `input` is null or contains none of them. The result
+// points at a string literal; the non-const type is legacy, do not write to it.
+inline char* find_codec(const char* input) {
+  static constexpr const char* kCodecs[] = {"h264", "hevc", "av1", "vp9"};
+  for (const char* codec : kCodecs) {
+    if (input && strstr(input, codec)) return const_cast<char*>(codec);
+  }
+  return nullptr;
+}
+
 // --------------------------------------------------------------------------
 // ADDING STREAMS API
 // --------------------------------------------------------------------------
@@ -461,9 +472,22 @@ void SingleStreamDecoder::addStream(
   // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
   // addStream() which is supposed to be generic
   if (mediaType == AVMEDIA_TYPE_VIDEO) {
-    avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
-        deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
-            .value_or(avCodec));
+    if (device.type() != torch::kCUDA) {
+      avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
+          deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
+              .value_or(avCodec));
+    }
+    else {
+      const char* cuvid_suffix = "_cuvid";
+      char* codec_name = find_codec(avCodec->name);
+      size_t cuvid_length = std::strlen(codec_name) + std::strlen(cuvid_suffix) + 1;
+      char* cuvid_name = new char[cuvid_length];
+      std::strcpy(cuvid_name, codec_name);
+      std::strcat(cuvid_name, cuvid_suffix);
+      avCodec = avcodec_find_decoder_by_name(cuvid_name);
+      delete[] cuvid_name;
+      TORCH_CHECK(avCodec != nullptr);
+    }
   }
 
   AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);