VTDecompressionSessionDecodeFrame returns imageBuffer = nil but OSStatus = noErr

Problem description

I am trying to decode a raw H264 stream using the VideoToolbox API in Swift (macOS).

In viewDidLoad(), I set up my display layer and CMTimebase:

self.view.wantsLayer = true

self.VideoLayer = AVSampleBufferDisplayLayer()
self.VideoLayer.frame = self.view.bounds
self.view.layer?.addSublayer(self.VideoLayer)

var _CMTimebasePointer: CMTimebase? = nil
let status = CMTimebaseCreateWithMasterClock(
    allocator: kCFAllocatorDefault, masterClock: CMClockGetHostTimeClock(), timebaseOut: &_CMTimebasePointer)

self.VideoLayer.controlTimebase = _CMTimebasePointer
CMTimebaseSetTime(
    self.VideoLayer.controlTimebase!, time: CMTime.zero);
CMTimebaseSetRate(
    self.VideoLayer.controlTimebase!, rate: 1.0);

Then I read my H264 file as raw bytes and parse it into individual NALUs. (I have cross-checked my NALU parser against other projects and it is correct, but if you think I should post its code here, leave a comment and I will edit my question :) )
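(For illustration only, here is a minimal sketch of what such an Annex B splitter might look like; it is not the parser used in the question, and it assumes every NALU is preceded by a 4-byte 00 00 00 01 start code. Each returned NALU keeps its leading 4 start-code bytes so that decodeFrame below can overwrite them with the length.)

// Illustrative sketch: split an Annex B byte stream into NALUs.
// Assumes 4-byte 00 00 00 01 start codes only.
func splitAnnexBStream(_ stream: [UInt8]) -> [[UInt8]] {
    let startCode: [UInt8] = [0, 0, 0, 1]
    var starts: [Int] = []
    var i = 0
    while i + 4 <= stream.count {
        if Array(stream[i..<i+4]) == startCode {
            starts.append(i)
            i += 4
        } else {
            i += 1
        }
    }
    var nalus: [[UInt8]] = []
    for (index, start) in starts.enumerated() {
        let end = index + 1 < starts.count ? starts[index + 1] : stream.count
        nalus.append(Array(stream[start..<end])) // keeps the 4 start-code bytes in front
    }
    return nalus
}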

This is how I process each NALU (basically I write the NALU length into the first 4 bytes (to convert to the avcC format), and for SPS and PPS NALUs I drop the first 4 bytes):

func decodeFrame(_ videoPacket: inout VideoPacket)
{
    // replace start code with nal size
    var biglen = CFSwapInt32HostToBig(UInt32(videoPacket.count - 4)) // NALU length doesn't contain the first 4 size bytes
    memcpy(&videoPacket, &biglen, 4)
    let nalType = videoPacket[4] & 0x1F
    switch nalType
    {
        case 0x05:
//                print("Nal type is IDR frame")
            // inside this I create the format description and decompression session
            createDecompressionSession()
            decodeVideoPacket(videoPacket)
        case 0x07:
//                print("Nal type is SPS")
            spsSize = videoPacket.count - 4
            sps = Array(videoPacket[4..<videoPacket.count])
        case 0x08:
//                print("Nal type is PPS")
            ppsSize = videoPacket.count - 4
            pps = Array(videoPacket[4..<videoPacket.count])
        default:
//                print("Nal type is B/P frame: \(nalType)")
            decodeVideoPacket(videoPacket)
            break;
    }
}

Then I create the CMVideoFormatDescription like this:

let pointerSPS = UnsafePointer<UInt8>(spsData)
let pointerPPS = UnsafePointer<UInt8>(ppsData)

// make pointers array
let dataParamArray = [pointerSPS, pointerPPS]
let parameterSetPointers = UnsafePointer<UnsafePointer<UInt8>>(dataParamArray)

// make parameter sizes array
let sizeParamArray = [spsData.count, ppsData.count]
let parameterSetSizes = UnsafePointer<Int>(sizeParamArray)

let status = CMVideoFormatDescriptionCreateFromH264ParameterSets(
    allocator: kCFAllocatorDefault, parameterSetCount: 2, parameterSetPointers: parameterSetPointers, parameterSetSizes: parameterSetSizes, nalUnitHeaderLength: 4, formatDescriptionOut: &self.VideoFormatDescription) // class variable

I create the VTDecompressionSession like this:

let decoderParameters = NSMutableDictionary()
let destinationPixelBufferAttributes = NSMutableDictionary()
destinationPixelBufferAttributes.setValue(
    NSNumber(value: kCVPixelFormatType_32ARGB), // I've tried various values here to no avail...
    forKey: kCVPixelBufferPixelFormatTypeKey as String
)

var outputCallback = VTDecompressionOutputCallbackRecord()
outputCallback.decompressionOutputCallback = decompressionSessionDecodeFrameCallback
outputCallback.decompressionOutputRefCon = UnsafeMutableRawPointer(Unmanaged.passUnretained(self).toOpaque())

let status = VTDecompressionSessionCreate(
    allocator: kCFAllocatorDefault, formatDescription: videoDescription, decoderSpecification: decoderParameters, imageBufferAttributes: destinationPixelBufferAttributes, outputCallback: &outputCallback, decompressionSessionOut: &self.DecompressionSession)

Then, this is how I decode each frame:

func decodeVideoPacket(_ videoPacket: VideoPacket)
{
    let bufferPointer = UnsafeMutablePointer<UInt8>(mutating: videoPacket)
    var blockBuffer: CMBlockBuffer?
    var status = CMBlockBufferCreateWithMemoryBlock(
        allocator: kCFAllocatorDefault, memoryBlock: bufferPointer, blockLength: videoPacket.count, blockAllocator: kCFAllocatorNull, customBlockSource: nil, offsetToData: 0, dataLength: videoPacket.count, flags: 0, blockBufferOut: &blockBuffer)
    if status != noErr
    {
        print("CMBlockBufferCreateWithMemoryBlock ERROR: \(status)")
        return
    }
    
    var sampleBuffer: CMSampleBuffer?
    let sampleSizeArray = [videoPacket.count]
    
    let frameFPS = Double(1) / Double(60)
    let tval = Double(frameFPS * Double(self.frameCount))
    let presentationTime = CMTimeMakeWithSeconds(tval, preferredTimescale: 1000)
    var info = CMSampleTimingInfo(
        duration: CMTimeMakeWithSeconds(frameFPS, preferredTimescale: 1000), presentationTimeStamp: presentationTime, decodeTimeStamp: presentationTime)
    self.frameCount += 1
    
    status = CMSampleBufferCreateReady(
        allocator: kCFAllocatorDefault, dataBuffer: blockBuffer, formatDescription: self.VideoFormatDescription, sampleCount: 1, sampleTimingEntryCount: 1, sampleTimingArray: &info, sampleSizeEntryCount: 1, sampleSizeArray: sampleSizeArray, sampleBufferOut: &sampleBuffer)
    if status != noErr
    {
        print("CMSampleBufferCreateReady ERROR: \(status)")
        return
    }
    
    guard let buffer = sampleBuffer
    else
    {
        print("Could not unwrap sampleBuffer!")
        return
    }
    
    if self.VideoLayer.isReadyForMoreMediaData
    {
        self.VideoLayer?.enqueue(buffer)
        self.VideoLayer.displayIfNeeded()
    }
    
    
    if let session = self.DecompressionSession
    {
        var outputBuffer: CVPixelBuffer?

        status = VTDecompressionSessionDecodeFrame(
            session, sampleBuffer: buffer, flags: [], frameRefcon: &outputBuffer, infoFlagsOut: nil)
        if status != noErr
        {
            print("VTDecompressionSessionDecodeFrame ERROR: \(status)")
        }

        status = VTDecompressionSessionWaitForAsynchronousFrames(session)
        if status != noErr
        {
            print("VTDecompressionSessionWaitForAsynchronousFrames ERROR: \(status)")
        }
    }
}

Finally, in the decode callback function, for now I just check whether the imageBuffer is nil, but it is always nil while the OSStatus is always set to noErr:

private func decompressionSessionDecodeFrameCallback(
    _ decompressionOutputRefCon: UnsafeMutableRawPointer?, _ sourceFrameRefCon: UnsafeMutableRawPointer?, _ status: OSStatus, _ infoFlags: VTDecodeInfoFlags, _ imageBuffer: CVImageBuffer?, _ presentationTimeStamp: CMTime, _ presentationDuration: CMTime) -> Void
{
    print("status: \(status),image_nil?: \(imageBuffer == nil)")
}

Clearly, since the imageBuffer is nil, I think something is wrong...

(Likewise, the AVSampleBufferDisplayLayer does not render any image.)

Can you help me figure out what is wrong with my code, or tell me how to dig deeper into VTDecompression errors that may be occurring but are hidden from me?

PS: Let me know if anything in my code needs more explanation.

Solution

I have a few suggestions that may help you. (Deleted my comment and turned it into a full answer.)

  1. There is an outputCallback closure, which also receives a status: OSStatus; you can check for errors there as well:
/// This step is not necessary, because I'm using the sample buffer layer to display it;
/// this method gives you a `CVPixelBuffer` if you want to manage the displaying yourself
private var outputCallback: VTDecompressionOutputCallback = {
    (decompressionOutputRefCon: UnsafeMutableRawPointer?, sourceFrameRefCon: UnsafeMutableRawPointer?, status: OSStatus, infoFlags: VTDecodeInfoFlags, imageBuffer: CVPixelBuffer?, presentationTimeStamp: CMTime, duration: CMTime) in
    
    let selfPointer = Unmanaged<VideoStreamManager>.fromOpaque(decompressionOutputRefCon!).takeUnretainedValue()
    if status == noErr {
        debugPrint("===== ✅ Image successfully decompressed,OSStatus: \(status) =====")
    } else {
        debugPrint("===== ❌ Failed to decompress,OSStatus: \(status) =====")
    }
}
  2. The start code in front of a NAL unit is not always 00 00 00 01 (4 bytes); it can also be 00 00 01 (3 bytes), but you always index at byte [4]. (See the sketch after this item.)

The Annex B specification solves this by requiring a "start code" before each NALU. A start code is two or three 0x00 bytes followed by a 0x01 byte, e.g. 0x000001 or 0x00000001.

Reference:
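Here is a minimal sketch of how one might detect which start-code length precedes a NALU before indexing into it (the function name is an illustrative assumption, not part of the question's code):

// Returns the length of the Annex B start code at `offset` (4 for 00 00 00 01,
// 3 for 00 00 01), or nil if no start code begins there.
func startCodeLength(in data: [UInt8], at offset: Int) -> Int? {
    if offset + 4 <= data.count,
       data[offset] == 0, data[offset + 1] == 0, data[offset + 2] == 0, data[offset + 3] == 1 {
        return 4
    }
    if offset + 3 <= data.count,
       data[offset] == 0, data[offset + 1] == 0, data[offset + 2] == 1 {
        return 3
    }
    return nil
}

The NAL header byte (and therefore the NAL type) sits immediately after the start code, so the videoPacket[4] subscript only works for the 4-byte case.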

Let me know if this helps.


My problem was that, even though I was correctly parsing each NALU and converting each one to AVCC format before feeding it to the AVSampleBufferDisplayLayer / VTDecompressionSession, each individual NALU was not a whole video frame. I stumbled on a random thread somewhere (I can't find it now) that described collecting all the NALUs that make up one video frame and combining them into one big NALU.

It looks like this:

NALU_length_header_1 = 4-byte big-endian NALU length value

NALU_1 = the remaining NALU data bytes (containing, I believe, the NALU slice_header and the video frame data)

Each NALU looks like = [NALU_length_header_1][NALU_1]

So when we combine several of them into one frame, it should look like this: [NALU_length_header_1][NALU_1][NALU_length_header_2][NALU_2][NALU_length_header_3][NALU_3][NALU_length_header_4][NALU_4]

In my case, four NALUs make up one complete video frame.

Once you have combined the NALUs together, likely in some kind of [UInt8] array, that value can be used to create the CMBlockBuffer and then the CMSampleBuffer, which is passed to the decoder/video layer (a minimal sketch of this combining step follows below).
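The sketch below is an illustration of that combining step; the function name is hypothetical, and it assumes each element of naluPayloads holds a raw NALU with no start code and no length prefix:

// Combine the NALUs belonging to one video frame into a single AVCC-style buffer:
// [4-byte big-endian length][NALU][4-byte big-endian length][NALU]...
func makeAVCCFrame(from naluPayloads: [[UInt8]]) -> [UInt8] {
    var frame: [UInt8] = []
    for payload in naluPayloads {
        // 4-byte big-endian length header, then the NALU bytes themselves
        withUnsafeBytes(of: UInt32(payload.count).bigEndian) { frame.append(contentsOf: $0) }
        frame.append(contentsOf: payload)
    }
    return frame
}

The resulting [UInt8] is then fed through CMBlockBufferCreateWithMemoryBlock and CMSampleBufferCreateReady exactly as in decodeVideoPacket above, once per frame instead of once per NALU.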

I found two methods that can be used to detect which NALUs combine to form one video frame. Both involve looking at properties in the NALU slice header.

First, you can look at the property called frame_num; if several NALUs have the same frame_num value, merge their data into one "big" NALU. (My encoder did not set this value, so I had to use the first_mb_in_slice value instead.)

Second, read the property called first_mb_in_slice. In my stream this property increments as 0, 2040, 4080, 6120 over the span of four NALUs; it refers to the offset (in macroblocks) into the video frame, and we can use it to detect which NALUs make up one frame of video.
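Reading first_mb_in_slice requires a small Exp-Golomb bit reader, because it is the first ue(v)-coded element of the slice header, directly after the 1-byte NAL unit header. A minimal sketch (the type and function names are mine; it ignores emulation-prevention bytes, which rarely occur this early in a NALU):

// Tiny MSB-first bit reader over a byte array.
struct BitReader {
    let bytes: [UInt8]
    var bitIndex = 0
    mutating func readBit() -> UInt8 {
        let byte = bytes[bitIndex / 8]
        let bit = (byte >> (7 - UInt8(bitIndex % 8))) & 1
        bitIndex += 1
        return bit
    }
    // Unsigned Exp-Golomb ue(v): count leading zeros, then read that many bits.
    mutating func readUE() -> UInt32 {
        var leadingZeroBits = 0
        while readBit() == 0 { leadingZeroBits += 1 }
        var value: UInt32 = 0
        for _ in 0..<leadingZeroBits {
            value = (value << 1) | UInt32(readBit())
        }
        return (UInt32(1) << leadingZeroBits) - 1 + value
    }
}

// `nalu` starts with the 1-byte NAL header (no start code, no length prefix).
func firstMbInSlice(of nalu: [UInt8]) -> UInt32 {
    var reader = BitReader(bytes: Array(nalu.dropFirst())) // skip the NAL header byte
    return reader.readUE()                                  // first_mb_in_slice
}

With the values above (0, 2040, 4080, 6120), a first_mb_in_slice of 0 marks the first NALU of a new frame, and the following NALUs with non-zero values belong to that same frame.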

PS: Sorry if my answer is a bit long-winded or confusing; I hope it helps!