# Intel VPL AV1 디코더 설계 문서

## 📋 개요

Intel Video Processing Library (VPL)를 사용하여 VavCore에 AV1 디코더를 추가하는 설계 문서입니다.

**목적**: Intel QuickSync 하드웨어 가속을 활용한 고성능 AV1 디코딩

**대상 플랫폼**: Intel GPU 지원 시스템 (소프트웨어 fallback 포함)

**통합 위치**: VavCore 디코더 팩토리 시스템

---

## 🏗️ 라이브러리 분석

### **Intel VPL 구조**

- **소스 위치**: `D:\Project\video-av1\oss\libvpl`
- **주요 헤더**:
  - `api/vpl/mfxvideo.h` - 비디오 디코딩 API
  - `api/vpl/mfxstructures.h` - 데이터 구조체
- **참고 문서**: https://intel.github.io/libvpl/latest/API_ref/VPL_func_vid_decode.html

### **핵심 API 워크플로우**

1. **MFXLoad()** - VPL 라이브러리 로더 생성
2. **MFXCreateConfig()** - 디코더 구성 설정 (HW/SW, 코덱 타입)
3. **MFXCreateSession()** - 디코딩 세션 생성
4. **MFXVideoDECODE_DecodeHeader()** - 비트스트림 헤더 파싱
5. **MFXVideoDECODE_QueryIOSurf()** - 필요한 표면 개수 계산
6. **MFXVideoDECODE_Init()** - 디코더 초기화
7. **MFXVideoDECODE_DecodeFrameAsync()** - 비동기 프레임 디코딩
8. **MFXVideoCORE_SyncOperation()** - 디코딩 완료 대기

---

## 🎯 클래스 설계

### **VPLAV1Decoder 클래스 구조**

```cpp
namespace VavCore {

/// AV1 decoder that implements IVideoDecoder on top of the Intel VPL (mfx*) API.
/// Copying is disabled because the object holds VPL loader/session handles.
class VPLAV1Decoder : public IVideoDecoder {
public:
    VPLAV1Decoder();
    ~VPLAV1Decoder() override;

    // Prevent copying
    VPLAV1Decoder(const VPLAV1Decoder&) = delete;
    VPLAV1Decoder& operator=(const VPLAV1Decoder&) = delete;

    // IVideoDecoder interface implementation
    bool Initialize(const VideoMetadata& metadata) override;
    void Cleanup() override;
    bool IsInitialized() const override;
    bool DecodeFrame(const VideoPacket& input_packet, VideoFrame& output_frame) override;
    bool DecodeFrame(const uint8_t* packet_data, size_t packet_size, VideoFrame& output_frame) override;
    bool Reset() override;
    bool Flush() override;

    // IVideoDecoder interface - additional methods
    std::string GetCodecName() const override { return "AV1 (Intel VPL)"; }
    VideoCodecType GetCodecType() const override { return VideoCodecType::AV1; }
    std::string GetVersion() const override;
    DecoderStats GetStats() const override;
    void ResetStats() override;

    // VPL-specific methods
    bool IsVPLAvailable() const;
    bool InitializeVPL();

protected:
    // Protected members for inheritance (AdaptiveVPLDecoder)
    mfxSession m_session = nullptr;
    mfxVideoParam m_videoParams = {};
    uint32_t m_width = 0;
    uint32_t m_height = 0;
    uint32_t m_maxWidth = 4096;
    uint32_t m_maxHeight = 4096;

    // Protected helper methods
    void LogVPLError(mfxStatus status, const std::string& operation) const;

private:
    // VPL objects
    mfxLoader m_loader = nullptr;
    mfxConfig m_config = nullptr;

    // Surface management
    // Element type restored from usage: entries are accessed as mfxFrameSurface1
    // (m_surfaces[i].Info, &m_surfaces[i] passed as mfxFrameSurface1*).
    std::vector<mfxFrameSurface1> m_surfaces;
    mfxU16 m_numSurfaces = 0;

    // Bitstream buffer
    // NOTE(review): element type was lost in this document; mfxU8[] is assumed
    // because the size below is a byte count - confirm.
    std::unique_ptr<mfxU8[]> m_bitstreamBuffer;
    mfxU32 m_bitstreamBufferSize = 2 * 1024 * 1024; // 2 MB default

    // Statistics
    uint64_t m_framesDecoded = 0;
    uint64_t m_decodeErrors = 0;
    double m_avgDecodeTime = 0.0;
    uint64_t m_bytesProcessed = 0;

    // State
    bool m_initialized = false;
    bool m_headerParsed = false;

    // Helper methods
    bool CheckVPLCapability();
    bool CreateSession();
    bool SetupDecoder();
    bool AllocateSurfaces();
    bool AllocateSystemMemoryForSurface(mfxFrameSurface1* surface);
    void CleanupVPL();

    // Frame conversion
    bool ConvertVPLSurface(mfxFrameSurface1* surface, VideoFrame& output_frame);
    bool ConvertNV12ToYUV420P(mfxFrameSurface1* surface, VideoFrame& output_frame);
    bool ConvertI420ToYUV420P(mfxFrameSurface1* surface, VideoFrame& output_frame);
    mfxFrameSurface1* GetFreeSurface();

    // Bitstream handling
    bool PrepareDataForDecode(const uint8_t* packet_data, size_t packet_size, mfxBitstream& bitstream);

    // Error handling
    void LogError(const std::string& message) const;
};

} // namespace VavCore
```

---

## ⚙️ 주요 구현 단계

### **1. 
초기화 단계 (Initialize)**

```cpp
/// Stores the stream dimensions from `metadata`, then runs InitializeVPL(),
/// CreateSession() and SetupDecoder() in that order.
/// @return true when all three steps succeed; false if the decoder was already
///         initialized or any step fails (partially created VPL state is
///         released through CleanupVPL()).
bool VPLAV1Decoder::Initialize(const VideoMetadata& metadata) {
    if (m_initialized) {
        LogError("Decoder already initialized");
        return false;
    }

    // Store video dimensions
    m_width = metadata.width;
    m_height = metadata.height;

    // Initialize VPL
    if (!InitializeVPL()) {
        LogError("Failed to initialize VPL");
        // InitializeVPL() can fail after the loader (and config) were created.
        // Release them the same way the later failure paths do, so a failed
        // Initialize() does not leak the loader.
        if (m_loader) {
            CleanupVPL();
        }
        return false;
    }

    // Create VPL session
    if (!CreateSession()) {
        LogError("Failed to create VPL session");
        CleanupVPL();
        return false;
    }

    // Setup decoder parameters
    if (!SetupDecoder()) {
        LogError("Failed to setup decoder parameters");
        CleanupVPL();
        return false;
    }

    m_initialized = true;
    return true;
}

/// Creates the VPL loader and a config object, restricts implementation
/// selection to ones that expose an AV1 decoder, then returns the result of
/// CheckVPLCapability().
/// On failure m_loader / m_config may already be set; the caller is expected
/// to release them.
bool VPLAV1Decoder::InitializeVPL() {
    // Create loader
    m_loader = MFXLoad();
    if (!m_loader) {
        LogError("Failed to create VPL loader");
        return false;
    }

    // Create config
    m_config = MFXCreateConfig(m_loader);
    if (!m_config) {
        LogError("Failed to create VPL config");
        return false;
    }

    // Set AV1 codec filter. Zero-initialize so no field of the variant is
    // passed to VPL uninitialized.
    mfxVariant codecId = {};
    codecId.Type = MFX_VARIANT_TYPE_U32;
    codecId.Data.U32 = MFX_CODEC_AV1;

    const mfxStatus status = MFXSetConfigFilterProperty(
        m_config,
        reinterpret_cast<const mfxU8*>("mfxImplDescription.mfxDecoderDescription.decoder.CodecID"),
        codecId);
    if (status != MFX_ERR_NONE) {
        LogVPLError(status, "SetConfigFilterProperty");
        return false;
    }

    return CheckVPLCapability();
}
```

### **2. 
프레임 디코딩 (DecodeFrame)**

```cpp
/// Decodes one packet and, when the decoder emits a frame, converts it into
/// `output_frame`.
///
/// On the first packet that carries a parsable header this also parses the
/// header, allocates the surface pool and initializes the decoder.
///
/// @return true only when a frame was written to `output_frame`. false means
///         either an error (m_decodeErrors is incremented) or that the decoder
///         needs more input before it can emit a frame (no error counted).
bool VPLAV1Decoder::DecodeFrame(const uint8_t* packet_data, size_t packet_size, VideoFrame& output_frame) {
    if (!m_initialized) {
        LogError("Decoder not initialized");
        ++m_decodeErrors;
        return false;
    }

    if (!packet_data || packet_size == 0) {
        LogError("Invalid packet data");
        ++m_decodeErrors;
        return false;
    }

    const auto start_time = std::chrono::high_resolution_clock::now();

    try {
        // Prepare bitstream for decoding
        mfxBitstream bitstream = {};
        if (!PrepareDataForDecode(packet_data, packet_size, bitstream)) {
            LogError("Failed to prepare data for decode");
            ++m_decodeErrors;
            return false;
        }

        // Parse header if not done yet
        if (!m_headerParsed) {
            mfxStatus status = MFXVideoDECODE_DecodeHeader(m_session, &bitstream, &m_videoParams);
            if (status == MFX_ERR_MORE_DATA) {
                // This packet carries no complete header yet. Same policy as
                // for DecodeFrameAsync below: wait for more data, not an error.
                return false;
            }
            if (status != MFX_ERR_NONE) {
                LogVPLError(status, "DecodeHeader");
                ++m_decodeErrors;
                return false;
            }

            // Allocate surfaces after header parsing
            if (!AllocateSurfaces()) {
                LogError("Failed to allocate surfaces");
                ++m_decodeErrors;
                return false;
            }

            // Workflow step 6: the decoder must be initialized with the parsed
            // parameters before DecodeFrameAsync may be called. Negative codes
            // are errors; positive codes are warnings and are accepted.
            // NOTE(review): assumes SetupDecoder() does not already call
            // MFXVideoDECODE_Init - confirm.
            status = MFXVideoDECODE_Init(m_session, &m_videoParams);
            if (status < MFX_ERR_NONE) {
                LogVPLError(status, "Init");
                ++m_decodeErrors;
                return false;
            }

            m_headerParsed = true;
        }

        // Perform async decoding. MFX_ERR_MORE_SURFACE means the decoder kept
        // the surface it was given and needs another one for the same input,
        // so retry with a fresh surface. The retry count is bounded by the
        // pool size so this can never spin forever.
        mfxFrameSurface1* output_surface = nullptr;
        mfxSyncPoint sync_point = nullptr;
        mfxStatus status = MFX_ERR_MORE_SURFACE;
        for (size_t attempt = 0; attempt <= m_surfaces.size() && status == MFX_ERR_MORE_SURFACE; ++attempt) {
            // Get free surface for decoding
            mfxFrameSurface1* work_surface = GetFreeSurface();
            if (!work_surface) {
                LogError("No free surface available");
                ++m_decodeErrors;
                return false;
            }

            status = MFXVideoDECODE_DecodeFrameAsync(
                m_session, &bitstream, work_surface, &output_surface, &sync_point);
        }

        if (status == MFX_ERR_MORE_DATA) {
            // Need more data - not an error for AV1 stream
            return false;
        }

        if (status != MFX_ERR_NONE && status != MFX_WRN_VIDEO_PARAM_CHANGED) {
            LogVPLError(status, "DecodeFrameAsync");
            ++m_decodeErrors;
            return false;
        }

        // Wait for decoding to complete
        if (sync_point) {
            status = MFXVideoCORE_SyncOperation(m_session, sync_point, MFX_INFINITE);
            if (status != MFX_ERR_NONE) {
                LogVPLError(status, "SyncOperation");
                ++m_decodeErrors;
                return false;
            }
        }

        // No frame produced for this packet (e.g. parameters changed).
        if (!output_surface) {
            return false;
        }

        // Convert output surface to VideoFrame
        if (!ConvertVPLSurface(output_surface, output_frame)) {
            LogError("Failed to convert VPL surface");
            ++m_decodeErrors;
            return false;
        }

        // Update statistics: running mean of the per-frame decode time.
        // NOTE(review): the original duration_cast lost its template argument
        // in this document; milliseconds are assumed - confirm.
        ++m_framesDecoded;
        m_bytesProcessed += packet_size;

        const auto end_time = std::chrono::high_resolution_clock::now();
        const double decode_time =
            std::chrono::duration<double, std::milli>(end_time - start_time).count();
        m_avgDecodeTime = (m_avgDecodeTime * (m_framesDecoded - 1) + decode_time) / m_framesDecoded;

        return true;

    } catch (const std::exception& e) {
        LogError("Exception in DecodeFrame: " + std::string(e.what()));
        ++m_decodeErrors;
        return false;
    }
}
```

### **3. 표면 관리 (Surface Management)**

```cpp
/// Queries how many surfaces the decoder wants for m_videoParams and allocates
/// that many plus four spare system-memory surfaces into m_surfaces.
/// Any pool left over from an earlier call is released first, and a failure
/// part-way through releases everything allocated so far.
bool VPLAV1Decoder::AllocateSurfaces() {
    if (!m_session) {
        LogError("Session not created");
        return false;
    }

    // Query required number of surfaces. Positive status codes are warnings
    // and still fill in allocRequest, so only negative codes are failures.
    mfxFrameAllocRequest allocRequest = {};
    const mfxStatus status = MFXVideoDECODE_QueryIOSurf(m_session, &m_videoParams, &allocRequest);
    if (status < MFX_ERR_NONE) {
        LogVPLError(status, "QueryIOSurf");
        return false;
    }

    // Each surface's pixel buffer is one new[] block whose start is stored in
    // Data.Y (see AllocateSystemMemoryForSurface), so freeing Data.Y frees the
    // whole surface. delete[] on a null pointer is a no-op.
    const auto releaseSurfaces = [this]() {
        for (auto& surface : m_surfaces) {
            delete[] surface.Data.Y;
            surface.Data.Y = nullptr;
        }
        m_surfaces.clear();
        m_numSurfaces = 0;
    };
    releaseSurfaces();

    // Allocate surfaces (suggested number + buffer)
    m_numSurfaces = static_cast<mfxU16>(allocRequest.NumFrameSuggested + 4); // Extra buffer

    // Size the vector once: pointers to its elements are handed to VPL through
    // GetFreeSurface(), so it must not reallocate afterwards.
    m_surfaces.assign(m_numSurfaces, mfxFrameSurface1{});

    for (mfxU16 i = 0; i < m_numSurfaces; ++i) {
        // Copy frame info from decoder parameters
        m_surfaces[i].Info = m_videoParams.mfx.FrameInfo;

        // For simplicity, we use system memory allocation
        // In production, you might want to use video memory
        if (!AllocateSystemMemoryForSurface(&m_surfaces[i])) {
            LogError("Failed to allocate system memory for surface " + std::to_string(i));
            releaseSurfaces();
            return false;
        }
    }

    return true;
}

/// Returns the first surface in the pool whose Data.Locked counter is zero,
/// or nullptr (after logging) when every surface is still locked.
mfxFrameSurface1* VPLAV1Decoder::GetFreeSurface() {
    // Find a free surface (not locked by VPL)
    for (auto& surface : m_surfaces) {
        if (surface.Data.Locked == 0) {
            return &surface;
        }
    }

    // All surfaces are busy - this might indicate a pipeline bottleneck
    LogError("All surfaces are locked - decoder pipeline bottleneck");
    return nullptr;
}

bool VPLAV1Decoder::AllocateSystemMemoryForSurface(mfxFrameSurface1* surface) {
    mfxFrameInfo& info = 
surface->Info;

    // Reject degenerate dimensions instead of allocating an empty buffer.
    if (info.Width == 0 || info.Height == 0) {
        return false;
    }

    // Both 4:2:0 layouts handled here (planar I420/IYUV and semi-planar NV12)
    // need Width*Height luma bytes plus half of that for chroma.
    // Widen before multiplying so the product is not computed in int.
    const size_t lumaSize = static_cast<size_t>(info.Width) * info.Height;
    const size_t surfaceSize = lumaSize + lumaSize / 2;

    // Allocate buffer. The block is owned through surface->Data.Y.
    // NOTE(review): nothing in this document frees it on teardown; confirm that
    // CleanupVPL() releases Data.Y with delete[].
    mfxU8* buffer = new (std::nothrow) mfxU8[surfaceSize];
    if (!buffer) {
        return false;
    }

    // Set up surface data pointers (planar layout: Y, then U, then V)
    surface->Data.Y = buffer;
    surface->Data.U = buffer + lumaSize;
    surface->Data.V = surface->Data.U + lumaSize / 4;
    surface->Data.Pitch = info.Width;

    // For NV12, UV is interleaved: V is the byte after each U sample
    if (info.FourCC == MFX_FOURCC_NV12) {
        surface->Data.UV = surface->Data.U;
        surface->Data.V = surface->Data.UV + 1;
    }

    return true;
}
```

### **4. 데이터 변환 (VPL → VideoFrame)**

```cpp
/// Copies a decoded VPL surface into `output_frame` as three separate 8-bit
/// 4:2:0 planes.
///
/// The output size is the surface's crop size when set, otherwise its full
/// size. Only NV12 and I420/IYUV input is supported; any other FourCC is
/// logged and rejected.
/// NOTE(review): CropX/CropY offsets are not applied and chroma sizes round
/// down for odd dimensions - confirm this is acceptable for target streams.
bool VPLAV1Decoder::ConvertVPLSurface(mfxFrameSurface1* surface, VideoFrame& output_frame) {
    if (!surface || !surface->Data.Y) {
        LogError("Invalid surface for conversion");
        return false;
    }

    mfxFrameInfo& info = surface->Info;

    // Set output frame metadata
    output_frame.width = info.CropW ? info.CropW : info.Width;
    output_frame.height = info.CropH ? info.CropH : info.Height;
    // VPL reports a presentation time in surface->Data.TimeStamp, but it is
    // only meaningful when the input mfxBitstream::TimeStamp was filled in.
    // This design does not propagate it, so the frame timestamp stays 0.
    output_frame.timestamp = 0;

    // Determine and set color space.
    // An if/else chain is used instead of a switch so this compiles whether
    // or not MFX_FOURCC_I420 and MFX_FOURCC_IYUV are distinct values (they are
    // believed to be aliases, which would make duplicate case labels).
    // YUY2 is deliberately not accepted: it is packed 4:2:2 and there is no
    // converter for it here; routing it through the planar 4:2:0 copy would
    // read the wrong memory.
    const bool isNV12 = (info.FourCC == MFX_FOURCC_NV12);
    const bool isI420 = (info.FourCC == MFX_FOURCC_I420 || info.FourCC == MFX_FOURCC_IYUV);
    if (isNV12) {
        output_frame.color_space = ColorSpace::NV12;
    } else if (isI420) {
        output_frame.color_space = ColorSpace::YUV420P;
    } else {
        LogError("Unsupported pixel format: " + std::to_string(info.FourCC));
        return false;
    }

    // Calculate plane sizes for the new VideoFrame structure
    output_frame.y_size = output_frame.width * output_frame.height;
    output_frame.u_size = output_frame.y_size / 4;
    output_frame.v_size = output_frame.y_size / 4;

    // Allocate memory for each plane (byte arrays; the converters read them
    // back as uint8_t* via get())
    output_frame.y_plane = std::make_unique<uint8_t[]>(output_frame.y_size);
    output_frame.u_plane = std::make_unique<uint8_t[]>(output_frame.u_size);
    output_frame.v_plane = std::make_unique<uint8_t[]>(output_frame.v_size);

    try {
        if (isNV12) {
            // NV12: Y plane + interleaved UV
            return ConvertNV12ToYUV420P(surface, output_frame);
        }
        // I420/IYUV: Separate Y, U, V planes
        return ConvertI420ToYUV420P(surface, output_frame);
    } catch (const std::exception& e) {
        LogError("Exception during surface conversion: " + std::string(e.what()));
        return false;
    }
}

bool VPLAV1Decoder::ConvertNV12ToYUV420P(mfxFrameSurface1* surface, VideoFrame& output_frame) {
    mfxFrameData& src = surface->Data;
    mfxFrameInfo& info = surface->Info;

    // Copy Y plane
    uint8_t* dst_y = output_frame.y_plane.get();
    uint8_t* src_y = src.Y;
    for (uint32_t y = 0; y < output_frame.height; y++) {
        memcpy(dst_y, src_y, output_frame.width);
        dst_y += output_frame.width;
        src_y += src.Pitch;
    }

    // Convert interleaved UV to separate U and V planes
    uint8_t* dst_u = output_frame.u_plane.get();
    uint8_t* dst_v = output_frame.v_plane.get();
    uint8_t* src_uv = src.UV;
    uint32_t uv_width = output_frame.width / 2;
    uint32_t uv_height = output_frame.height / 2;

    for (uint32_t y = 0; y < uv_height; y++) {
        for (uint32_t x 
= 0; x < uv_width; x++) {
            dst_u[x] = src_uv[x * 2];     // U (even indices)
            dst_v[x] = src_uv[x * 2 + 1]; // V (odd indices)
        }
        dst_u += uv_width;
        dst_v += uv_width;
        src_uv += src.Pitch;
    }

    return true;
}

/// Copies the Y, U and V planes of a planar 4:2:0 surface into the
/// already-allocated planes of `output_frame`, dropping any source row padding.
/// Chroma rows advance by half the luma pitch.
bool VPLAV1Decoder::ConvertI420ToYUV420P(mfxFrameSurface1* surface, VideoFrame& output_frame) {
    mfxFrameData& src = surface->Data;

    // Copies `rows` rows of `row_bytes` bytes each into a tightly packed
    // destination, stepping the source by `src_stride` bytes per row.
    const auto copy_plane = [](uint8_t* dst, const uint8_t* from,
                               uint32_t row_bytes, uint32_t rows, uint32_t src_stride) {
        for (uint32_t row = 0; row < rows; ++row) {
            std::memcpy(dst, from, row_bytes);
            dst += row_bytes;
            from += src_stride;
        }
    };

    const uint32_t uv_width = output_frame.width / 2;
    const uint32_t uv_height = output_frame.height / 2;

    // Copy Y plane
    copy_plane(output_frame.y_plane.get(), src.Y, output_frame.width, output_frame.height, src.Pitch);
    // Copy U plane
    copy_plane(output_frame.u_plane.get(), src.U, uv_width, uv_height, src.Pitch / 2);
    // Copy V plane
    copy_plane(output_frame.v_plane.get(), src.V, uv_width, uv_height, src.Pitch / 2);

    return true;
}
```

---

## 🔧 VideoDecoderFactory 통합

### **팩토리 클래스 수정**

```cpp
// VideoDecoderFactory.h
enum class DecoderType {
    AUTO = 0,
    ADAPTIVE_NVDEC,
    ADAPTIVE_DAV1D,
    NVDEC,
    DAV1D,
    MEDIA_FOUNDATION,
    VPL, // Intel VPL (added)
    AMF  // AMD AMF (added)
};

// VideoDecoderFactory.cpp (key parts)

/// Creates a decoder of the requested type, or nullptr when that type is not
/// available on this machine. AUTO walks the priority list and returns the
/// first available decoder.
std::unique_ptr<IVideoDecoder> VideoDecoderFactory::CreateDecoder(DecoderType type) {
    switch (type) {
        case DecoderType::AUTO:
            // Priority: Adaptive NVDEC > VPL > AMF > Adaptive dav1d > NVDEC > dav1d > MediaFoundation
            // std::make_unique never returns null (it throws on failure), so
            // each branch returns its decoder directly.
            if (s_nvdec_available) {
                OutputDebugStringA("[VideoDecoderFactory] Auto mode: trying Adaptive NVDEC AV1 decoder first\n");
                // NOTE(review): the template argument was lost in this document;
                // the class name AdaptiveNVDECDecoder is assumed - confirm.
                return std::make_unique<AdaptiveNVDECDecoder>();
            }

            if (s_vpl_available) {
                OutputDebugStringA("[VideoDecoderFactory] Auto mode: trying Intel VPL AV1 decoder\n");
                return std::make_unique<VPLAV1Decoder>();
            }

            if (s_amf_available) {
                OutputDebugStringA("[VideoDecoderFactory] Auto mode: trying AMD AMF AV1 decoder\n");
                // NOTE(review): the template argument was lost in this document;
                // the class name AMFAV1Decoder is assumed - confirm.
                return std::make_unique<AMFAV1Decoder>();
            }

            // Continue with other decoders...
            break;

        case DecoderType::VPL:
            if (s_vpl_available) {
                OutputDebugStringA("[VideoDecoderFactory] Creating Intel VPL AV1 decoder\n");
                return std::make_unique<VPLAV1Decoder>();
            }
            break;

        // ... existing cases
    }

    return nullptr;
}

// Availability check implementation

/// Reports whether a VPLAV1Decoder can be constructed and says VPL is
/// available. Any exception is treated as "not available" on purpose, so
/// probing never takes the factory down.
bool VideoDecoderFactory::CheckVPLAvailability() {
    try {
        // Try creating a temporary VPL decoder instance
        auto temp_decoder = std::make_unique<VPLAV1Decoder>();
        if (temp_decoder->IsVPLAvailable()) {
            OutputDebugStringA("[VideoDecoderFactory] Intel VPL AV1 decoder available\n");
            return true;
        }
    } catch (...) {
        // VPL not available or initialization failed
    }

    OutputDebugStringA("[VideoDecoderFactory] Intel VPL AV1 decoder not available\n");
    return false;
}
```

---

## 📊 구현 특징 및 장점

### **주요 장점**

- ✅ **Intel QuickSync 하드웨어 가속**: Intel GPU 전용 최적화
- ✅ **표준화된 API**: 업계 표준 VPL 라이브러리 사용
- ✅ **비동기 처리**: 높은 처리량과 성능
- ✅ **자동 Fallback**: 하드웨어 → 소프트웨어 자동 전환
- ✅ **메모리 효율성**: 디코딩 표면을 한 번 할당해 풀로 재사용 (현재 설계는 시스템 메모리 표면을 직접 할당하며, VPL 내부 표면 관리/비디오 메모리 활용은 Phase 3 최적화 대상)

### **성능 특성**

- **대상 플랫폼**: Intel GPU (AV1 하드웨어 디코딩은 11세대 Tiger Lake/Xe 그래픽 이상 필요)
- **예상 성능**: 4K AV1 실시간 디코딩 (60fps+)
- **메모리 사용**: 하드웨어 가속 시 GPU 메모리 활용

### **호환성**

- **Intel GPU**: 최고 성능
- **Non-Intel GPU**: 소프트웨어 구현 fallback
- **AMD/NVIDIA**: 기존 디코더 우선 사용 (AUTO 모드)

---

## 🚧 구현 고려사항

### **에러 처리**

- VPL 특정 에러 코드 매핑
- 하드웨어 실패 시 graceful fallback
- 메모리 부족 상황 처리

### **성능 최적화**

- 표면 풀링으로 할당 오버헤드 최소화
- 비동기 처리로 CPU-GPU 병렬 실행
- Zero-copy 메모리 전송 (가능한 경우)

### **테스트 계획**

- Intel GPU 환경에서 성능 벤치마크
- 다양한 AV1 비트스트림 호환성 테스트
- 메모리 누수 및 안정성 검증

---

## 📋 구현 우선순위

### **Phase 1: 기본 구현**

1. VPLAV1Decoder 클래스 기본 구조
2. Initialize/DecodeFrame 핵심 로직
3. VideoDecoderFactory 통합

### **Phase 2: 최적화**

1. 표면 관리 최적화
2. 에러 처리 강화
3. 성능 모니터링 추가

### **Phase 3: 고급 기능**

1. 적응형 품질 조정 지원
2. 멀티스레드 디코딩
3. GPU 메모리 최적화

---

*작성일: 2025-09-26*
*Intel VPL 버전: 2.x API 기준*