From aeaa8dd90eb28a7a769c114fd8e2cb6f936038cb Mon Sep 17 00:00:00 2001 From: Xaymar Date: Sat, 21 Oct 2017 18:49:36 +0200 Subject: [PATCH] Improved Threading, better Frame Submission and more The encoder is now capable of delayed or immediate threading (using fixed latency) which allows for a much better performance and works better with older VFW encoders that assume they are being used for transcoding instead of live-encoding. In addition to the above, the MPEG-2 rewrite code for Matrox MPEG-2 encoders is completely done now and will fix up the bitstream Matroxs encoder sends out to be mostly correct, allowing for better seeking. The last change this encompasses is the removal of the need to flip a RGBA frame to BGRA. This doesn't break any support and should work better with what we have. This exposed several bugs in OBS Studio: - Color Range, Space & Format are ignored by OBS Studio if set by the encoder. - The ffmpeg-mux process freezes when writing the header for a correct MPEG-2 stream that doesn't match what OBS thinks it should be. ffmpeg itself has no issues writing this header, so this is an OBS Studio issue. And exposed the following bugs in Matrox VFW: - The MPEG-2 bitstream is written as interlaced Top-Field-First instead of Progressive - the content itself is Progressive. - The encoder ignores FastCompress mode and just behaves identically in either mode. - Some parameters that should be set are not being set by Matrox (extended Framerate for example). --- CMakeLists.txt | 2 +- Include/enc-vfw.h | 31 +- Include/plugin.h | 36 +-- Source/enc-vfw.cpp | 689 +++++++++++++++++++++++++++++++++++---------- 4 files changed, 596 insertions(+), 162 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 699b136..b7a06f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ PROJECT(enc-vfw) # Version ################################################################################ SET(enc-vfw_VERSION_MAJOR 0) -SET(enc-vfw_VERSION_MINOR 1) +SET(enc-vfw_VERSION_MINOR 2) SET(enc-vfw_VERSION_PATCH 0) SET(enc-vfw_VERSION_BUILD 0) #configure_file( diff --git a/Include/enc-vfw.h b/Include/enc-vfw.h index efd4db8..0daf2d7 100644 --- a/Include/enc-vfw.h +++ b/Include/enc-vfw.h @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include // VFW #define COMPMAN @@ -68,6 +72,12 @@ namespace VFW { static void get_video_info(void *data, struct video_scale_info *info); void get_video_info(struct video_scale_info *info); + static void threadMain(void *data, int32_t flag); + void threadLocal(int32_t flag); + void preProcessLocal(std::unique_lock& ul); + void encodeLocal(std::unique_lock& ul); + void postProcessLocal(std::unique_lock& ul); + private: VFW::Info* myInfo; HIC hIC; @@ -88,12 +98,27 @@ namespace VFW { m_width, m_height, m_fpsNum, m_fpsDen, m_keyframeInterval, - m_bitrate, m_quality; - bool - m_useNormalCompress, + m_bitrate, m_quality, + m_latency, m_maxQueueSize; + bool + m_useNormalCompress, m_useTemporalFlag, m_useBitrateFlag, m_useQualityFlag, m_forceKeyframes; + + struct thread_data { + std::thread worker; + std::mutex lock; + std::condition_variable cv; + // Data Vector, Frame, Keyframe + std::queue>, int64_t, bool>> data; + } m_preProcessData, + m_encodeData, + m_postProcessData; + std::mutex m_finalPacketsLock; + std::queue>, int64_t, bool>> m_finalPackets; + bool m_threadShutdown; + std::shared_ptr> m_donotuse_datastor; }; }; diff --git a/Include/plugin.h b/Include/plugin.h index 0bff0b6..d3740d1 100644 --- a/Include/plugin.h +++ b/Include/plugin.h @@ -20,26 +20,30 @@ #endif // Plugin -#define PLUGIN_NAME "Video For Windows" +#define PLUGIN_NAME "Video For Windows" #include "Version.h" // Logging -#define PLOG(level, ...) blog(level, "[VFW] " __VA_ARGS__); -#define PLOG_ERROR(...) PLOG(LOG_ERROR, __VA_ARGS__) -#define PLOG_WARNING(...) PLOG(LOG_WARNING, __VA_ARGS__) -#define PLOG_INFO(...) PLOG(LOG_INFO, __VA_ARGS__) -#define PLOG_DEBUG(...) PLOG(LOG_DEBUG, __VA_ARGS__) +#define PLOG(level, ...) blog(level, "[VFW] " __VA_ARGS__); +#define PLOG_ERROR(...) PLOG(LOG_ERROR, __VA_ARGS__) +#define PLOG_WARNING(...) PLOG(LOG_WARNING, __VA_ARGS__) +#define PLOG_INFO(...) PLOG(LOG_INFO, __VA_ARGS__) +#define PLOG_DEBUG(...) PLOG(LOG_DEBUG, __VA_ARGS__) // Properties -#define PROP_BITRATE "Bitrate" -#define PROP_QUALITY "Quality" -#define PROP_INTERVAL_TYPE "IntervalType" -#define PROP_KEYFRAME_INTERVAL "KeyframeInterval" -#define PROP_FORCE_KEYFRAMES "ForceKeyframes" +#define PROP_CONFIGURE "Configure" +#define PROP_BITRATE "Bitrate" +#define PROP_QUALITY "Quality" +#define PROP_INTERVAL_TYPE "IntervalType" +#define PROP_KEYFRAME_INTERVAL "KeyframeInterval" +#define PROP_KEYFRAME_INTERVAL2 "KeyframeInterval2" +#define PROP_FORCE_KEYFRAMES "ForceKeyframes" #define PROP_MODE "Mode" -#define PROP_MODE_NORMAL "Mode.Normal" -#define PROP_MODE_TEMPORAL "Mode.Temporal" -#define PROP_MODE_SEQUENTIAL "Mode.Sequential" - -#define PROP_CONFIGURE "Configure" +#define PROP_MODE_NORMAL "Mode.Normal" +#define PROP_MODE_TEMPORAL "Mode.Temporal" +#define PROP_MODE_SEQUENTIAL "Mode.Sequential" +#define PROP_ICMODE "ICMode" +#define PROP_ICMODE_COMPRESS "ICMode.Normal" +#define PROP_ICMODE_FASTCOMPRESS "ICMode.Fast" +#define PROP_LATENCY "Latency" #define PROP_ABOUT "About" \ No newline at end of file diff --git a/Source/enc-vfw.cpp b/Source/enc-vfw.cpp index 3775844..c7ef765 100644 --- a/Source/enc-vfw.cpp +++ b/Source/enc-vfw.cpp @@ -1,15 +1,19 @@ #include "enc-vfw.h" #include "libobs/obs-encoder.h" +#include #include +#include #include #include #include #include +#include std::map _IdToInfo; #define snprintf sprintf_s +static const size_t preprocessthreads = 4; std::vector> codecCorrections = { // Cinepak Codec @@ -44,6 +48,9 @@ std::vector> codecCorrections = { { "dv50", "dvvideo" }, // Matrox DVCPRO50 }; +std::map> mpeg2hertz; +const uint64_t mpeg2hertz_mult = 0xFFFFFFFF; + std::string FourCCFromInt32(DWORD& fccHandler) { return std::string(reinterpret_cast(&fccHandler), 4); } @@ -85,6 +92,43 @@ std::string FormattedICCError(LRESULT error) { } bool VFW::Initialize() { + // Initialize MPEG-2 Rewriting Map + std::pair native_hertz[] = { + std::make_pair(8, 60.0), + std::make_pair(7, (60000.0 / 1001.0)), + std::make_pair(6, 50.0), + std::make_pair(5, 30.0), + std::make_pair(4, (30000.0 / 1001.0)), + std::make_pair(3, 25.0), + std::make_pair(2, 24.0), + std::make_pair(1, (24000.0 / 1001.0)), + }; + for (auto kv : native_hertz) { + PLOG_DEBUG("(MPEG-2 Rewrite) Native Framerate: %f", kv.second); + mpeg2hertz.insert(std::make_pair(uint64_t(kv.second * mpeg2hertz_mult), + std::make_tuple(kv.first, 0, 0))); + + } + for (auto kv : native_hertz) { + std::stringstream buf; + for (uint8_t num = 0; num < (1 << 2); num++) { + for (uint8_t den = 0; den < (1 << 5); den++) { + if (num == den) + continue; // Don't need the 1:1 ones >_> + + double_t fps = kv.second * (double_t(num + 1) / double_t(den + 1)); + uint64_t key = uint64_t(fps * mpeg2hertz_mult); + if (mpeg2hertz.count(key)) { + continue; // Duplicate. + } + + mpeg2hertz.insert(std::make_pair(key, std::make_tuple(kv.first, num, den))); + buf << fps << " (" << kv.second << " * " << num + 1 << " / " << den + 1 << "), "; + } + } + PLOG_DEBUG("(MPEG-2 Rewrite) Extended Framerates for native %f: %s", kv.second, buf.str().c_str()); + } + // Initialize all VFW Encoders (we can only use one anyway) ICINFO icinfo; std::memset(&icinfo, 0, sizeof(ICINFO)); @@ -174,7 +218,13 @@ const char* VFW::Encoder::get_name(void* type_data) { void VFW::Encoder::get_defaults(obs_data_t *settings) { obs_data_set_default_int(settings, PROP_BITRATE, 0); obs_data_set_default_double(settings, PROP_QUALITY, 100.0); - obs_data_set_default_double(settings, PROP_KEYFRAME_INTERVAL, 0.0); + obs_data_set_default_int(settings, PROP_INTERVAL_TYPE, 0); + obs_data_set_default_double(settings, PROP_KEYFRAME_INTERVAL, 1.0); + obs_data_set_default_int(settings, PROP_KEYFRAME_INTERVAL2, 30); + obs_data_set_default_bool(settings, PROP_FORCE_KEYFRAMES, true); + obs_data_set_default_string(settings, PROP_MODE, PROP_MODE_SEQUENTIAL); + obs_data_set_default_string(settings, PROP_ICMODE, PROP_ICMODE_FASTCOMPRESS); + obs_data_set_default_int(settings, PROP_LATENCY, 3); } obs_properties_t* VFW::Encoder::get_properties(void *data) { @@ -184,8 +234,12 @@ obs_properties_t* VFW::Encoder::get_properties(void *data) { obs_properties_set_param(pr, data, nullptr); obs_property_t* p; - p = obs_properties_add_int_slider(pr, PROP_BITRATE, "Bitrate", 0, 300000, 1); + p = obs_properties_add_button(pr, PROP_CONFIGURE, "Configure", cb_configure); + obs_property_set_visible(p, info->hasConfigure); + + p = obs_properties_add_int_slider(pr, PROP_BITRATE, "Bitrate", 0, 1000000, 1); obs_property_set_visible(p, ((info->icInfo2.dwFlags & VIDCF_CRUNCH) != 0)); + p = obs_properties_add_float_slider(pr, PROP_QUALITY, "Quality", 1, 100, 0.01); obs_property_set_visible(p, ((info->icInfo2.dwFlags & VIDCF_QUALITY) != 0)); @@ -193,7 +247,9 @@ obs_properties_t* VFW::Encoder::get_properties(void *data) { obs_property_list_add_int(p, "Seconds", 0); obs_property_list_add_int(p, "Frames", 1); obs_property_set_modified_callback(p, cb_modified); + p = obs_properties_add_float(pr, PROP_KEYFRAME_INTERVAL, "Keyframe Interval", 0.00, 30.00, 0.01); + p = obs_properties_add_int(pr, PROP_KEYFRAME_INTERVAL2, "Keyframe Interval", 0, 300, 1); p = obs_properties_add_bool(pr, PROP_FORCE_KEYFRAMES, "Force Keyframes"); p = obs_properties_add_list(pr, PROP_MODE, "Mode", OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); @@ -202,8 +258,12 @@ obs_properties_t* VFW::Encoder::get_properties(void *data) { obs_property_list_add_string(p, "Temporal", PROP_MODE_TEMPORAL); obs_property_list_add_string(p, "Sequential", PROP_MODE_SEQUENTIAL); - p = obs_properties_add_button(pr, PROP_CONFIGURE, "Configure", cb_configure); - obs_property_set_visible(p, info->hasConfigure); + p = obs_properties_add_list(pr, PROP_ICMODE, "Compress Mode", OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_property_list_add_string(p, "Normal", PROP_ICMODE_COMPRESS); + obs_property_list_add_string(p, "Fast", PROP_ICMODE_FASTCOMPRESS); + + p = obs_properties_add_int_slider(pr, PROP_LATENCY, "Frame Latency", 0, 10, 1); + p = obs_properties_add_button(pr, PROP_ABOUT, "About", cb_about); obs_property_set_visible(p, info->hasAbout); @@ -217,7 +277,12 @@ bool VFW::Encoder::cb_configure(obs_properties_t *pr, obs_property_t *p, void *d VFW::Info* info = static_cast(obs_properties_get_param(pr)); - HIC hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_FASTCOMPRESS); + HIC hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_COMPRESS); + if (hIC == 0) + hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_FASTCOMPRESS); + if (hIC == 0) + return false; + if (info->stateInfo.size() > 0) { LRESULT err = ICSetState(hIC, info->stateInfo.data(), info->stateInfo.size()); if (err != ICERR_OK) { @@ -248,26 +313,21 @@ bool VFW::Encoder::cb_about(obs_properties_t *pr, obs_property_t *p, void *data) UNREFERENCED_PARAMETER(data); VFW::Info* info = static_cast(obs_properties_get_param(pr)); - HIC hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_FASTCOMPRESS); + HIC hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_COMPRESS); + if (hIC == 0) + hIC = ICOpen(info->icInfo.fccType, info->icInfo.fccHandler, ICMODE_FASTCOMPRESS); + if (hIC == 0) + return false; ICAbout(hIC, GetDesktopWindow()); ICClose(hIC); return false; } -bool VFW::Encoder::cb_modified(obs_properties_t *pr, obs_property_t *p, obs_data_t *data) { - if (strcmp(obs_property_name(p), PROP_INTERVAL_TYPE) == 0) { - int64_t v = obs_data_get_int(data, PROP_INTERVAL_TYPE); - - switch (v) { - case 0: - obs_property_int_set_limits(obs_properties_get(pr, PROP_KEYFRAME_INTERVAL), 0.00, 30.00, 0.01); - break; - case 1: - obs_property_int_set_limits(obs_properties_get(pr, PROP_KEYFRAME_INTERVAL), 0, 300, 1); - break; - } - } +bool VFW::Encoder::cb_modified(obs_properties_t *pr, obs_property_t *, obs_data_t *data) { + int64_t v = obs_data_get_int(data, PROP_INTERVAL_TYPE); + obs_property_set_visible(obs_properties_get(pr, PROP_KEYFRAME_INTERVAL), v == 0); + obs_property_set_visible(obs_properties_get(pr, PROP_KEYFRAME_INTERVAL2), v == 1); return true; } @@ -296,16 +356,18 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { case 0: m_keyframeInterval = max(uint32_t( factor * obs_data_get_double(settings, PROP_KEYFRAME_INTERVAL) - ), 0); + ), 0); break; case 1: - m_keyframeInterval = - (uint32_t)obs_data_get_double(settings, PROP_KEYFRAME_INTERVAL); + m_keyframeInterval = + (uint32_t)obs_data_get_int(settings, PROP_KEYFRAME_INTERVAL2); break; } m_forceKeyframes = obs_data_get_bool(settings, PROP_FORCE_KEYFRAMES); m_bitrate = uint32_t(obs_data_get_int(settings, PROP_BITRATE)); m_quality = uint32_t(obs_data_get_double(settings, PROP_QUALITY) * 100); + m_latency = uint32_t(obs_data_get_int(settings, PROP_LATENCY)); + m_maxQueueSize = (m_latency + 1) * 2; PLOG_INFO("<%s> Initializing... (" "Resolution: %" PRIu32 "x%" PRIu32 ", " @@ -313,23 +375,42 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { "Bitrate: %" PRIu32 ", " "Quality: %0.2f%%, " "Keyframe Interval: %" PRIu32 " (%s), " - "Mode: %s" + "Mode: %s, " + "Compress Mode: %s" ")", myInfo->Name.c_str(), m_width, m_height, m_fpsNum, m_fpsDen, (double_t)m_fpsNum / (double_t)m_fpsDen, m_bitrate, m_quality, m_keyframeInterval, m_forceKeyframes ? "Enforced" : "Standard", - obs_data_get_string(settings, PROP_MODE)); + obs_data_get_string(settings, PROP_MODE), + obs_data_get_string(settings, PROP_ICMODE)); - hIC = ICOpen(myInfo->icInfo.fccType, myInfo->icInfo.fccHandler, ICMODE_FASTCOMPRESS); - if (!hIC) { - PLOG_ERROR("<%s> Failed to initialize.", - myInfo->Name.c_str()); - throw std::exception(); + UINT mainIC = ICMODE_FASTCOMPRESS; + const char* mainICs = "Fast"; + UINT backupIC = ICMODE_COMPRESS; + const char* backupICs = "Normal"; + + if (strcmp(obs_data_get_string(settings, PROP_ICMODE), PROP_ICMODE_COMPRESS) == 0) { + std::swap(mainIC, backupIC); + std::swap(mainICs, backupICs); + } + + hIC = ICOpen(myInfo->icInfo.fccType, myInfo->icInfo.fccHandler, mainIC); + if (hIC == 0) { + PLOG_WARNING( + "<%s> Failed to initialize with %s compression mode, " + "falling back to %s compression mode...", + myInfo->Name.c_str(), mainICs, backupICs); + hIC = ICOpen(myInfo->icInfo.fccType, myInfo->icInfo.fccHandler, backupIC); + if (hIC == 0) { + PLOG_ERROR("<%s> Failed to initialize.", + myInfo->Name.c_str()); + throw std::exception(); + } } else { - PLOG_DEBUG("<%s> Initialized, setting up.", - myInfo->Name.c_str()); + PLOG_DEBUG("<%s> Initialized with %s compression mode, setting up...", + myInfo->Name.c_str(), mainICs); } // Store temporary flags @@ -350,7 +431,7 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { // Load State from memory. if (myInfo->stateInfo.size() > 0) { - LRESULT err = ICSetState(hIC, myInfo->stateInfo.data(), myInfo->stateInfo.size()); + err = ICSetState(hIC, myInfo->stateInfo.data(), myInfo->stateInfo.size()); if (err != ICERR_OK) { PLOG_ERROR("Failed to set state before encoding: %s.", FormattedICCError(err).c_str()); @@ -359,7 +440,7 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { ICSetState(hIC, NULL, 0); } -#pragma region Get Bitmap Information + #pragma region Get Bitmap Information m_bufferInputBitmapInfo.resize(sizeof(BITMAPINFOHEADER)); std::memset(m_bufferInputBitmapInfo.data(), 0, m_bufferInputBitmapInfo.size()); m_inputBitmapInfo = reinterpret_cast(m_bufferInputBitmapInfo.data()); @@ -393,7 +474,7 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { FormattedICCError(err).c_str()); throw std::exception(); } -#pragma endregion Get Bitmap Information + #pragma endregion Get Bitmap Information // Prepare Input Buffers size_t alignedWidth = (m_width / 16 + 1) * 16; @@ -403,7 +484,7 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { // Begin Compression if (m_useNormalCompress) { - LRESULT err = ICCompressBegin(hIC, m_inputBitmapInfo, m_outputBitmapInfo); + err = ICCompressBegin(hIC, m_inputBitmapInfo, m_outputBitmapInfo); if (err != ICERR_OK) { PLOG_ERROR("Unable to begin encoding: %s.", FormattedICCError(err).c_str()); throw std::runtime_error(FormattedICCError(err)); @@ -429,6 +510,12 @@ VFW::Encoder::Encoder(obs_data_t *settings, obs_encoder_t *encoder) { } } + // Thread stuff. These can't fail in most situations. + m_threadShutdown = false; + m_preProcessData.worker = std::thread(threadMain, this, 0); + m_encodeData.worker = std::thread(threadMain, this, 1); + m_postProcessData.worker = std::thread(threadMain, this, 2); + PLOG_INFO("<%s> Started.", myInfo->Name.c_str()); } @@ -438,6 +525,14 @@ void VFW::Encoder::destroy(void* data) { } VFW::Encoder::~Encoder() { + m_threadShutdown = true; + m_preProcessData.cv.notify_all(); + m_preProcessData.worker.join(); + m_encodeData.cv.notify_all(); + m_encodeData.worker.join(); + m_postProcessData.cv.notify_all(); + m_postProcessData.worker.join(); + if (m_useNormalCompress) { ICCompressEnd(hIC); } else { @@ -453,121 +548,54 @@ bool VFW::Encoder::encode(void *data, struct encoder_frame *frame, struct encode } bool VFW::Encoder::encode(struct encoder_frame *frame, struct encoder_packet *packet, bool *received_packet) { - // Vertically invert Image for some reason. + auto tbegin = std::chrono::high_resolution_clock::now(); - size_t maxX = (m_width * 4) / 16; - if (maxX <= 0) - maxX = 1; + namespace sc = std::chrono; + using schrc = std::chrono::high_resolution_clock; - // xF xE xD xC xB xA x9 x8 x7 x6 x5 x4 x3 x2 x1 x0 - // A B G R A B G R A B G R A B G R - - // Swizzle Mask (128-Bit swapping, could do higher but AMD decided to say fuck it) - /* - 0xF 0xE 0xD 0xC - 0xB 0xA 0x9 0x8 - 0x7 0x6 0x5 0x4 - 0x3 0x2 0x1 0x0 - */ - __m128i swizzle = _mm_set_epi8( - 0xF, 0xC, 0xD, 0xE, - 0xB, 0x8, 0x9, 0xA, - 0x7, 0x4, 0x5, 0x6, - 0x3, 0x0, 0x1, 0x2); - - const size_t lineSize = m_width * 4; - for (size_t y = 0; y < m_height; ++y) { - __m128i* pIn = reinterpret_cast<__m128i*>(frame->data[0] + (y * lineSize)); - __m128i* pOut = reinterpret_cast<__m128i*>(m_bufferInput.data() + ((m_height - 1 - y) * lineSize)); - - for (size_t x = 0; x < maxX; ++x) { - __m128i in = _mm_loadu_si128(pIn + x); - __m128i out = _mm_shuffle_epi8(in, swizzle); - _mm_storeu_si128(pOut + x, out); - } + size_t queueSize = 0; + { + std::unique_lock ulock(m_finalPacketsLock); + queueSize = m_finalPackets.size(); } - //for (size_t y = 0; y < m_height; ++y) { - - // uint8_t* lineOut = reinterpret_cast(m_bufferInput.data()) + (lineOutSize * (m_height - y - 1)); - - // for (size_t x = 0; x < m_width; ++x) { - // lineOut[x * 4] = lineIn[x * 4 + 2]; - // lineOut[x * 4 + 1] = lineIn[x * 4 + 1]; - // lineOut[x * 4 + 2] = lineIn[x * 4]; - // lineOut[x * 4 + 3] = lineIn[x * 4 + 3]; - // } - - //} - - bool isKeyframe = false; - bool makeKeyframe = (m_keyframeInterval > 0) && ((frame->pts % m_keyframeInterval) == 0); - - *received_packet = false; - if (m_useNormalCompress) { - DWORD dwFlags, cwCompFlags; - - PLOG_DEBUG("<%s:Normal> PTS: %" PRIu32 ", Keyframe: %s", - myInfo->Name.c_str(), frame->pts, makeKeyframe ? "Yes" : "No"); - LRESULT err = ICCompress(hIC, - makeKeyframe ? ICCOMPRESS_KEYFRAME : 0, - &(m_outputBitmapInfo->bmiHeader), m_bufferOutput.data(), - &(m_inputBitmapInfo->bmiHeader), m_bufferInput.data(), - &dwFlags, &cwCompFlags, - (LONG)frame->pts, - m_useBitrateFlag ? m_bitrate : 0, - m_useQualityFlag ? m_quality : 0, - !makeKeyframe && m_useTemporalFlag ? &(m_prevInputBitmapInfo->bmiHeader) : NULL, - !makeKeyframe && m_useTemporalFlag ? m_bufferPrevInput.data() : NULL); - if (err != ICERR_OK) { - PLOG_ERROR("Unable to encode: %s.", FormattedICCError(err).c_str()); - return false; + bool submittedFrame = false; + long long maxTime = size_t((double_t(m_fpsNum) / double_t(m_fpsDen)) * 1000000000); + while (((*received_packet == false && queueSize >= m_latency) || (submittedFrame == false)) + && (sc::nanoseconds((schrc::now() - tbegin)).count() < maxTime)) { + // Submit frame to PreProcessor + if (!submittedFrame) { + std::unique_lock ulock(m_preProcessData.lock); + if (m_preProcessData.data.size() < m_maxQueueSize) { + m_preProcessData.data.push(std::make_tuple( + std::make_shared>(frame->data[0], frame->data[0] + (frame->linesize[0] * this->m_height)), + frame->pts, + false)); + submittedFrame = true; + m_preProcessData.cv.notify_all(); + } } - // Swap Buffers - m_bufferPrevInput.swap(m_bufferInput); - - // Store some information we need right now. - packet->size = m_outputBitmapInfo->bmiHeader.biSizeImage; - isKeyframe = (cwCompFlags & AVIIF_KEYFRAME) != 0; - - PLOG_DEBUG("<%s:Normal> PTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, - myInfo->Name.c_str(), frame->pts, isKeyframe ? "Yes" : "No", packet->size); - } else { - BOOL keyframe; LONG plSize = (LONG)m_bufferInput.size(); - - PLOG_DEBUG("<%s:Sequential> PTS: %" PRIu32 ", Keyframe: %s", - myInfo->Name.c_str(), frame->pts, makeKeyframe ? "Yes" : "No"); - LPVOID fptr = ICSeqCompressFrame( - &cv, - makeKeyframe ? 1 : 0, - reinterpret_cast(m_bufferInput.data()), - &keyframe, - &plSize); - if (fptr == NULL) { - PLOG_ERROR("Unable to encode."); - return false; + if (!*received_packet) { + std::unique_lock ulock(m_finalPacketsLock); + if (m_finalPackets.size() > m_latency) { + auto front = m_finalPackets.front(); + m_donotuse_datastor = std::get<0>(front); + packet->type = OBS_ENCODER_VIDEO; + packet->data = reinterpret_cast(m_donotuse_datastor->data()); + packet->size = m_donotuse_datastor->size(); + packet->pts = packet->dts = std::get<1>(front); + packet->keyframe = std::get<2>(front); + *received_packet = true; + m_finalPackets.pop(); + PLOG_DEBUG("<%s> PTS: %" PRIu32 ", DTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, + myInfo->Name.c_str(), packet->pts, packet->dts, packet->keyframe ? "Yes" : "No", packet->size); + } } - if (plSize > m_bufferOutput.size()) - m_bufferOutput.resize(plSize); - std::memcpy(m_bufferOutput.data(), fptr, plSize); - packet->size = plSize; - isKeyframe = keyframe != 0; - PLOG_DEBUG("<%s:Sequential> PTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, - myInfo->Name.c_str(), frame->pts, isKeyframe ? "Yes" : "No", packet->size); - return true; + std::this_thread::sleep_for(sc::milliseconds(1)); } - *received_packet = true; - packet->type = OBS_ENCODER_VIDEO; - packet->data = reinterpret_cast(m_bufferOutput.data()); - packet->keyframe = m_forceKeyframes ? makeKeyframe || isKeyframe : isKeyframe; - packet->pts = frame->pts; - packet->dts = frame->pts; - PLOG_DEBUG("<%s> PTS: %" PRIu32 ", DTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, - myInfo->Name.c_str(), packet->pts, packet->dts, packet->keyframe ? "Yes" : "No", packet->size); - return true; } @@ -607,7 +635,384 @@ void VFW::Encoder::get_video_info(void *data, struct video_scale_info *info) { } void VFW::Encoder::get_video_info(struct video_scale_info *info) { - info->format = VIDEO_FORMAT_RGBA; + info->format = VIDEO_FORMAT_BGRA; info->range = VIDEO_RANGE_FULL; - info->colorspace = VIDEO_CS_DEFAULT; + info->colorspace = VIDEO_CS_709; +} + +void VFW::Encoder::threadMain(void *data, int32_t flag) { + reinterpret_cast(data)->threadLocal(flag); +} + +void VFW::Encoder::threadLocal(int32_t flag) { + thread_data* td = &m_preProcessData; + if (flag == 0) { + td = &m_preProcessData; + } else if (flag == 1) { + td = &m_encodeData; + } else if (flag == 2) { + td = &m_postProcessData; + } + + std::unique_lock ulock(td->lock); + while (!m_threadShutdown) { + td->cv.wait(ulock, [this, td] { + return m_threadShutdown || td->data.size() > 0; + }); + if (m_threadShutdown) + break; + + if (flag == 0) { + preProcessLocal(ulock); + } else if (flag == 1) { + encodeLocal(ulock); + } else if (flag == 2) { + postProcessLocal(ulock); + } + } +} + +void VFW::Encoder::preProcessLocal(std::unique_lock& ul) { + auto total_start = std::chrono::high_resolution_clock::now(); + + auto kv = m_preProcessData.data.front(); + ul.unlock(); + + auto invert_start = std::chrono::high_resolution_clock::now(); + std::shared_ptr> inbuf = std::get<0>(kv); + std::shared_ptr> outbuf = inbuf;// std::make_shared>(inbuf->size()); + + size_t halfHeight = m_height / 2; + size_t lineSize = inbuf->size() / m_height; + std::vector tempBuf(lineSize); + for (size_t line = 0; line < halfHeight; line++) { + size_t front = line * lineSize; + size_t back = (m_height - line - 1) * lineSize; + + std::memcpy(tempBuf.data(), inbuf->data() + front, lineSize); + std::memcpy(outbuf->data() + front, inbuf->data() + back, lineSize); + std::memcpy(outbuf->data() + back, tempBuf.data(), lineSize); + } + auto invert_end = std::chrono::high_resolution_clock::now(); + + auto wait_start = std::chrono::high_resolution_clock::now(); + // Do not fill queue if it is > latency. + size_t queueSize = m_maxQueueSize; + while (queueSize >= m_maxQueueSize) { + { + std::unique_lock elock(m_encodeData.lock); + queueSize = m_encodeData.data.size(); + } + + if (queueSize >= m_maxQueueSize) + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + auto wait_end = std::chrono::high_resolution_clock::now(); + + auto queue_start = std::chrono::high_resolution_clock::now(); + { + std::unique_lock plock(m_preProcessData.lock); + std::unique_lock elock(m_encodeData.lock); + m_encodeData.data.push(std::make_tuple(outbuf, std::get<1>(kv), std::get<2>(kv))); + m_encodeData.cv.notify_all(); + m_preProcessData.data.pop(); + } + auto queue_end = std::chrono::high_resolution_clock::now(); + + ul.lock(); + auto total_end = std::chrono::high_resolution_clock::now(); + + auto time_total = std::chrono::duration_cast(total_end - total_start); + auto time_invert = std::chrono::duration_cast(invert_end - invert_start); + auto time_wait = std::chrono::duration_cast(wait_end - wait_start); + auto time_queue = std::chrono::duration_cast(queue_end - queue_start); + PLOG_INFO("[Thread PrePro] Frame %" PRId64 ": " + "Total: %" PRId64 "ns, " + "Invert: %" PRId64 "ns, " + "Wait: %" PRId64 "ns, " + "Queue: %" PRId64 "ns", + std::get<1>(kv), + time_total.count(), + time_invert.count(), + time_wait.count(), + time_queue.count()); +} + +void VFW::Encoder::encodeLocal(std::unique_lock& ul) { + auto total_start = std::chrono::high_resolution_clock::now(); + + auto kv = m_encodeData.data.front(); + ul.unlock(); + + auto encode_start = std::chrono::high_resolution_clock::now(); + bool isKeyframe = false; + bool makeKeyframe = (m_keyframeInterval > 0) && ((std::get<1>(kv) % m_keyframeInterval) == 0); + std::shared_ptr> inbuf = std::get<0>(kv); + std::shared_ptr> outbuf = std::make_shared>(m_bufferOutput.size()); + if (m_useNormalCompress) { + DWORD dwFlags = 0, cwCompFlags = 0; + PLOG_DEBUG("<%s:Normal> PTS: %" PRIu32 ", Keyframe: %s", myInfo->Name.c_str(), std::get<1>(kv), makeKeyframe ? "Yes" : "No"); + LRESULT err = ICCompress(hIC, + makeKeyframe ? ICCOMPRESS_KEYFRAME : 0, + &(m_outputBitmapInfo->bmiHeader), outbuf->data(), + &(m_inputBitmapInfo->bmiHeader), inbuf->data(), + &dwFlags, &cwCompFlags, + (LONG)std::get<1>(kv), + m_useBitrateFlag ? m_bitrate : 0, + m_useQualityFlag ? m_quality : 0, + !makeKeyframe && m_useTemporalFlag ? &(m_prevInputBitmapInfo->bmiHeader) : NULL, + !makeKeyframe && m_useTemporalFlag ? m_bufferPrevInput.data() : NULL); + if (err == ICERR_OK) { + outbuf->resize(m_outputBitmapInfo->bmiHeader.biSizeImage); + //std::memcpy(outbuf->data(), m_bufferOutput.data(), outbuf->size()); + + isKeyframe = (cwCompFlags & AVIIF_KEYFRAME) != 0; + + // Swap Buffers + m_bufferPrevInput.swap(m_bufferInput); + + PLOG_DEBUG("<%s:Normal> PTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, + myInfo->Name.c_str(), std::get<1>(kv), isKeyframe ? "Yes" : "No", outbuf->size()); + } else { + PLOG_ERROR("Unable to encode: %s.", FormattedICCError(err).c_str()); + } + } else { + BOOL keyframe; LONG plSize = (LONG)inbuf->size(); + PLOG_DEBUG("<%s:Sequential> PTS: %" PRIu32 ", Keyframe: %s", + myInfo->Name.c_str(), std::get<1>(kv), makeKeyframe ? "Yes" : "No"); + LPVOID fptr = ICSeqCompressFrame( + &cv, + makeKeyframe ? 1 : 0, + reinterpret_cast(inbuf->data()), + &keyframe, + &plSize); + if (fptr == NULL) { + PLOG_ERROR("Unable to encode."); + } else { + outbuf->resize(plSize); + std::memcpy(outbuf->data(), fptr, outbuf->size()); + isKeyframe = keyframe != 0; + + PLOG_DEBUG("<%s:Sequential> PTS: %" PRIu32 ", Keyframe: %s, Size: %" PRIu32, + myInfo->Name.c_str(), std::get<1>(kv), isKeyframe ? "Yes" : "No", outbuf->size()); + } + } + + isKeyframe = m_forceKeyframes ? makeKeyframe || isKeyframe : isKeyframe; + auto encode_end = std::chrono::high_resolution_clock::now(); + + auto wait_start = std::chrono::high_resolution_clock::now(); + // Do not fill queue if it is > latency. + size_t queueSize = m_maxQueueSize; + while (queueSize >= m_maxQueueSize) { + { + std::unique_lock elock(m_postProcessData.lock); + queueSize = m_postProcessData.data.size(); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + auto wait_end = std::chrono::high_resolution_clock::now(); + + auto queue_start = std::chrono::high_resolution_clock::now(); + { + std::unique_lock elock(m_encodeData.lock); + std::unique_lock plock(m_postProcessData.lock); + m_postProcessData.data.push(std::make_tuple(outbuf, std::get<1>(kv), isKeyframe)); + m_postProcessData.cv.notify_all(); + m_encodeData.data.pop(); + } + auto queue_end = std::chrono::high_resolution_clock::now(); + + ul.lock(); + auto total_end = std::chrono::high_resolution_clock::now(); + + auto time_total = std::chrono::duration_cast(total_end - total_start); + auto time_encode = std::chrono::duration_cast(encode_end - encode_start); + auto time_wait = std::chrono::duration_cast(wait_end - wait_start); + auto time_queue = std::chrono::duration_cast(queue_end - queue_start); + PLOG_INFO("[Thread Encode] Frame %" PRId64 ": " + "Total: %" PRId64 "ns, " + "Encode: %" PRId64 "ns, " + "Wait: %" PRId64 "ns, " + "Queue: %" PRId64 "ns", + std::get<1>(kv), + time_total.count(), + time_encode.count(), + time_wait.count(), + time_queue.count()); +} + +void MatroxM2VBitstreamFixer(std::shared_ptr>& ptr, std::pair framerate) { + // Matrox developers are idiots. Their MPEG-2 codec flags the content + // as interlaced top-field top-displayed, but in reality there is a + // progressive frame there. But that isn't the only issue. + // They also have structures in the stream that are larger than the + // standard allows for, or even invalid user data (all 0s). It's just + // a big bunch of "How did this ever work?" ... + + // Find best match FPS + double_t sourceHertz = double_t(framerate.first) / double_t(framerate.second); + uint64_t sourceKey = uint64_t(sourceHertz * mpeg2hertz_mult); + std::pair> bestMatch; + uint64_t bestMatchDiff = UINT64_MAX; + for (auto kv : mpeg2hertz) { + uint64_t diff = uint64_t(abs(int64_t(sourceKey) - int64_t(kv.first))); + if (diff < bestMatchDiff) { + bestMatch = kv; + bestMatchDiff = diff; + } + } + #ifdef _DEBUG + PLOG_DEBUG("(MPEG-2 Rewrite) Best Match for Content: %f (Diff: %llu idx: %i, extn: %i, extd: %i)", + double_t(bestMatch.first / mpeg2hertz_mult), + bestMatchDiff, + std::get<0>(bestMatch.second), + std::get<1>(bestMatch.second), + std::get<2>(bestMatch.second)); + #endif + + std::vector* buffer = ptr.get(); + + // Rewrite stream + size_t streamPosition = 0, streamSize = buffer->size(); + while ((streamPosition < streamSize) && ((streamSize - streamPosition) >= 4)) { + uint8_t blockId = (uint8_t)(*buffer)[streamPosition + 3]; + streamPosition += 4; + + switch (blockId) { + case 0xB3: // Sequence Header + { + #ifdef _DEBUG + PLOG_DEBUG("(MPEG-2 Rewrite) Sequence Header at %" PRIu64, streamPosition); + #endif + // Rewrite Framerate + char b = (*buffer)[streamPosition + 3]; + char fpsflag = std::get<0>(bestMatch.second); + (*buffer)[streamPosition + 3] = (b & 0xF0) + (fpsflag & 0x0F); + streamPosition += 8; + break; + } + case 0xB5: + { +// streamPosition += 1; + char type = ((*buffer)[streamPosition] & 0xF0) >> 4; + switch (type) { + case 0b0001: + { // Sequence Extension (Progressive, FPS, ChromaFormat possible) + #ifdef _DEBUG + PLOG_DEBUG("(MPEG-2 Rewrite) Sequence Extension at %" PRIu64, streamPosition); + #endif + (*buffer)[streamPosition + 1] |= 1 << 3; // Flag Progressive + (*buffer)[streamPosition + 5] = // Rewrite FPS Ext + ((*buffer)[streamPosition + 5] & 0x80) + | ((std::get<1>(bestMatch.second) & 0x3) << 5) + | ((std::get<2>(bestMatch.second) & 0x1F)); + streamPosition += 6; + break; + } + case 0b0010: + { + #ifdef _DEBUG + PLOG_DEBUG("(MPEG-2 Rewrite) Sequence Display Extension at %" PRIu64, streamPosition); + #endif + if ((*buffer)[streamPosition] & 0b1) { + streamPosition += 8; + } else { + streamPosition += 5; + } + } + break; + case 0b1000: + { // Chroma Stuff? + #ifdef _DEBUG + PLOG_DEBUG("(MPEG-2 Rewrite) Picture Coding Extension at %" PRIu64, streamPosition); + #endif + // Picture Structure + (*buffer)[streamPosition + 2] |= 0x3; // Full Frame + (*buffer)[streamPosition + 3] &= ~(1 << 7); // top field first + (*buffer)[streamPosition + 3] &= ~(1 << 1); // repeat first field + (*buffer)[streamPosition + 4] |= 1 << 7; // progressive + if ((*buffer)[streamPosition + 4] & 0b1000000) { + streamPosition += 7; + } else { + streamPosition += 5; + } + break; + } + #ifdef _DEBUG + default: + PLOG_DEBUG("(MPEG-2 Rewrite) Unknown Extension %" PRIx8 " at %" PRIu64, type, streamPosition); + break; + #endif + } + + break; + } + } + + // Seek to a valid position. + while ((streamPosition < buffer->size()) && ((buffer->size() - streamPosition) >= 4) + && ( + ((*buffer)[streamPosition] != 0) + || ((*buffer)[streamPosition + 1] != 0) + || ((*buffer)[streamPosition + 2] != 1) + )) { + ++streamPosition; + } + } +} + +void VFW::Encoder::postProcessLocal(std::unique_lock& ul) { + auto total_start = std::chrono::high_resolution_clock::now(); + + auto kv = m_postProcessData.data.front(); + ul.unlock(); + + auto bitstream_start = std::chrono::high_resolution_clock::now(); + if ((myInfo->Id == "mvcVfwMpeg2-mmes") + || (myInfo->Id == "mvcVfwMpeg2Alpha-m704") + || (myInfo->Id == "mvcVfwMpeg2HD-m701") + || (myInfo->Id == "mvcVfwMpeg2Alpha-m705")) { + MatroxM2VBitstreamFixer(std::get<0>(kv), std::make_pair(m_fpsNum, m_fpsDen)); + } + auto bitstream_end = std::chrono::high_resolution_clock::now(); + + auto wait_start = std::chrono::high_resolution_clock::now(); + // Do not fill queue if it is > latency. + size_t queueSize = m_maxQueueSize; + while (queueSize >= m_maxQueueSize) { + { + std::unique_lock flock(m_finalPacketsLock); + queueSize = m_finalPackets.size(); + } + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + auto wait_end = std::chrono::high_resolution_clock::now(); + + auto queue_start = std::chrono::high_resolution_clock::now(); + { + std::unique_lock plock(m_postProcessData.lock); + std::unique_lock flock(m_finalPacketsLock); + m_finalPackets.push(kv); + m_postProcessData.data.pop(); + } + auto queue_end = std::chrono::high_resolution_clock::now(); + + ul.lock(); + auto total_end = std::chrono::high_resolution_clock::now(); + + auto time_total = std::chrono::duration_cast(total_end - total_start); + auto time_bitstream = std::chrono::duration_cast(bitstream_end - bitstream_start); + auto time_wait = std::chrono::duration_cast(wait_end - wait_start); + auto time_queue = std::chrono::duration_cast(queue_end - queue_start); + PLOG_INFO("[Thread PostPr] Frame %" PRId64 ": " + "Total: %" PRId64 "ns, " + "Bitstream: %" PRId64 "ns, " + "Wait: %" PRId64 "ns, " + "Queue: %" PRId64 "ns", + std::get<1>(kv), + time_total.count(), + time_bitstream.count(), + time_wait.count(), + time_queue.count()); }