From: <sag...@us...> - 2014-02-23 22:43:46
|
Revision: 3765 http://sourceforge.net/p/modplug/code/3765 Author: saga-games Date: 2014-02-23 22:43:39 +0000 (Sun, 23 Feb 2014) Log Message: ----------- [Imp] Add SSE3 implementation for sample autotuner (for big samples, this doubles the processing speed for me) [Mod] Added command line switch /noAssembly which turns off MMX/SSE/3DNow!-accelerated code Modified Paths: -------------- trunk/OpenMPT/common/BuildSettings.h trunk/OpenMPT/mptrack/Autotune.cpp trunk/OpenMPT/mptrack/Autotune.h trunk/OpenMPT/mptrack/Mptrack.cpp Modified: trunk/OpenMPT/common/BuildSettings.h =================================================================== --- trunk/OpenMPT/common/BuildSettings.h 2014-02-23 20:45:52 UTC (rev 3764) +++ trunk/OpenMPT/common/BuildSettings.h 2014-02-23 22:43:39 UTC (rev 3765) @@ -40,7 +40,7 @@ #ifdef _M_IX86 -// Generate general x86 inline assembly. +// Generate general x86 inline assembly / intrinsics. #define ENABLE_X86 // Generate inline assembly using MMX instructions (only used when the CPU supports it). @@ -49,7 +49,7 @@ // Generate inline assembly using SSE instructions (only used when the CPU supports it). #define ENABLE_SSE -// Generate inline assembly using SSE2 instructions (only used when the CPU supports it). +// Generate inline assembly using SSE2/SSE3 instructions (only used when the CPU supports it). #define ENABLE_SSE2 // Generate inline assembly using AMD specific instruction set extensions (only used when the CPU supports it). @@ -57,10 +57,10 @@ #elif defined(_M_X64) -// Generate general x64 inline assembly. +// Generate general x64 inline assembly / intrinsics. #define ENABLE_X64 -// Generate inline assembly using SSE2 instructions (only used when the CPU supports it). +// Generate inline assembly using SSE2/SSE3 instructions (only used when the CPU supports it). #define ENABLE_SSE2 #endif Modified: trunk/OpenMPT/mptrack/Autotune.cpp =================================================================== --- trunk/OpenMPT/mptrack/Autotune.cpp 2014-02-23 20:45:52 UTC (rev 3764) +++ trunk/OpenMPT/mptrack/Autotune.cpp 2014-02-23 22:43:39 UTC (rev 3765) @@ -14,6 +14,9 @@ #include "../common/thread.h" #include "../soundlib/Sndfile.h" #include "Autotune.h" +#ifdef ENABLE_SSE2 +#include <emmintrin.h> +#endif // The more bins, the more autocorrelations are done and the more precise the result is. @@ -66,16 +69,16 @@ const T* sample = origSample + pos; - int16 data = 0; // Enough for 256 channels... :) + int32 data = 0; // More than enough for 256 channels... :) for(uint8 chn = 0; chn < channels; chn++) { // We only want the MSB. - data += static_cast<int16>(sample[chn] >> ((sizeof(T) - 1) * 8)); + data += static_cast<int32>(sample[chn] >> ((sizeof(T) - 1) * 8)); } data /= channels; - sampleData[i] = static_cast<int8>(data); + sampleData[i] = static_cast<int16>(data); } } @@ -109,12 +112,13 @@ // We should analyse at least a one second (= GetSampleRate() samples) long sample. sampleLength = std::max<SmpLength>(sampleLoopEnd, sample.GetSampleRate(modType)) + maxShift; + sampleLength = (sampleLength + 7) & ~7; if(sampleData != nullptr) { delete[] sampleData; } - sampleData = new (std::nothrow) int8[sampleLength]; + sampleData = new int16[sampleLength]; if(sampleData == nullptr) { return false; @@ -148,18 +152,21 @@ { std::vector<uint64> histogram; double pitchReference; - int8 *sampleData; + int16 *sampleData; SmpLength processLength; uint32 sampleFreq; int startNote, endNote; - }; + DWORD WINAPI Autotune::AutotuneThread(void *i) //-------------------------------------------- { AutotuneThreadData &info = *static_cast<AutotuneThreadData *>(i); info.histogram.resize(HISTORY_BINS, 0); +#ifdef ENABLE_SSE2 + const bool useSSE = (ProcSupport & PROCSUPPORT_SSE3) != 0; +#endif // Do autocorrelation and save results in a note histogram (restriced to one octave). for(int note = info.startNote, noteBin = note; note < info.endNote; note++, noteBin++) @@ -173,13 +180,35 @@ const SmpLength autocorrShift = NoteToShift(info.sampleFreq, note, info.pitchReference); uint64 autocorrSum = 0; - const int8 *normalData = info.sampleData; - const int8 *shiftedData = info.sampleData + autocorrShift; - // Add up squared differences of all values - for(SmpLength i = info.processLength; i != 0; i--, normalData++, shiftedData++) + +#ifdef ENABLE_SSE2 + if(useSSE) { - autocorrSum += (*normalData - *shiftedData) * (*normalData - *shiftedData); + const __m128i *normalData = reinterpret_cast<const __m128i *>(info.sampleData); + const __m128i *shiftedData = reinterpret_cast<const __m128i *>(info.sampleData + autocorrShift); + for(SmpLength i = info.processLength / 8; i != 0; i--) + { + __m128i normal = _mm_load_si128(normalData++); + __m128i shifted = _mm_loadu_si128(shiftedData++); + __m128i diff = _mm_sub_epi16(normal, shifted); // 8 16-bit differences + __m128i squares = _mm_madd_epi16(diff, diff); // Multiply and add: 4 32-bit squares + squares = _mm_hadd_epi32(squares, squares); // This is SSE3! + squares = _mm_hadd_epi32(squares, squares); + autocorrSum += _mm_cvtsi128_si32(squares); + //autocorrSum += squares.m128i_i32[0] +squares.m128i_i32[1] + squares.m128i_i32[2] + squares.m128i_i32[3]; // For SSE2 only + } + } else +#endif + { + const int16 *normalData = info.sampleData; + const int16 *shiftedData = info.sampleData + autocorrShift; + // Add up squared differences of all values + for(SmpLength i = info.processLength; i != 0; i--, normalData++, shiftedData++) + { + autocorrSum += (*normalData - *shiftedData) * (*normalData - *shiftedData); + } } + info.histogram[noteBin] += autocorrSum; } return 0; Modified: trunk/OpenMPT/mptrack/Autotune.h =================================================================== --- trunk/OpenMPT/mptrack/Autotune.h 2014-02-23 20:45:52 UTC (rev 3764) +++ trunk/OpenMPT/mptrack/Autotune.h 2014-02-23 22:43:39 UTC (rev 3765) @@ -23,7 +23,7 @@ SmpLength selectionStart, selectionEnd; - int8 *sampleData; + int16 *sampleData; SmpLength sampleLength; public: Modified: trunk/OpenMPT/mptrack/Mptrack.cpp =================================================================== --- trunk/OpenMPT/mptrack/Mptrack.cpp 2014-02-23 20:45:52 UTC (rev 3764) +++ trunk/OpenMPT/mptrack/Mptrack.cpp 2014-02-23 22:43:39 UTC (rev 3765) @@ -317,12 +317,13 @@ { if ((lpszParam) && (bFlag)) { - if (!lstrcmpi(lpszParam, "nologo")) { m_bShowSplash = FALSE; return; } else - if (!lstrcmpi(lpszParam, "nodls")) { m_bNoDls = true; return; } else - if (!lstrcmpi(lpszParam, "noplugs")) { m_bNoPlugins = true; return; } else - if (!lstrcmpi(lpszParam, "portable")) { m_bPortable = true; return; } else - if (!lstrcmpi(lpszParam, "noSettingsOnNewVersion")) { m_bNoSettingsOnNewVersion = true; return; } - if (!lstrcmpi(lpszParam, "fullMemDump")) { ExceptionHandler::fullMemDump = true; return; } + if (!lstrcmpi(lpszParam, _T("nologo"))) { m_bShowSplash = FALSE; return; } else + if (!lstrcmpi(lpszParam, _T("nodls"))) { m_bNoDls = true; return; } else + if (!lstrcmpi(lpszParam, _T("noplugs"))) { m_bNoPlugins = true; return; } else + if (!lstrcmpi(lpszParam, _T("portable"))) { m_bPortable = true; return; } else + if (!lstrcmpi(lpszParam, _T("noSettingsOnNewVersion"))) { m_bNoSettingsOnNewVersion = true; return; } + if (!lstrcmpi(lpszParam, _T("fullMemDump"))) { ExceptionHandler::fullMemDump = true; return; } + if (!lstrcmpi(lpszParam, _T("noAssembly"))) { ProcSupport = 0; return; } } CCommandLineInfo::ParseParam(lpszParam, bFlag, bLast); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |