|
From: <sv...@op...> - 2026-05-20 15:45:45
|
Author: manx Date: Wed May 20 17:45:31 2026 New Revision: 25333 URL: https://source.openmpt.org/browse/openmpt/?op=revision&rev=25333 Log: [Imp] openmpt/soundbase/SampleConvert.hpp: Reduce scope of fastround() implementation. Originally, this had been motivated by inefficient code generation with MSVC. However, this decision was made when we were using /fp:fast or equivalent settings in the past, where in particular it can avoid a call to a library function for SSE2 targets. With /fp:precise, this turns out to be a pessimization when targeting SSE2, which is the most common target for 32-bit. Only use round(x)=floor(x+0.5) when MSVC actually generates simpler code, otherwise prefer the canonical std::round(x). For modern amd64 (SSE4.2 or AVX2), this probably needs further benchmarking in context in order to determine whether the inlined additional code folds away in context. For now, the previous behaviour is kept for these targets. Modified: trunk/OpenMPT/src/openmpt/soundbase/SampleConvert.hpp Modified: trunk/OpenMPT/src/openmpt/soundbase/SampleConvert.hpp ============================================================================== --- trunk/OpenMPT/src/openmpt/soundbase/SampleConvert.hpp Wed May 20 17:28:03 2026 (r25332) +++ trunk/OpenMPT/src/openmpt/soundbase/SampleConvert.hpp Wed May 20 17:45:31 2026 (r25333) @@ -7,6 +7,7 @@ #include "openmpt/all/BuildSettings.hpp" #include "mpt/base/arithmetic_shift.hpp" +#include "mpt/base/detect.hpp" #include "mpt/base/macros.hpp" #include "mpt/base/math.hpp" #include "mpt/base/saturate_cast.hpp" @@ -31,7 +32,66 @@ +// fastround: +// Improves code generation on certain MSVC targets. 
+// +// arch SSE /fp: std::round fastround better +// +// i386 x87 fast call fixup+call - +// i386 SSE2 fast call inline+fixup + +// amd64 SSE2 fast call inline + +// amd64 SSE4.2 fast inline+fixup inline + +// amd64 AVX2 fast inline+fixup inline + +// +// i386 x87 precise call fixup+call - +// i386 SSE2 precise call fixup+call - +// amd64 SSE2 precise call call+fixup - +// amd64 SSE4.2 precise inline+fixup inline + +// amd64 AVX2 precise inline+fixup inline + +// +// See <https://godbolt.org/z/dYYGenjKT> for a microbenchmark. +// +// TODO: Real-world benchmark of amd64 SSE4.2 and amd64 AVX2. +// The inlined additional code might fold away, or nearly away, +// in which case the canonical semantically correct std::round should be preferred. + #if MPT_COMPILER_MSVC +#if (defined(_M_FP_FAST) && (_M_FP_FAST == 1)) +// /fp:fast +#if 0 +#elif MPT_ARCH_X86 && (defined(MPT_ARCH_X86_FPU) && !defined(MPT_ARCH_X86_SSE2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 0 +#elif MPT_ARCH_X86 && (defined(MPT_ARCH_X86_SSE2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_SSE2) && !defined(MPT_ARCH_X86_SSE4_2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_SSE4_2) && !defined(MPT_ARCH_X86_AVX2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_AVX2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#endif +#else +// /fp:precise / ... 
+#if 0 +#elif MPT_ARCH_X86 && (defined(MPT_ARCH_X86_FPU) && !defined(MPT_ARCH_X86_SSE2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 0 +#elif MPT_ARCH_X86 && (defined(MPT_ARCH_X86_SSE2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 0 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_SSE2) && !defined(MPT_ARCH_X86_SSE4_2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 0 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_SSE4_2) && !defined(MPT_ARCH_X86_AVX2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#elif MPT_ARCH_AMD64 && (defined(MPT_ARCH_X86_AVX2)) +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 1 +#endif +#endif +#endif // MPT_COMPILER_MSVC + +#ifndef MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND +#define MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND 0 +#endif + +#if MPT_SOUNDBASE_SAMPLECONVERT_HPP_FASTROUND template <typename Tfloat> MPT_ATTR_ALWAYSINLINE MPT_INLINE_FORCE Tfloat fastround(Tfloat x) { |