From dee89e25b887a3b02fccfe364ada18592df9aae2 Mon Sep 17 00:00:00 2001 From: Jeff Hutchinson Date: Mon, 28 Dec 2020 18:14:21 -0500 Subject: [PATCH] Changes profiler to use the high precision timer built into windows. Also removes the legacy GetTickCount() fallback as that is no longer necessary in modern versions of windows (Windows XP and greater support QueryPerformanceCounter) --- Engine/source/platform/profiler.cpp | 108 ++++++----------------- Engine/source/platform/profiler.h | 2 +- Engine/source/platformWin32/winTimer.cpp | 38 ++------ 3 files changed, 38 insertions(+), 110 deletions(-) diff --git a/Engine/source/platform/profiler.cpp b/Engine/source/platform/profiler.cpp index 21671218d..ead8380f0 100644 --- a/Engine/source/platform/profiler.cpp +++ b/Engine/source/platform/profiler.cpp @@ -23,11 +23,9 @@ #include "platform/platform.h" #if defined(TORQUE_OS_WIN) -#include <windows.h> // for SetThreadAffinityMask -#endif - -#if defined(TORQUE_OS_MAC) -#include <mach/mach_time.h> +#include <windows.h> // for SetThreadAffinityMask, QueryPerformanceCounter, QueryPerformanceFrequency +#elif defined(TORQUE_OS_MAC) +#include <mach/mach_time.h> // for mach_absolute_time, mach_timebase_info #endif #include "core/stream/fileStream.h" @@ -63,111 +61,61 @@ Vector gProfilerNodeStack; #define PROFILER_DEBUG_POP_NODE() ; #endif -#if defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM) -// platform specific get hires times... 
-void startHighResolutionTimer(U32 time[2]) -{ - //time[0] = Platform::getRealMilliseconds(); +#if defined(TORQUE_OS_WIN) - __asm - { - push eax - push edx - push ecx - rdtsc - mov ecx, time - mov DWORD PTR [ecx], eax - mov DWORD PTR [ecx + 4], edx - pop ecx - pop edx - pop eax - } -} - -U32 endHighResolutionTimer(U32 time[2]) -{ - U32 ticks; - //ticks = Platform::getRealMilliseconds() - time[0]; - //return ticks; - - __asm - { - push eax - push edx - push ecx - //db 0fh, 31h - rdtsc - mov ecx, time - sub edx, DWORD PTR [ecx+4] - sbb eax, DWORD PTR [ecx] - mov DWORD PTR ticks, eax - pop ecx - pop edx - pop eax - } - return ticks; -} - -#elif defined(TORQUE_SUPPORTS_GCC_INLINE_X86_ASM) +static bool sQueryPerformanceInit = false; +static U64 sQueryPerformanceFrequency = 0; // platform specific get hires times... -void startHighResolutionTimer(U32 time[2]) +void startHighResolutionTimer(U64 &time) { - __asm__ __volatile__( - "rdtsc\n" - : "=a" (time[0]), "=d" (time[1]) - ); + QueryPerformanceCounter((LARGE_INTEGER*)&time); } -U32 endHighResolutionTimer(U32 time[2]) +F64 endHighResolutionTimer(U64 time) { - U32 ticks; - __asm__ __volatile__( - "rdtsc\n" - "sub 0x4(%%ecx), %%edx\n" - "sbb (%%ecx), %%eax\n" - : "=a" (ticks) : "c" (time) - ); - return ticks; + if (!sQueryPerformanceInit) + { + sQueryPerformanceInit = true; + QueryPerformanceFrequency((LARGE_INTEGER*)&sQueryPerformanceFrequency); + } + + U64 current; + QueryPerformanceCounter((LARGE_INTEGER*)&current); + + return ((1000.0 * static_cast<F64>(current-time)) / static_cast<F64>(sQueryPerformanceFrequency)); } #elif defined(TORQUE_OS_MAC) - -void startHighResolutionTimer(U32 time[2]) { - U64 now = mach_absolute_time(); - AssertFatal(sizeof(U32[2]) == sizeof(U64), "Can't pack mach_absolute_time into U32[2]"); - memcpy(time, &now, sizeof(U64)); +void startHighResolutionTimer(U64 &time) { + time = mach_absolute_time(); } -U32 endHighResolutionTimer(U32 time[2]) { +F64 endHighResolutionTimer(U64 time) { static 
mach_timebase_info_data_t sTimebaseInfo = {0, 0}; U64 now = mach_absolute_time(); - AssertFatal(sizeof(U32[2]) == sizeof(U64), "Can't pack mach_absolute_time into U32[2]"); - U64 then; - memcpy(&then, time, sizeof(U64)); if(sTimebaseInfo.denom == 0){ mach_timebase_info(&sTimebaseInfo); } // Handle the micros/nanos conversion first, because shedding a few bits is better than overflowing. - U64 elapsedMicros = ((now - then) / 1000) * sTimebaseInfo.numer / sTimebaseInfo.denom; + F64 elapsedMicros = (static_cast<F64>(now - time) / 1000.0) * static_cast<F64>(sTimebaseInfo.numer) / static_cast<F64>(sTimebaseInfo.denom); - return (U32)elapsedMicros; // Just truncate, and hope we didn't overflow + return elapsedMicros; // Just truncate, and hope we didn't overflow } #else -void startHighResolutionTimer(U32 time[2]) +void startHighResolutionTimer(U64 &time) { - time[0] = Platform::getRealMilliseconds(); + time = (U64)Platform::getRealMilliseconds(); } -U32 endHighResolutionTimer(U32 time[2]) +F64 endHighResolutionTimer(U64 time) { - U32 ticks = Platform::getRealMilliseconds() - time[0]; - return ticks; + return (F64)Platform::getRealMilliseconds() - time; } #endif diff --git a/Engine/source/platform/profiler.h b/Engine/source/platform/profiler.h index 3b11d93fd..b2cc70715 100644 --- a/Engine/source/platform/profiler.h +++ b/Engine/source/platform/profiler.h @@ -153,7 +153,7 @@ struct ProfilerData U32 mHash; U32 mSubDepth; U32 mInvokeCount; - U32 mStartTime[2]; + U64 mStartTime; F64 mTotalTime; F64 mSubTime; #ifdef TORQUE_ENABLE_PROFILE_PATH diff --git a/Engine/source/platformWin32/winTimer.cpp b/Engine/source/platformWin32/winTimer.cpp index 77e820b07..e5c8639a1 100644 --- a/Engine/source/platformWin32/winTimer.cpp +++ b/Engine/source/platformWin32/winTimer.cpp @@ -30,14 +30,11 @@ class Win32Timer : public PlatformTimer { private: - U32 mTickCountCurrent; - U32 mTickCountNext; S64 mPerfCountCurrent; S64 mPerfCountNext; S64 mFrequency; F64 mPerfCountRemainderCurrent; F64 
mPerfCountRemainderNext; - bool mUsingPerfCounter; public: Win32Timer() @@ -45,43 +42,26 @@ public: mPerfCountRemainderCurrent = 0.0f; mPerfCountRemainderNext = 0.0f; - // Attempt to use QPC for high res timing, otherwise fallback to GTC. - mUsingPerfCounter = QueryPerformanceFrequency((LARGE_INTEGER *) &mFrequency); - if(mUsingPerfCounter) - mUsingPerfCounter = QueryPerformanceCounter((LARGE_INTEGER *) &mPerfCountCurrent); + QueryPerformanceFrequency((LARGE_INTEGER *) &mFrequency); + QueryPerformanceCounter((LARGE_INTEGER *) &mPerfCountCurrent); mPerfCountNext = 0.0; - if (!mUsingPerfCounter) - mTickCountCurrent = GetTickCount(); - else - mTickCountCurrent = 0; - mTickCountNext = 0; } const S32 getElapsedMs() { - if(mUsingPerfCounter) - { - // Use QPC, update remainders so we don't leak time, and return the elapsed time. - QueryPerformanceCounter( (LARGE_INTEGER *) &mPerfCountNext); - F64 elapsedF64 = (1000.0 * F64(mPerfCountNext - mPerfCountCurrent) / F64(mFrequency)); - elapsedF64 += mPerfCountRemainderCurrent; - U32 elapsed = (U32)mFloor(elapsedF64); - mPerfCountRemainderNext = elapsedF64 - F64(elapsed); + // Use QPC, update remainders so we don't leak time, and return the elapsed time. + QueryPerformanceCounter( (LARGE_INTEGER *) &mPerfCountNext); + F64 elapsedF64 = (1000.0 * F64(mPerfCountNext - mPerfCountCurrent) / F64(mFrequency)); + elapsedF64 += mPerfCountRemainderCurrent; + U32 elapsed = (U32)mFloor(elapsedF64); + mPerfCountRemainderNext = elapsedF64 - F64(elapsed); - return elapsed; - } - else - { - // Do something naive with GTC. - mTickCountNext = GetTickCount(); - return mTickCountNext - mTickCountCurrent; - } + return elapsed; } void reset() { // Do some simple copying to reset the timer to 0. - mTickCountCurrent = mTickCountNext; mPerfCountCurrent = mPerfCountNext; mPerfCountRemainderCurrent = mPerfCountRemainderNext; }