/* Copyright 2005-2009 Intel Corporation. All Rights Reserved. This file is part of Threading Building Blocks. Threading Building Blocks is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation. Threading Building Blocks is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Threading Building Blocks; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA As a special exception, you may use this file as part of a free software library without restriction. Specifically, if other files instantiate templates or use macros or inline functions from this file, or you compile this file and link it with other files to produce an executable, this file does not by itself cause the resulting executable to be covered by the GNU General Public License. This exception does not however invalidate any other reasons why the executable file might be covered by the GNU General Public License. */ #include "tbb/task_scheduler_init.h" #include "tbb/tick_count.h" #include #include #include #include #include #include #include "../src/test/harness.h" #if __linux__ || __APPLE__ || __FreeBSD__ #include #endif /* __APPLE__ */ // The code, performance of which is to be measured, is surrounded by the StartSimpleTiming // and StopSimpleTiming macros. It is called "target code" or "code of interest" hereafter. // // The target code is executed inside the nested loop. Nesting is necessary to allow // measurements on arrays that fit cache of a particular level, while making the load // big enough to eliminate the influence of random deviations. // // Macro StartSimpleTiming defines reduction variable "util::anchor", which may be modified (usually // by adding to) by the target code. This can be necessary to prevent optimizing compilers // from throwing out the code of interest. Besides, if the target code is complex enough, // make sure that all its branches contribute (directly or indirectly) to the value // being added to the "util::anchor" variable. // // To factor out overhead introduced by the measurement infra code it is recommended to make // a calibration run with target code replaced by a no-op (but still modifying "sum"), and // store the resulting time in the "util::base" variable. // // A generally good approach is to make the target code use elements of a preliminary // initialized array. Then for calibration run you just need to add vector elements // to the "sum" variable. To get rid of memory access delays make the array small // enough to fit L2 or L1 cache (play with StartSimpleTiming arguments if necessary). // // Macro CalibrateSimpleTiming performs default calibration using "util::anchor += i;" operation. // // Macro ANCHOR_TYPE defines the type of the reduction variable. If it was not // defined before including this header, it is defined as size_t. Depending on // the target code modern super scalar architectures may blend reduction operation // and instructions of interest differently for different target alternatives. So // you may play with the type to minimize out-of-order and parallel execution impact // on the calibration time veracity. You may even end up with different reduction // variable types (and different calibration times) for different measurements. namespace util { typedef std::vector durations_t; void trace_histogram ( const durations_t& t, char* histogramFileName ) { FILE* f = histogramFileName ? fopen(histogramFileName, "wt") : stdout; size_t n = t.size(); const size_t num_buckets = 100; double min_val = *std::min_element(t.begin(), t.end()), max_val = *std::max_element(t.begin(), t.end()), bucket_size = (max_val - min_val) / num_buckets; std::vector hist(num_buckets + 1, 0); for ( size_t i = 0; i < n; ++i ) ++hist[size_t((t[i]-min_val)/bucket_size)]; fprintf (f, "Histogram: nvals = %u, min = %g, max = %g, nbuckets = %u\n", (unsigned)n, min_val, max_val, (unsigned)num_buckets); double bucket = min_val; for ( size_t i = 0; i <= num_buckets; ++i, bucket+=bucket_size ) fprintf (f, "%12g\t%u\n", bucket, (unsigned)hist[i]); fclose(f); } double average ( const durations_t& d, double& variation_percent, double& std_dev_percent ) { durations_t t = d; if ( t.size() > 5 ) { t.erase(std::min_element(t.begin(), t.end())); t.erase(std::max_element(t.begin(), t.end())); } size_t n = t.size(); double sum = 0, min_val = *std::min_element(t.begin(), t.end()), max_val = *std::max_element(t.begin(), t.end()); for ( size_t i = 0; i < n; ++i ) sum += t[i]; double avg = sum / n, std_dev = 0; for ( size_t i = 0; i < n; ++i ) { double dev = fabs(t[i] - avg); std_dev += dev * dev; } std_dev = sqrt(std_dev / n); std_dev_percent = std_dev / avg * 100; variation_percent = 100 * (max_val - min_val) / avg; return avg; } static int num_threads; static double base = 0, base_dev = 0, base_dev_percent = 0; static char *empty_fmt = ""; static int rate_field_len = 11; #if !defined(ANCHOR_TYPE) #define ANCHOR_TYPE size_t #endif static ANCHOR_TYPE anchor = 0; static double sequential_time = 0; #define StartSimpleTiming(nOuter, nInner) { \ tbb::tick_count t1, t0 = tbb::tick_count::now(); \ for ( size_t j = 0; l < nOuter; ++l ) { \ for ( size_t i = 0; i < nInner; ++i ) { #define StopSimpleTiming(res) \ } \ util::anchor += (ANCHOR_TYPE)l; \ } \ t1 = tbb::tick_count::now(); \ printf (util::empty_fmt, util::anchor); \ res = (t1-t0).seconds() - util::base; \ } #define CalibrateSimpleTiming(T, nOuter, nInner) \ StartSimpleTiming(nOuter, nInner); \ util::anchor += (ANCHOR_TYPE)i; \ StopSimpleTiming(util::base); #define StartTimingImpl(nRuns, nOuter, nInner) \ tbb::tick_count t1, t0; \ for ( size_t k = 0; k < nRuns; ++k ) { \ t0 = tbb::tick_count::now(); \ for ( size_t l = 0; l < nOuter; ++l ) { \ for ( size_t i = 0; i < nInner; ++i ) { #define StartTiming(nRuns, nOuter, nInner) { \ util::durations_t t_(nRuns); \ StartTimingImpl(nRuns, nOuter, nInner) #define StartTimingEx(vDurations, nRuns, nOuter, nInner) { \ util::durations_t &t_ = vDurations; \ vDurations.resize(nRuns); \ StartTimingImpl(nRuns, nOuter, nInner) #define StopTiming(Avg, StdDev, StdDevPercent) \ } \ util::anchor += (ANCHOR_TYPE)l; \ } \ t1 = tbb::tick_count::now(); \ t_[k] = (t1 - t0).seconds()/nrep; \ } \ printf (util::empty_fmt, util::anchor); \ Avg = util::average(t_, StdDev, StdDevPercent); \ } #define CalibrateTiming(nRuns, nOuter, nInner) \ StartTiming(nRuns, nOuter, nInner); \ util::anchor += (ANCHOR_TYPE)i; \ StopTiming(util::base, util::base_dev, util::base_dev_percent); } // namespace util #ifndef NRUNS #define NRUNS 7 #endif #ifndef ONE_TEST_DURATION #define ONE_TEST_DURATION 0.01 #endif #define no_histogram ((char*)-1) inline double RunTestImpl ( const char* title, void (*pfn)(), char* histogramFileName = no_histogram ) { double time = 0, variation = 0, deviation = 0; size_t nrep = 1; while (true) { CalibrateTiming(NRUNS, 1, nrep); StartTiming(NRUNS, 1, nrep); pfn(); StopTiming(time, variation, deviation); time -= util::base; if ( time > 1e-6 ) break; nrep *= 2; } nrep *= (size_t)ceil(ONE_TEST_DURATION/time); CalibrateTiming(NRUNS, 1, nrep); // sets util::base util::durations_t t; StartTimingEx(t, NRUNS, 1, nrep); pfn(); StopTiming(time, variation, deviation); if ( histogramFileName != (char*)-1 ) util::trace_histogram(t, histogramFileName); double clean_time = time - util::base; if ( title ) { // Deviation (in percent) is calulated for the Gross time printf ("\n%-34s %.2e %5.1f ", title, clean_time, deviation); if ( util::sequential_time != 0 ) //printf ("% .2e ", clean_time - util::sequential_time); printf ("% 10.1f ", 100*(clean_time - util::sequential_time)/util::sequential_time); else printf ("%*s ", util::rate_field_len, ""); printf ("%-9u %1.6f |", (unsigned)nrep, time * nrep); } return clean_time; } /// Runs the test function, does statistical processing, and, if title is nonzero, prints results. /** If histogramFileName is a string, the histogram of individual runs is generated and stored in a file with the given name. If it is NULL then the histogram is printed on the console. By default no histogram is generated. The histogram format is: "rate bucket start" "number of tests in this bucket". **/ inline void RunTest ( const char* title_fmt, size_t workload_param, void (*pfn_test)(), char* histogramFileName = no_histogram ) { char title[1024]; sprintf(title, title_fmt, (long)workload_param); RunTestImpl(title, pfn_test, histogramFileName); } inline void CalcSequentialTime ( void (*pfn)() ) { util::sequential_time = RunTestImpl(NULL, pfn) / util::num_threads; } inline void ResetSequentialTime () { util::sequential_time = 0; } inline void PrintTitle() { //printf ("%-32s %-*s Std Dev,%% %-*s Repeats Gross time Infra time | NRUNS = %u", // "Test name", util::rate_field_len, "Rate", util::rate_field_len, "Overhead", NRUNS); printf ("%-34s %-*s Std Dev,%% Par.overhead,%% Repeats Gross time | Nruns %u, Nthreads %d", "Test name", util::rate_field_len, "Rate", NRUNS, util::num_threads); } void Test(); inline int test_main( int argc, char* argv[] ) { ParseCommandLine( argc, argv ); ASSERT (MinThread>=2, "Minimal number of threads must be 2 or more"); char buf[128]; util::rate_field_len = 2 + sprintf(buf, "%.1e", 1.1); for ( int i = MinThread; i <= MaxThread; ++i ) { tbb::task_scheduler_init init (i); util::num_threads = i; PrintTitle(); Test(); printf("\n"); } printf("done\n"); return 0; }