diff --git a/dep/tbb/Makefile b/dep/tbb/Makefile
new file mode 100644
index 000000000..ceac272c3
--- /dev/null
+++ b/dep/tbb/Makefile
@@ -0,0 +1,85 @@
+# Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+#
+# This file is part of Threading Building Blocks.
+#
+# Threading Building Blocks is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License
+# version 2 as published by the Free Software Foundation.
+#
+# Threading Building Blocks is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Threading Building Blocks; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+# As a special exception, you may use this file as part of a free software
+# library without restriction.  Specifically, if other files instantiate
+# templates or use macros or inline functions from this file, or you compile
+# this file and link it with other files to produce an executable, this
+# file does not by itself cause the resulting executable to be covered by
+# the GNU General Public License.  This exception does not however
+# invalidate any other reasons why the executable file might be covered by
+# the GNU General Public License.
+
+tbb_root?=.
+include $(tbb_root)/build/common.inc
+.PHONY: default all tbb tbbmalloc test rml examples  # command targets, not files: always run their recipes
+
+# Workaround: tbb and tbbmalloc do not depend on each other, yet both depend on version_string.tmp.
+# According to the documentation their sub-makes would otherwise run in parallel, so serialize them.
+.NOTPARALLEL: tbb tbbmalloc
+
+default: tbb tbbmalloc  # the two libraries only
+
+all: tbb tbbmalloc test examples  # the two libraries plus the tests and the examples
+
+tbb: mkdir  # run build/Makefile.tbb inside the debug work directory, then inside the release one
+	$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.tbb cfg=debug tbb_root=$(tbb_root)
+	$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.tbb cfg=release tbb_root=$(tbb_root)
+
+tbbmalloc: mkdir  # run the "malloc" goal of build/Makefile.tbbmalloc for debug, then for release
+	$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.tbbmalloc cfg=debug malloc tbb_root=$(tbb_root)
+	$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.tbbmalloc cfg=release malloc tbb_root=$(tbb_root)
+
+test: tbb tbbmalloc  # malloc_test and build/Makefile.test per configuration; the "-" prefixes make a failing sub-make non-fatal
+	-$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.tbbmalloc cfg=debug malloc_test tbb_root=$(tbb_root)
+	-$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.test cfg=debug tbb_root=$(tbb_root)
+	-$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.tbbmalloc cfg=release malloc_test tbb_root=$(tbb_root)
+	-$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.test cfg=release tbb_root=$(tbb_root) 
+
+rml: mkdir  # run build/Makefile.rml inside the debug work directory, then inside the release one
+	$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.rml cfg=debug tbb_root=$(tbb_root)
+	$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.rml cfg=release tbb_root=$(tbb_root)
+
+
+examples: tbb tbbmalloc  # run the "release" and "test" goals of examples/Makefile (tbb_root is relative to examples/)
+	$(MAKE) -C examples -r -f Makefile tbb_root=.. release test
+
+.PHONY: clean clean_examples mkdir info
+
+clean: clean_examples  # delete *.* in both work directories, then the directories; every command goes through make's shell function with output sent to NUL
+	$(shell $(RM) $(work_dir)_release$(SLASH)*.* >$(NUL) 2>$(NUL))
+	$(shell $(RD) $(work_dir)_release >$(NUL) 2>$(NUL))
+	$(shell $(RM) $(work_dir)_debug$(SLASH)*.* >$(NUL) 2>$(NUL))
+	$(shell $(RD) $(work_dir)_debug >$(NUL) 2>$(NUL))
+	@echo clean done
+
+clean_examples:  # run the "clean" goal of examples/Makefile silently (-s), ignoring errors (-i), output discarded
+	$(shell $(MAKE) -s -i -r -C examples -f Makefile tbb_root=.. clean >$(NUL) 2>$(NUL))
+
+mkdir:  # create both work directories; when origin_build_dir contains nothing besides "undefined", also run MAKE_TBBVARS inside each
+	$(shell $(MD) "$(work_dir)_release" >$(NUL) 2>$(NUL))
+	$(if $(subst undefined,,$(origin_build_dir)),,cd "$(work_dir)_release" && $(MAKE_TBBVARS) $(tbb_build_prefix)_release)
+	$(shell $(MD) "$(work_dir)_debug" >$(NUL) 2>$(NUL))
+	$(if $(subst undefined,,$(origin_build_dir)),,cd "$(work_dir)_debug" && $(MAKE_TBBVARS) $(tbb_build_prefix)_debug)
+
+info:  # print the detected build settings: tbb_os, arch, compiler, runtime and tbb_build_prefix
+	@echo OS: $(tbb_os)
+	@echo arch=$(arch)
+	@echo compiler=$(compiler)
+	@echo runtime=$(runtime)
+	@echo tbb_build_prefix=$(tbb_build_prefix)
+
diff --git a/dep/tbb/Makefile.am b/dep/tbb/Makefile.am
deleted file mode 100644
index d85e69268..000000000
--- a/dep/tbb/Makefile.am
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-#
-# This file is part of Threading Building Blocks.
-#
-# Threading Building Blocks is free software; you can redistribute it
-# and/or modify it under the terms of the GNU General Public License
-# version 2 as published by the Free Software Foundation.
-#
-# Threading Building Blocks is distributed in the hope that it will be
-# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Threading Building Blocks; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-#
-# As a special exception, you may use this file as part of a free software
-# library without restriction.  Specifically, if other files instantiate
-# templates or use macros or inline functions from this file, or you compile
-# this file and link it with other files to produce an executable, this
-# file does not by itself cause the resulting executable to be covered by
-# the GNU General Public License.  This exception does not however
-# invalidate any other reasons why the executable file might be covered by
-# the GNU General Public License.
-
-tbb_root = $(srcdir)
-
-include $(tbb_root)/build/common.inc
-
-# change these
-override work_dir = $(CWD)
-export work_dir
-override tbb_root = $(srcdir)
-export work_dir
-
-.PHONY: all tbb tbbmalloc
-
-#workaround for non-depend targets tbb and tbbmalloc which both depend on version_string.tmp
-#According to documentation submakes should run in parallel
-.NOTPARALLEL: tbb tbbmalloc
-
-all: tbb tbbmalloc
-
-tbb:
-	$(MAKE) -r -f $(tbb_root)/build/Makefile.tbb cfg=release tbb_root=$(tbb_root)
-
-tbbmalloc:
-	$(MAKE) -r -f $(tbb_root)/build/Makefile.tbbmalloc cfg=release malloc tbb_root=$(tbb_root)
-
-install-exec-local:
-	$(INSTALL) $(work_dir)/lib*.so* $(DESTDIR)$(libdir)
-
-clean-local:
-	-rm -f *.d *.o
-	-rm -f lib*.so*
-	-rm -f *.def *.tmp tbbvars.*
-
diff --git a/dep/tbb/build/version_info_winlrb.js b/dep/tbb/build/version_info_winlrb.js
deleted file mode 100644
index 67f2a2920..000000000
--- a/dep/tbb/build/version_info_winlrb.js
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-//
-// This file is part of Threading Building Blocks.
-//
-// Threading Building Blocks is free software; you can redistribute it
-// and/or modify it under the terms of the GNU General Public License
-// version 2 as published by the Free Software Foundation.
-//
-// Threading Building Blocks is distributed in the hope that it will be
-// useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with Threading Building Blocks; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-//
-// As a special exception, you may use this file as part of a free software
-// library without restriction.  Specifically, if other files instantiate
-// templates or use macros or inline functions from this file, or you compile
-// this file and link it with other files to produce an executable, this
-// file does not by itself cause the resulting executable to be covered by
-// the GNU General Public License.  This exception does not however
-// invalidate any other reasons why the executable file might be covered by
-// the GNU General Public License.
-
-var WshShell = WScript.CreateObject("WScript.Shell");
-
-var tmpExec;
-
-WScript.Echo("#define __TBB_VERSION_STRINGS \\");
-
-//Getting BUILD_HOST
-WScript.echo( "\"TBB: BUILD_HOST\\t\\t" + 
-			  WshShell.ExpandEnvironmentStrings("%COMPUTERNAME%") +
-			  "\" ENDL \\" );
-
-//Getting BUILD_OS
-tmpExec = WshShell.Exec("cmd /c ver");
-while ( tmpExec.Status == 0 ) {
-	WScript.Sleep(100);
-}
-tmpExec.StdOut.ReadLine();
-
-WScript.echo( "\"TBB: BUILD_OS\\t\\t" + 
-			  tmpExec.StdOut.ReadLine() +
-			  "\" ENDL \\" );
-
-var Unknown = "Unknown";
-
-WScript.echo( "\"TBB: BUILD_KERNEL\\t" + 
-              Unknown +
-              "\" ENDL \\" );
-
-//Getting BUILD_COMPILER
-tmpExec = WshShell.Exec("icc --version");
-while ( tmpExec.Status == 0 ) {
-	WScript.Sleep(100);
-}
-var ccVersion = tmpExec.StdErr.ReadLine();
-WScript.echo( "\"TBB: BUILD_GCC\\t" + 
-              ccVersion +
-              "\" ENDL \\" );
-WScript.echo( "\"TBB: BUILD_COMPILER\\t" + 
-              ccVersion +
-              "\" ENDL \\" );
-
-WScript.echo( "\"TBB: BUILD_GLIBC\\t" + 
-              Unknown +
-              "\" ENDL \\" );
-
-WScript.echo( "\"TBB: BUILD_LD\\t" + 
-              Unknown +
-              "\" ENDL \\" );
-
-//Getting BUILD_TARGET
-WScript.echo( "\"TBB: BUILD_TARGET\\t" + 
-			  WScript.Arguments(1) + 
-			  "\" ENDL \\" );
-
-//Getting BUILD_COMMAND
-WScript.echo( "\"TBB: BUILD_COMMAND\\t" + WScript.Arguments(2) + "\" ENDL" );
-
-//Getting __TBB_DATETIME and __TBB_VERSION_YMD
-var date = new Date();
-WScript.echo( "#define __TBB_DATETIME \"" + date.toUTCString() + "\"" );
-WScript.echo( "#define __TBB_VERSION_YMD " + date.getUTCFullYear() + ", " + 
-			  (date.getUTCMonth() > 8 ? (date.getUTCMonth()+1):("0"+(date.getUTCMonth()+1))) + 
-			  (date.getUTCDate() > 9 ? date.getUTCDate():("0"+date.getUTCDate())) );
-
-
diff --git a/dep/tbb/build/winlrb.cl.inc b/dep/tbb/build/winlrb.cl.inc
deleted file mode 100644
index 618dba5bf..000000000
--- a/dep/tbb/build/winlrb.cl.inc
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-#
-# This file is part of Threading Building Blocks.
-#
-# Threading Building Blocks is free software; you can redistribute it
-# and/or modify it under the terms of the GNU General Public License
-# version 2 as published by the Free Software Foundation.
-#
-# Threading Building Blocks is distributed in the hope that it will be
-# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Threading Building Blocks; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-#
-# As a special exception, you may use this file as part of a free software
-# library without restriction.  Specifically, if other files instantiate
-# templates or use macros or inline functions from this file, or you compile
-# this file and link it with other files to produce an executable, this
-# file does not by itself cause the resulting executable to be covered by
-# the GNU General Public License.  This exception does not however
-# invalidate any other reasons why the executable file might be covered by
-# the GNU General Public License.
-
-include $(tbb_root)/build/windows.cl.inc
-
-ifeq ($(cfg), debug)
-    CFG_LETTER = d
-else
-    CFG_LETTER = r
-endif
-
-_CPLUS_FLAGS_HOST := $(CPLUS_FLAGS) /I$(LRB_INC_DIR) $(LINK_FLAGS) /LIBPATH:$(LRB_LIB_DIR) xn_host$(LRB_HOST_ARCH)$(CFG_LETTER).lib
-
-TEST_EXT = dll
-CPLUS_FLAGS += /I$(LRB_INC_DIR) /D__LRB__
-LIB_LINK_FLAGS += /LIBPATH:$(LRB_LIB_DIR) xn_lrb$(LRB_HOST_ARCH)$(CFG_LETTER).lib
-LINK_FLAGS = $(LIB_LINK_FLAGS)
-OPENMP_FLAG =
-
-ifdef TEST_RESOURCE
-LINK_FLAGS += $(TEST_RESOURCE)
-
-TEST_LAUNCHER_NAME = harness_lrb_host
-AUX_TEST_DEPENDENCIES = $(TEST_LAUNCHER_NAME).exe
-
-$(TEST_LAUNCHER_NAME).exe: $(TEST_LAUNCHER_NAME).cpp
-	cl /Fe$@ $< $(_CPLUS_FLAGS_HOST)
-
-NO_LEGACY_TESTS = 1
-NO_C_TESTS = 1
-TEST_LAUNCHER=
-endif # TEST_RESOURCE
-
-#test_model_plugin.%:
-#	@echo test_model_plugin is not supported for LRB architecture so far
-
-ifeq ($(BUILDING_PHASE),0)  # examples
-    export RM = del /Q /F
-    export LIBS = -shared -lthr -z muldefs -L$(work_dir)_debug -L$(work_dir)_release
-    export UI = con
-    export x64 = 64
-    export CXXFLAGS = -xR -I..\..\..\include
-endif # examples
diff --git a/dep/tbb/build/winlrb.icc.inc b/dep/tbb/build/winlrb.icc.inc
deleted file mode 100644
index 427d06c9d..000000000
--- a/dep/tbb/build/winlrb.icc.inc
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-#
-# This file is part of Threading Building Blocks.
-#
-# Threading Building Blocks is free software; you can redistribute it
-# and/or modify it under the terms of the GNU General Public License
-# version 2 as published by the Free Software Foundation.
-#
-# Threading Building Blocks is distributed in the hope that it will be
-# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Threading Building Blocks; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-#
-# As a special exception, you may use this file as part of a free software
-# library without restriction.  Specifically, if other files instantiate
-# templates or use macros or inline functions from this file, or you compile
-# this file and link it with other files to produce an executable, this
-# file does not by itself cause the resulting executable to be covered by
-# the GNU General Public License.  This exception does not however
-# invalidate any other reasons why the executable file might be covered by
-# the GNU General Public License.
-
-
-include $(tbb_root)/build/winlrb.cl.inc
-
-TEST_EXT = so
-.PRECIOUS: %.$(TEST_EXT)
-
-include $(tbb_root)/build/freebsd.gcc.inc
-
-WARNING_KEY = -w1
-CPLUS = icpc
-CONLY = icc
-#LIBS = -u _read -lcprts -lthr -lc
-#LIBS = -lthr
-LIBS = -u _read -lcprts -lthr -limf -lc
-LINK_FLAGS = -L$(LRB_LIB_DIR) $(DYLIB_KEY) -lxn$(XN_VER)_lrb64$(CFG_LETTER)
-CPLUS_FLAGS += -xR $(PIC_KEY) -I$(LRB_INC_DIR) -DXENSIM
-C_FLAGS = $(CPLUS_FLAGS)
-LIB_LINK_FLAGS = $(LINK_FLAGS)
-
-ifeq ($(cfg), release)
-    # workaround for LRB compiler issues
-    CPLUS_FLAGS := $(subst -O2,-O0, $(CPLUS_FLAGS))
-endif
diff --git a/dep/tbb/build/winlrb.inc b/dep/tbb/build/winlrb.inc
deleted file mode 100644
index f72c66fde..000000000
--- a/dep/tbb/build/winlrb.inc
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-#
-# This file is part of Threading Building Blocks.
-#
-# Threading Building Blocks is free software; you can redistribute it
-# and/or modify it under the terms of the GNU General Public License
-# version 2 as published by the Free Software Foundation.
-#
-# Threading Building Blocks is distributed in the hope that it will be
-# useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Threading Building Blocks; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-#
-# As a special exception, you may use this file as part of a free software
-# library without restriction.  Specifically, if other files instantiate
-# templates or use macros or inline functions from this file, or you compile
-# this file and link it with other files to produce an executable, this
-# file does not by itself cause the resulting executable to be covered by
-# the GNU General Public License.  This exception does not however
-# invalidate any other reasons why the executable file might be covered by
-# the GNU General Public License.
-
-ifndef XN_VER
-export LRBSDK = $(LARRABEE_CORE_LATEST)
-export LRB_LIB_DIR = "$(LRBSDK)lib"
-export LRB_INC_DIR = "$(LRBSDK)include"
-
-# Function $(wildcard pattern) does not work with paths containing spaces!
-_lrb_lib = $(shell cmd /C "dir /B "$(LRBSDK)lib\libxn*_lrb64d.so")
-export XN_VER = $(patsubst libxn%_lrb64d.so,%,$(_lrb_lib))
-
-ifeq (1,$(NETSIM_LRB_32_OVERRIDE))
-    export LRB_HOST_ARCH = 32
-else
-    export LRB_HOST_ARCH = 64
-endif
-
-export run_cmd = harness_lrb_host.exe
-
-export UI = con
-
-endif #XN_VER
-
-include $(tbb_root)/build/windows.inc
-
-ifneq (1,$(netsim))
-# Target environment is native LRB or LrbFSim
-
-export compiler = icc
-export arch := lrb
-
-target_machine = $(subst -,_,$(shell icpc -dumpmachine))
-runtime = $(subst _lrb_,_,$(target_machine))
-# -dumpmachine option does not work in R9 Core SDK 5
-ifeq ($(runtime),)
-    runtime = x86_64_freebsd
-endif
-export runtime:=$(runtime)_xn$(XN_VER)
-
-OBJ = o
-DLL = so
-LIBEXT = so
-
-TBB.DEF =
-TBB.DLL = libtbb$(DEBUG_SUFFIX).$(DLL)
-TBB.LIB = $(TBB.DLL)
-LINK_TBB.LIB = $(TBB.DLL)
-TBB.RES =
-
-MALLOC.DEF :=
-MALLOC.DLL = libtbbmalloc$(DEBUG_SUFFIX).$(DLL)
-MALLOC.LIB = $(MALLOC.DLL)
-MALLOC.RES = 
-
-MAKE_VERSIONS = cmd /C cscript /nologo /E:jscript $(subst \,/,$(tbb_root))/build/version_info_winlrb.js $(compiler) $(arch) $(subst \,/,"$(CPLUS) $(CPLUS_FLAGS) $(INCLUDES)") > version_string.tmp
-MAKE_TBBVARS  = cmd /C "$(subst /,\,$(tbb_root))\build\generate_tbbvars.bat"
-
-ifneq (1,$(XENSIM_ENABLED))
-    export run_cmd = rem
-endif
-
-TBB_NOSTRICT = 1
-
-endif # lrbfsim
diff --git a/dep/tbb/src/perf/fibonacci_cutoff.cpp b/dep/tbb/src/perf/fibonacci_cutoff.cpp
new file mode 100644
index 000000000..2f2f710af
--- /dev/null
+++ b/dep/tbb/src/perf/fibonacci_cutoff.cpp
@@ -0,0 +1,134 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstdio>
+#include <cstdlib>
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/task.h"
+#include "tbb/tick_count.h"
+
+long CutOff = 1;  // serial/parallel threshold; find_cutoff() rewrites it before every probe and the Fibonacci implementation reads it
+
+long SerialFib( const long n );
+
+long ParallelFib( const long n ); 
+
+inline void dump_title() {
+    fputs("Serial/Parallel, P, N, cutoff, repetitions, time, fib, speedup\n", stdout);  // header row for the lines printed by output()
+}
+
+inline void output(int P, long n, long c, int T, double serial_elapsed, double elapsed, long result) {
+    printf("%s, %d, %ld, %ld, %d, %g, %ld, %g\n", (P != 0 ? "Parallel" : "Serial"), P, n, c, T, elapsed, result, serial_elapsed / elapsed);  // P == 0 marks the serial row; last column is the speedup
+}
+
+#define MOVE_BY_FOURTHS 1  // non-zero: probe a quarter of the way from lo towards hi; zero: probe the midpoint
+inline long calculate_new_cutoff(const long lo, const long hi) {
+#if !MOVE_BY_FOURTHS
+    return (hi + lo)/2;
+#else
+    return lo + (3 + hi - lo ) / 4;  // the +3 rounds the quarter step up
+#endif
+}
+
+// Probes cutoffs between 1 and n for the one where parallel speedup reaches P/2 (50% efficiency); prints each probe and a summary.
+void find_cutoff(const int P, const long n, const int T, const double serial_elapsed) {
+    long lo = 1, hi = n;
+    double elapsed = 0, lo_elapsed = 0, hi_elapsed = 0;   // 0 = that bound has not been timed yet
+    long final_cutoff = -1;
+
+    tbb::task_scheduler_init init(P);
+
+    while(true) {
+       CutOff = calculate_new_cutoff(lo, hi);
+       long result = 0;
+       tbb::tick_count t0;
+       for (int t = -1; t < T; ++t) {   // iteration t == -1 runs before the timer starts
+           if (t == 0) t0 = tbb::tick_count::now();
+           result += ParallelFib(n);
+       }
+       elapsed = (tbb::tick_count::now() - t0).seconds();
+       output(P,n,CutOff,T,serial_elapsed,elapsed,result);
+
+       if (serial_elapsed / elapsed >= P/2.0) {
+           final_cutoff = CutOff;
+           if (hi == CutOff) {
+               // Done when the range has converged, or when lo is already 1 (restarting from 1 would repeat this exact probe).
+               if (hi != lo || lo == 1) break;
+               // we have had this value at both above and below 50%
+               lo = 1; lo_elapsed = 0;
+           }
+           hi = CutOff;
+           hi_elapsed = elapsed;
+       } else {
+           if (lo == CutOff) break;
+           lo = CutOff;
+           lo_elapsed = elapsed;
+       }
+    }
+
+    // Interpolate only between two bounds that were really timed; otherwise report the measured cutoff itself.
+    double interpolated_cutoff = final_cutoff;
+    if (lo_elapsed > 0 && hi_elapsed > 0 && lo_elapsed != hi_elapsed)
+        interpolated_cutoff = lo + ( P/2.0 - serial_elapsed/lo_elapsed ) * ( (hi - lo) / ( serial_elapsed/hi_elapsed - serial_elapsed/lo_elapsed ));
+
+    if (final_cutoff != -1) {
+        printf("50%% efficiency cutoff is %ld ( linearly interpolated cutoff is %g )\n", final_cutoff, interpolated_cutoff);
+    } else {
+        printf("Cannot achieve 50%% efficiency\n");
+    }
+}
+
+int main(int argc, char *argv[]) {
+    // threads and repetitions must be positive: T < 1 never starts the timers, P < 1 breaks the P/2 speedup threshold
+    if (argc < 4 || atoi(argv[1]) < 1 || atoi(argv[3]) < 1) {
+        printf("Usage: %s threads n repetitions\n",argv[0]);
+        return 1;
+    }
+
+    dump_title();
+
+    int P = atoi(argv[1]);
+    long n = atol(argv[2]);
+    int T = atoi(argv[3]);
+
+    // collect serial time (the t == -1 iteration runs before the timer starts)
+    long serial_result = 0;
+    tbb::tick_count t0;
+    for (int t = -1; t < T; ++t) {
+        if (t == 0) t0 = tbb::tick_count::now();
+        serial_result += SerialFib(n);
+    }
+    double serial_elapsed = (tbb::tick_count::now() - t0).seconds();
+    output(0,n,0,T,serial_elapsed,serial_elapsed,serial_result);
+
+    // perform search
+    find_cutoff(P,n,T,serial_elapsed);
+    return 0;
+}
+
diff --git a/dep/tbb/src/perf/fibonacci_impl_tbb.cpp b/dep/tbb/src/perf/fibonacci_impl_tbb.cpp
new file mode 100644
index 000000000..66c6f24a0
--- /dev/null
+++ b/dep/tbb/src/perf/fibonacci_impl_tbb.cpp
@@ -0,0 +1,86 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstdio>
+#include <cstdlib>
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/task.h"
+#include "tbb/tick_count.h"
+
+extern long CutOff;
+
+long SerialFib( const long n ) {
+    // Plain doubly-recursive Fibonacci; any n below 2 is returned unchanged.
+    if( n>=2 )
+        return SerialFib(n-1)+SerialFib(n-2);
+    return n;
+}
+
+struct FibContinuation: public tbb::task {   // continuation task: stores x+y into *sum when executed
+    long* const sum;   // destination of the combined result
+    long x, y;         // partial results; FibTask::execute hands out &x and &y to the two child tasks
+    FibContinuation( long* sum_ ) : sum(sum_) {}
+    tbb::task* execute() {
+        *sum = x+y;
+        return NULL;
+    }
+};
+
+struct FibTask: public tbb::task {   // task that computes Fibonacci(n) and stores it through sum
+    long n;
+    long * sum;
+    FibTask( const long n_, long * const sum_ ) :
+        n(n_), sum(sum_)
+    {}
+    tbb::task* execute() {
+        if( n<CutOff ) {   // below the cutoff: compute serially and create no further tasks
+            *sum = SerialFib(n);
+            return NULL;
+        } else {
+            FibContinuation& c = 
+                *new( allocate_continuation() ) FibContinuation(sum);   // c will write x+y to this task's original destination
+            FibTask& b = *new( c.allocate_child() ) FibTask(n-1,&c.y);   // new child of c for n-1, result goes to c.y
+            recycle_as_child_of(c);   // reuse this task object as a child of c ...
+            n -= 2;                   // ... now responsible for n-2 ...
+            sum = &c.x;               // ... with its result going to c.x
+            // Set ref_count to "two children".
+            c.set_ref_count(2);
+            c.spawn( b );
+            return this;   // NOTE(review): presumably the scheduler runs the returned (recycled) task next - confirm against the TBB task API
+        }
+    }
+};
+
+long ParallelFib( const long n ) {
+    long result;   // written through the pointer handed to the root task
+    FibTask& root = *new(tbb::task::allocate_root()) FibTask(n,&result);
+    tbb::task::spawn_root_and_wait(root);
+    return result;
+}
+
diff --git a/dep/tbb/src/perf/perf_util.h b/dep/tbb/src/perf/perf_util.h
new file mode 100644
index 000000000..812f5d6c8
--- /dev/null
+++ b/dep/tbb/src/perf/perf_util.h
@@ -0,0 +1,292 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/tick_count.h"
+#include <cmath>
+#include <cstdlib>
+#include <cerrno>
+#include <cfloat>
+#include <vector>
+#include <algorithm>
+
+#include "../src/test/harness.h"
+
+#if  __linux__ || __APPLE__ || __FreeBSD__
+    #include <sys/resource.h>
+#endif /* __APPLE__ */
+
+// The code, performance of which is to be measured, is surrounded by the StartSimpleTiming
+// and StopSimpleTiming macros. It is called "target code" or "code of interest" hereafter.
+//
+// The target code is executed inside the nested loop. Nesting is necessary to allow
+// measurements on arrays that fit cache of a particular level, while making the load
+// big enough to eliminate the influence of random deviations.
+//
+// The timing macros rely on the reduction variable "util::anchor", which may be modified (usually
+// by adding to it) by the target code. This can be necessary to prevent optimizing compilers
+// from throwing out the code of interest. Besides, if the target code is complex enough,
+// make sure that all its branches contribute (directly or indirectly) to the value
+// being added to the "util::anchor" variable.
+//
+// To factor out the overhead introduced by the measurement infrastructure it is recommended to make
+// a calibration run with the target code replaced by a no-op (but still modifying "util::anchor"),
+// and to store the resulting time in the "util::base" variable.
+//
+// A generally good approach is to make the target code use elements of a pre-initialized
+// array. Then for the calibration run you just need to add the array elements
+// to the "util::anchor" variable. To get rid of memory access delays make the array small
+// enough to fit L2 or L1 cache (play with StartSimpleTiming arguments if necessary).
+//
+// Macro CalibrateSimpleTiming performs default calibration using "util::anchor += i;" operation.
+//
+// Macro ANCHOR_TYPE defines the type of the reduction variable. If it was not 
+// defined  before including this header, it is defined as size_t. Depending on 
+// the target code modern super scalar architectures may blend reduction operation
+// and instructions of interest differently for different target alternatives. So
+// you may play with the type to minimize out-of-order and parallel execution impact
+// on the calibration time veracity. You may even end up with different reduction 
+// variable types (and different calibration times) for different measurements.
+
+
+namespace util {
+
+typedef std::vector<double>    durations_t;
+
+    // Writes a 100-bucket histogram of t to the named file, or to stdout when histogramFileName is NULL.
+    void trace_histogram ( const durations_t& t, char* histogramFileName ) {
+        FILE* f = histogramFileName ? fopen(histogramFileName, "wt") : stdout;
+        if ( !f ) return;                       // the histogram file could not be opened
+        const size_t n = t.size(), num_buckets = 100;
+        double  min_val = n ? *std::min_element(t.begin(), t.end()) : 0,   // empty input: nothing to dereference
+                max_val = n ? *std::max_element(t.begin(), t.end()) : 0,
+                bucket_size = (max_val - min_val) / num_buckets;
+        std::vector<size_t> hist(num_buckets + 1, 0);   // the extra bucket receives the maximum sample
+        for ( size_t i = 0; i < n; ++i )        // equal samples give bucket_size == 0: count them all in bucket 0
+            ++hist[bucket_size > 0 ? size_t((t[i]-min_val)/bucket_size) : 0];
+        fprintf (f, "Histogram: nvals = %u, min = %g, max = %g, nbuckets = %u\n", (unsigned)n, min_val, max_val, (unsigned)num_buckets);
+        double bucket = min_val;
+        for ( size_t i = 0; i <= num_buckets; ++i, bucket+=bucket_size )
+            fprintf (f, "%12g\t%u\n", bucket, (unsigned)hist[i]);
+        if ( f != stdout ) fclose(f);           // stdout belongs to the whole program; close only our own file
+    }
+
+    // Mean of d; with more than 5 samples one minimum and one maximum are dropped first.
+    // Outputs: variation_percent = 100*(max-min)/mean, std_dev_percent = 100*sigma/mean. Empty input yields zeros.
+    double average ( const durations_t& d, double& variation_percent, double& std_dev_percent ) {
+        if ( d.empty() ) {  // no samples: min_element() would return end() and n would be 0
+            variation_percent = std_dev_percent = 0;
+            return 0;
+        }
+        durations_t t = d;
+        if ( t.size() > 5 ) {
+            t.erase(std::min_element(t.begin(), t.end()));
+            t.erase(std::max_element(t.begin(), t.end()));
+        }
+        const size_t n = t.size();
+        double  sum = 0, std_dev = 0,
+                min_val = *std::min_element(t.begin(), t.end()),
+                max_val = *std::max_element(t.begin(), t.end());
+        for ( size_t i = 0; i < n; ++i )
+            sum += t[i];
+        const double avg = sum / n;
+        for ( size_t i = 0; i < n; ++i )
+            std_dev += (t[i] - avg) * (t[i] - avg);
+        std_dev_percent = sqrt(std_dev / n) / avg * 100;
+        variation_percent = 100 * (max_val - min_val) / avg;
+        return avg;
+    }
+
+    static int num_threads;  // thread count of the current pass; assigned in test_main() before each Test() call
+    // Calibration outputs: the Calibrate* macros store the measured overhead in base; CalibrateTiming also fills the other two.
+    static double   base = 0,
+                    base_dev = 0,
+                    base_dev_percent = 0;
+    // Empty format: printf(empty_fmt, anchor) prints nothing. NOTE(review): a literal bound to non-const char* is deprecated.
+    static char *empty_fmt = "";
+    static int rate_field_len = 11;  // width of the "Rate" column; recomputed in test_main()
+    // Type of the reduction variable; the including file may define it beforehand.
+#if !defined(ANCHOR_TYPE)
+    #define ANCHOR_TYPE size_t
+#endif
+    // Reduction variable that the timing macros add loop indices to.
+    static ANCHOR_TYPE anchor = 0;
+    // Set by CalcSequentialTime(); zero makes RunTestImpl print a blank field instead of the overhead percentage.
+    static double sequential_time = 0;
+
+// Opens a timed nOuter x nInner loop nest (indices l and i); it must be closed by StopSimpleTiming.
+#define StartSimpleTiming(nOuter, nInner) {             \
+    tbb::tick_count t1, t0 = tbb::tick_count::now();    \
+    for ( size_t l = 0; l < nOuter; ++l ) {             \
+        for ( size_t i = 0; i < nInner; ++i ) {
+// Closes the loop nest and stores the elapsed seconds minus util::base in res.
+#define StopSimpleTiming(res)                   \
+        }                                       \
+        util::anchor += (ANCHOR_TYPE)l;         \
+    }                                           \
+    t1 = tbb::tick_count::now();                \
+    printf (util::empty_fmt, util::anchor);     \
+    res = (t1-t0).seconds() - util::base;       \
+}
+// Default calibration: times "util::anchor += i" into util::base, reset first so the old base is not subtracted (T is unused).
+#define CalibrateSimpleTiming(T, nOuter, nInner)    \
+    util::base = 0; StartSimpleTiming(nOuter, nInner); \
+        util::anchor += (ANCHOR_TYPE)i;             \
+    StopSimpleTiming(util::base);
+
+// Shared prologue: runs the nOuter x nInner loop nest nRuns times (indices k, l, i), taking a fresh t0 for each run.
+#define StartTimingImpl(nRuns, nOuter, nInner)      \
+    tbb::tick_count t1, t0;                         \
+    for ( size_t k = 0; k < nRuns; ++k )  {         \
+        t0 = tbb::tick_count::now();                \
+        for ( size_t l = 0; l < nOuter; ++l ) {     \
+            for ( size_t i = 0; i < nInner; ++i ) {
+// Opens a timed region whose per-run durations go into a local vector t_; it must be closed by StopTiming.
+#define StartTiming(nRuns, nOuter, nInner) {        \
+    util::durations_t  t_(nRuns);                   \
+    StartTimingImpl(nRuns, nOuter, nInner)
+// Like StartTiming, but the per-run durations are kept in the caller's vDurations (resized to nRuns).
+#define StartTimingEx(vDurations, nRuns, nOuter, nInner) {  \
+    util::durations_t  &t_ = vDurations;                    \
+    vDurations.resize(nRuns);                               \
+    StartTimingImpl(nRuns, nOuter, nInner)
+// Closes the region. NOTE(review): nrep is not a parameter (it must exist at the use site); StdDev gets average()'s variation_percent.
+#define StopTiming(Avg, StdDev, StdDevPercent)      \
+            }                                       \
+            util::anchor += (ANCHOR_TYPE)l;         \
+        }                                           \
+        t1 = tbb::tick_count::now();                \
+        t_[k] = (t1 - t0).seconds()/nrep;           \
+    }                                               \
+    printf (util::empty_fmt, util::anchor);         \
+    Avg = util::average(t_, StdDev, StdDevPercent); \
+}
+// Calibration run: times "util::anchor += i" and stores the results in util::base, util::base_dev, util::base_dev_percent.
+#define CalibrateTiming(nRuns, nOuter, nInner)      \
+    StartTiming(nRuns, nOuter, nInner);             \
+        util::anchor += (ANCHOR_TYPE)i;             \
+    StopTiming(util::base, util::base_dev, util::base_dev_percent);
+
+} // namespace util
+
+// Number of timed runs per measurement, as passed to the timing macros by RunTestImpl.
+#ifndef NRUNS
+    #define NRUNS               7
+#endif
+// Used by RunTestImpl to scale the repeat count: nrep *= ceil(ONE_TEST_DURATION/time).
+#ifndef ONE_TEST_DURATION
+    #define ONE_TEST_DURATION   0.01
+#endif
+// Sentinel file name meaning "do not generate a histogram" (NULL means "print it on the console").
+#define no_histogram  ((char*)-1)
+// Times pfn, prints one result row when title is non-NULL, and returns the averaged time minus util::base.
+inline 
+double RunTestImpl ( const char* title, void (*pfn)(), char* histogramFileName = no_histogram ) {
+    double  time = 0, variation = 0, deviation = 0;
+    size_t nrep = 1;  // repeat count; the StopTiming macro reads this variable by name
+    for (;;) {  // double nrep until the calibrated time of one measurement exceeds 1e-6
+        CalibrateTiming(NRUNS, 1, nrep);
+        StartTiming(NRUNS, 1, nrep);
+        pfn();
+        StopTiming(time, variation, deviation);
+        time -= util::base;
+        if ( time > 1e-6 )
+            break;
+        nrep *= 2;
+    }
+    nrep *= (size_t)ceil(ONE_TEST_DURATION/time);  // scale the repeat count by ONE_TEST_DURATION/time, rounded up
+    CalibrateTiming(NRUNS, 1, nrep);    // sets util::base
+    util::durations_t  t;  // per-run durations of the final measurement, kept for the histogram
+    StartTimingEx(t, NRUNS, 1, nrep);
+        pfn();
+    StopTiming(time, variation, deviation);
+    if ( histogramFileName != (char*)-1 )  // (char*)-1 is the no_histogram sentinel
+        util::trace_histogram(t, histogramFileName);
+    double clean_time = time - util::base;
+    if ( title ) {
+        // Deviation (in percent) is calculated for the Gross time
+        printf ("\n%-34s %.2e  %5.1f      ", title, clean_time, deviation);
+        if ( util::sequential_time != 0  )
+            //printf ("% .2e  ", clean_time - util::sequential_time);
+            printf ("% 10.1f      ", 100*(clean_time - util::sequential_time)/util::sequential_time);
+        else
+            printf ("%*s ", util::rate_field_len, "");  // no sequential time recorded: blank field
+        printf ("%-9u %1.6f    |", (unsigned)nrep, time * nrep);
+    }
+    return clean_time;
+}
+
+
+/// Formats the row title from title_fmt and workload_param, then measures pfn_test through RunTestImpl.
+/** title_fmt is used as a sprintf format that receives workload_param as a single long argument.
+    If histogramFileName is a string, the histogram of individual runs is stored in a file with that
+    name; if it is NULL the histogram is printed on the console; by default none is generated.
+    The histogram format is: "rate bucket start" "number of tests in this bucket". **/
+inline void RunTest ( const char* title_fmt, size_t workload_param, void (*pfn_test)(),
+                      char* histogramFileName = no_histogram ) {
+    char caption[1024];
+    sprintf(caption, title_fmt, (long)workload_param);
+    RunTestImpl(caption, pfn_test, histogramFileName);
+}
+
+inline void CalcSequentialTime ( void (*pfn)() ) {  // silent run (NULL title); the result is stored per thread
+    const double measured = RunTestImpl(NULL, pfn);
+    util::sequential_time = measured / util::num_threads;
+}
+
+// Clears the stored sequential time, so RunTestImpl prints a blank field instead of the overhead percentage.
+inline void ResetSequentialTime () {
+    util::sequential_time = 0.0;
+}
+
+
+// Prints the header line of the results table for the current thread count (no trailing newline).
+inline void PrintTitle() {
+    const char* const header_fmt =
+        "%-34s %-*s Std Dev,%%  Par.overhead,%%  Repeats   Gross time  | Nruns %u, Nthreads %d";
+    printf (header_fmt, "Test name", util::rate_field_len, "Rate", NRUNS, util::num_threads);
+}
+// Declared only; not defined in the visible part of this header. test_main() calls it once per thread count.
+void Test();
+
+// Harness entry point: calls Test() once for every thread count from MinThread to MaxThread.
+inline int test_main( int argc, char* argv[] ) {
+    MinThread = 1;   // both bounds are set before ParseCommandLine() runs
+    MaxThread = tbb::task_scheduler_init::default_num_threads();
+    ParseCommandLine( argc, argv );
+    char sample[128];
+    util::rate_field_len = 2 + sprintf(sample, "%.1e", 1.1);   // printed width of a "%.1e" value plus 2
+    for ( int nthreads = MinThread; nthreads <= MaxThread; ++nthreads ) {
+        tbb::task_scheduler_init scheduler (nthreads);   // scoped to a single pass of the loop
+        util::num_threads = nthreads;
+        PrintTitle();
+        Test();
+        printf("\n");
+    }
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/perf/statistics.cpp b/dep/tbb/src/perf/statistics.cpp
new file mode 100644
index 000000000..5edebb8ab
--- /dev/null
+++ b/dep/tbb/src/perf/statistics.cpp
@@ -0,0 +1,408 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "statistics.h"
+#include "statistics_xml.h"
+// Added to the column/row counts in Print(); presumably the three fixed Name/Threads/Mode cells - confirm.
+#define COUNT_PARAMETERS 3
+// Under _MSC_VER the calls to snprintf in this file are mapped to _snprintf.
+#ifdef _MSC_VER
+#define snprintf _snprintf
+#endif
+
+// Writes the current local time as "HH:MM:SS" into buff.
+// size_buff is handed to strftime() as the capacity of buff; the localtime() result is not checked.
+void GetTime(char* buff,int size_buff)
+{
+    const time_t now = time(NULL);
+    const tm *local = localtime(&now);
+    strftime(buff,size_buff,"%H:%M:%S",local);
+}
+
+// Writes today's local date as "YYYY-MM-DD" into buff.
+// size_buff is handed to strftime() as the capacity of buff; the localtime() result is not checked.
+void GetDate(char* buff,int size_buff)
+{
+    const time_t now = time(NULL);
+    const tm *local = localtime(&now);
+    strftime(buff,size_buff,"%Y-%m-%d",local);
+}
+
+
+// Selects the result record for (name, mode, threads), creating it on first use, and makes it the current one.
+// The lookup key puts the thread count before the mode for ByThreads and after it for every other SortMode.
+StatisticsCollector::TestCase StatisticsCollector::SetTestCase(const char *name, const char *mode, int threads)
+{
+    string KeyName(name);
+    if (SortMode == ByThreads)
+        KeyName += Format("_%02d_%s", threads, mode);
+    else
+        KeyName += Format("_%s_%02d", mode, threads);
+    CurrentKey = Statistics[KeyName];
+    if(!CurrentKey) {   // no record stored under this key yet
+        CurrentKey = new StatisticResults;
+        CurrentKey->Mode = mode;
+        CurrentKey->Name = name;
+        CurrentKey->Threads = threads;
+        CurrentKey->Results.reserve(RoundTitles.size());
+        Statistics[KeyName] = CurrentKey;
+    }
+    return TestCase(CurrentKey);
+}
+
+// Deletes every record stored in Statistics (they are allocated with new in SetTestCase).
+StatisticsCollector::~StatisticsCollector() {
+    for(Statistics_t::iterator it = Statistics.begin(), last = Statistics.end(); it != last; ++it)
+        delete it->second;
+}
+
+// Makes sure round `index` has a title (new rounds are titled with their 1-based number) and reserves room in every record.
+void StatisticsCollector::ReserveRounds(size_t index) {
+    const size_t known = RoundTitles.size();
+    if (index < known) return;   // this round already has a title
+    RoundTitles.resize(index+1);
+    for(size_t round = known; round <= index; ++round) {
+        char label[16];
+        snprintf( label, 15, "%u", unsigned(round+1) );
+        RoundTitles[round] = label;
+    }
+    for(Statistics_t::iterator rec = Statistics.begin(); rec != Statistics.end(); ++rec) {
+        if(rec->second) rec->second->Results.reserve(index+1);
+        else printf("!!!'%s' = NULL\n", rec->first.c_str());
+    }
+}
+// Appends v as the next round of key's record, after ReserveRounds() has covered that round index.
+void StatisticsCollector::AddRoundResult(const TestCase &key, value_t v)
+{
+    ReserveRounds(key.access->Results.size());
+    key.access->Results.push_back(v);
+}
+// Sets the title of round `index`. vargf2buff is defined outside this chunk; presumably it formats fmt and the variadic arguments into a local `buff` - confirm.
+void StatisticsCollector::SetRoundTitle(size_t index, const char *fmt, ...)
+{
+    vargf2buff(buff, 128, fmt);
+    ReserveRounds(index);  // grows RoundTitles so that RoundTitles[index] exists
+    RoundTitles[index] = buff;
+}
+// Stores a formatted analysis value of the given type in key's record and registers the type as an analysis column.
+void StatisticsCollector::AddStatisticValue(const TestCase &key, const char *type, const char *fmt, ...)
+{
+    vargf2buff(buff, 128, fmt);  // macro defined outside this chunk; presumably fills a local `buff` - confirm
+    AnalysisTitles.insert(type);
+    key.access->Analysis[type] = buff;
+}
+// Same as the overload taking a TestCase, but writes to CurrentKey (assigned in SetTestCase; it is not checked for NULL here).
+void StatisticsCollector::AddStatisticValue(const char *type, const char *fmt, ...)
+{
+    vargf2buff(buff, 128, fmt);  // macro defined outside this chunk; presumably fills a local `buff` - confirm
+    AnalysisTitles.insert(type);
+    CurrentKey->Analysis[type] = buff;
+}
+// Registers (or replaces) the formula stored under `name`; Print() expands its "ROUNDS" tokens through ExcelFormula().
+void StatisticsCollector::SetStatisticFormula(const char *name, const char *formula)
+{
+    Formulas[name] = formula;
+}
+// Sets the report title from a printf-style format (vargf2buff is defined outside this chunk; presumably it fills a local `buff` - confirm).
+void StatisticsCollector::SetTitle(const char *fmt, ...)
+{
+    vargf2buff(buff, 256, fmt);
+    Title = buff;
+}
+// Returns fmt with every "ROUNDS" token replaced by a relative R1C1 range of `rounds` cells: columns place..place+rounds-1 (horizontal) or rows place+1..place+rounds.
+string ExcelFormula(const string &fmt, size_t place, size_t rounds, bool is_horizontal) {
+    char buff[32];  // holds two 10-digit values plus the fixed text; the former 16 bytes truncated large offsets
+    if(is_horizontal)
+        snprintf(buff, sizeof(buff)-1, "RC[%u]:RC[%u]", unsigned(place), unsigned(place+rounds-1));
+    else
+        snprintf(buff, sizeof(buff)-1, "R[%u]C:R[%u]C", unsigned(place+1), unsigned(place+rounds));
+    buff[sizeof(buff)-1] = 0;  // _snprintf (used under _MSC_VER) does not terminate the buffer on truncation
+    const string range(buff); string result(fmt);
+    for ( size_t pos = 0; (pos = result.find("ROUNDS", pos, 6)) != string::npos; pos += range.size() )
+        result.replace(pos, 6, range);  // pos then skips the inserted text, so it is never rescanned
+    return result;
+}
+// Writes the collected results to every sink selected in dataOutput: console table (Stdout), <Name>.html (HTMLFile), <Name>.xml (ExcelXML).
+void StatisticsCollector::Print(int dataOutput, const char *ModeName)
+{
+    FILE *OutputFile;
+    if (dataOutput & StatisticsCollector::Stdout)
+    {
+        printf("\n-=# %s #=-\n", Title.c_str());
+        if(SortMode == ByThreads)
+            printf("    Name    |  #  | %s ", ModeName);
+        else
+            printf("    Name    | %s |  #  ", ModeName);
+        for (AnalysisTitles_t::iterator i = AnalysisTitles.begin(); i != AnalysisTitles.end(); i++)
+            printf("|%s", i->c_str()+1);  // the first character of an analysis title is not printed
+
+        for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+        {
+            if(SortMode == ByThreads)
+                printf("\n%12s|% 5d|%6s", i->second->Name.c_str(), i->second->Threads, i->second->Mode.c_str());
+            else
+                printf("\n%12s|%6s|% 5d", i->second->Name.c_str(), i->second->Mode.c_str(), i->second->Threads);
+            Analysis_t &analisis = i->second->Analysis;
+            AnalysisTitles_t::iterator t = AnalysisTitles.begin();
+            for (Analysis_t::iterator a = analisis.begin(); a != analisis.end(); t++)
+            {
+                char fmt[8]; snprintf(fmt, 7, "|%% %us", unsigned(max(size_t(3), t->size())));
+                if(*t != a->first)
+                    printf(fmt, "");  // this record has no value for title *t: print an empty cell
+                else {
+                    printf(fmt, a->second.c_str()); a++;
+                }
+            }
+        }
+        printf("\n");
+    }
+    if (dataOutput & StatisticsCollector::HTMLFile)
+    {
+        if ((OutputFile = fopen((Name+".html").c_str(), "w+t")) != NULL)
+        {
+            char TimerBuff[100], DateBuff[100];
+            GetTime(TimerBuff,sizeof(TimerBuff));
+            GetDate(DateBuff,sizeof(DateBuff));
+            fprintf(OutputFile, "<html><head>\n<title>%s</title>\n</head><body>\n", Title.c_str());
+            // Table "h": one row per test case.
+            fprintf(OutputFile, "<table id=\"h\" style=\"position:absolute;top:20\" border=1 cellspacing=0 cellpadding=2>\n");
+            fprintf(OutputFile, "<tr><td><a name=hr href=#vr onclick=\"v.style.visibility='visible';"
+                                "h.style.visibility='hidden';\">Flip[H]</a></td>"
+                                "<td>%s</td><td>%s</td><td colspan=%u>%s</td>",
+                DateBuff, TimerBuff, unsigned(AnalysisTitles.size() + RoundTitles.size()), Title.c_str());
+            fprintf(OutputFile, "</tr>\n<tr bgcolor=#CCFFFF><td>Name</td><td>Threads</td><td>%s</td>", ModeName);
+            for (AnalysisTitles_t::iterator i = AnalysisTitles.begin(); i != AnalysisTitles.end(); i++)
+                fprintf(OutputFile, "<td>%s</td>", i->c_str()+1);
+            for (size_t i = 0; i < RoundTitles.size(); i++)
+                fprintf(OutputFile, "<td>%s</td>", RoundTitles[i].c_str());
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+            {
+                fprintf(OutputFile, "</tr>\n<tr><td bgcolor=#CCFFCC>%s</td><td bgcolor=#CCFFCC>%d</td><td bgcolor=#CCFFCC>%4s</td>",
+                    i->second->Name.c_str(), i->second->Threads, i->second->Mode.c_str());
+                //statistics
+                AnalysisTitles_t::iterator t = AnalysisTitles.begin();
+                for (Analysis_t::iterator j = i->second->Analysis.begin(); j != i->second->Analysis.end(); t++)
+                {
+                    fprintf(OutputFile, "<td bgcolor=#FFFF99>%s</td>", (*t != j->first)?" ":(i->second->Analysis[j->first]).c_str());
+                    if(*t == j->first) j++;
+                }
+                //data
+                Results_t &r = i->second->Results;
+                for (size_t k = 0; k < r.size(); k++)
+                {
+                    fprintf(OutputFile, "<td>");
+                    fprintf(OutputFile, ResultsFmt, r[k]);
+                    fprintf(OutputFile, "</td>");
+                }
+            }
+            fprintf(OutputFile, "</tr>\n</table>\n");
+            // Table "v" (initially hidden): one column per test case. The colspan guard avoids size()-2 wrapping around.
+            fprintf(OutputFile, "<table id=\"v\" style=\"visibility:hidden;position:absolute;top:20\" border=1 cellspacing=0 cellpadding=2>\n");
+            fprintf(OutputFile, "<tr><td><a name=vr href=#hr onclick=\"h.style.visibility='visible';"
+                                "v.style.visibility='hidden';\">Flip[V]</a></td>\n"
+                                "<td>%s</td><td>%s</td><td colspan=%u>%s</td>",
+                DateBuff, TimerBuff, unsigned(Statistics.size() > 2 ? Statistics.size()-2 : 1), Title.c_str());
+
+            fprintf(OutputFile, "</tr>\n<tr bgcolor=#CCFFCC><td bgcolor=#CCFFFF>Name</td>");
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                fprintf(OutputFile, "<td>%s</td>", i->second->Name.c_str());
+            fprintf(OutputFile, "</tr>\n<tr bgcolor=#CCFFCC><td bgcolor=#CCFFFF>Threads</td>");
+            for (Statistics_t::iterator n = Statistics.begin(); n != Statistics.end(); n++)
+                fprintf(OutputFile, "<td>%d</td>", n->second->Threads);
+            fprintf(OutputFile, "</tr>\n<tr bgcolor=#CCFFCC><td bgcolor=#CCFFFF>%s</td>", ModeName);
+            for (Statistics_t::iterator m = Statistics.begin(); m != Statistics.end(); m++)
+                fprintf(OutputFile, "<td>%s</td>", m->second->Mode.c_str());
+
+            for (AnalysisTitles_t::iterator t = AnalysisTitles.begin(); t != AnalysisTitles.end(); t++)
+            {
+                fprintf(OutputFile, "</tr>\n<tr bgcolor=#FFFF99><td bgcolor=#CCFFFF>%s</td>", t->c_str()+1);
+                for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                    fprintf(OutputFile, "<td>%s</td>", i->second->Analysis.count(*t)?i->second->Analysis[*t].c_str():" ");
+            }
+
+            for (size_t r = 0; r < RoundTitles.size(); r++)
+            {
+                fprintf(OutputFile, "</tr>\n<tr><td bgcolor=#CCFFFF>%s</td>", RoundTitles[r].c_str());
+                for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                {
+                    Results_t &result = i->second->Results;
+                    fprintf(OutputFile, "<td>");
+                    if(result.size() > r)
+                        fprintf(OutputFile, ResultsFmt, result[r]);
+                    fprintf(OutputFile, "</td>");
+                }
+            }
+            fprintf(OutputFile, "</tr>\n</table>\n</body></html>\n");
+            fclose(OutputFile);
+        }
+    }
+    if (dataOutput & StatisticsCollector::ExcelXML)
+    {
+        if ((OutputFile = fopen((Name+".xml").c_str(), "w+t")) == NULL) {
+            printf("Can't open .xml file\n");
+        } else {
+            // The user name comes from the environment. getenv() returns NULL for an unset variable and the
+            // value can have any length, so it is held in a string instead of being copied into a fixed buffer.
+#if _WIN32 || _WIN64
+            const char *EnvUser = getenv("USERNAME");
+#else
+            const char *EnvUser = getenv("USER");
+#endif
+            const string UserName(EnvUser ? EnvUser : "");
+            char SheetName[20];
+            char TimerBuff[100], DateBuff[100];
+            //--------------------------------
+            // Sheet "Horizontal": one row per test case.
+            strcpy(SheetName,"Horizontal");
+            GetTime(TimerBuff,sizeof(TimerBuff));
+            GetDate(DateBuff,sizeof(DateBuff));
+            //--------------------------
+            fprintf(OutputFile, XMLHead, UserName.c_str(), TimerBuff);
+            fprintf(OutputFile, XMLStyles);
+            fprintf(OutputFile, XMLBeginSheet, SheetName);
+            fprintf(OutputFile, XMLNames,1,1,1,int(AnalysisTitles.size()+Formulas.size()+COUNT_PARAMETERS));
+            fprintf(OutputFile, XMLBeginTable, int(RoundTitles.size()+Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+1/*title*/), int(Statistics.size()+1));
+            fprintf(OutputFile, XMLBRow);
+            fprintf(OutputFile, XMLCellTopName);
+            fprintf(OutputFile, XMLCellTopThread);
+            fprintf(OutputFile, XMLCellTopMode, ModeName);
+            for (AnalysisTitles_t::iterator j = AnalysisTitles.begin(); j != AnalysisTitles.end(); j++)
+                fprintf(OutputFile, XMLAnalysisTitle, j->c_str()+1);
+            for (Formulas_t::iterator j = Formulas.begin(); j != Formulas.end(); j++)
+                fprintf(OutputFile, XMLAnalysisTitle, j->first.c_str()+1);
+            for (RoundTitles_t::iterator j = RoundTitles.begin(); j != RoundTitles.end(); j++)
+                fprintf(OutputFile, XMLAnalysisTitle, j->c_str());
+            fprintf(OutputFile, XMLCellEmptyWhite, Title.c_str());
+            fprintf(OutputFile, XMLERow);
+            //------------------------
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+            {
+                fprintf(OutputFile, XMLBRow);
+                fprintf(OutputFile, XMLCellName,  i->second->Name.c_str());
+                fprintf(OutputFile, XMLCellThread,i->second->Threads);
+                fprintf(OutputFile, XMLCellMode,  i->second->Mode.c_str());
+                //statistics
+                AnalysisTitles_t::iterator at = AnalysisTitles.begin();
+                for (Analysis_t::iterator j = i->second->Analysis.begin(); j != i->second->Analysis.end(); at++)
+                {
+                    fprintf(OutputFile, XMLCellAnalysis, (*at != j->first)?"":(i->second->Analysis[j->first]).c_str());
+                    if(*at == j->first) j++;
+                }
+                //formulas
+                size_t place = 0;
+                Results_t &v = i->second->Results;
+                for (Formulas_t::iterator f = Formulas.begin(); f != Formulas.end(); f++, place++)
+                    fprintf(OutputFile, XMLCellFormula, ExcelFormula(f->second, Formulas.size()-place, v.size(), true).c_str());
+                //data
+                for (size_t k = 0; k < v.size(); k++)
+                {
+                    fprintf(OutputFile, XMLCellData, v[k]);
+                }
+                if(v.size() < RoundTitles.size())
+                    fprintf(OutputFile, XMLMergeRow, int(RoundTitles.size() - v.size()));
+                fprintf(OutputFile, XMLERow);
+            }
+            //------------------------
+            fprintf(OutputFile, XMLEndTable);
+            fprintf(OutputFile, XMLWorkSheetProperties,1,1,3,3,int(RoundTitles.size()+AnalysisTitles.size()+Formulas.size()+COUNT_PARAMETERS));
+            fprintf(OutputFile, XMLAutoFilter,1,1,1,int(AnalysisTitles.size()+Formulas.size()+COUNT_PARAMETERS));
+            fprintf(OutputFile, XMLEndWorkSheet);
+            // Sheet "Vertical": one column per test case.
+            strcpy(SheetName,"Vertical");
+            fprintf(OutputFile, XMLBeginSheet, SheetName);
+            fprintf(OutputFile, XMLNames, int(Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+2),2,int(AnalysisTitles.size()+Formulas.size()+COUNT_PARAMETERS+2),int(Statistics.size()+1));
+            fprintf(OutputFile, XMLBeginTable, int(max(Statistics.size()+1, size_t(7))), int(RoundTitles.size()+AnalysisTitles.size()+Formulas.size()+COUNT_PARAMETERS+2));
+            // Row order: name/time/title, Name, Threads, Mode, one row per analysis title,
+            // one row per formula, a "Result" label row, then one row per round.
+
+            fprintf(OutputFile, XMLBRow);
+            fprintf(OutputFile, XMLNameAndTime, Name.c_str(), TimerBuff, DateBuff);
+            fprintf(OutputFile, XMLTableParamAndTitle, int(Statistics.size()), int(AnalysisTitles.size()), int(RoundTitles.size()), Title.c_str());
+            fprintf(OutputFile, XMLERow);
+            fprintf(OutputFile, XMLBRow);
+            //-------------------
+            fprintf(OutputFile, XMLCellTopName);
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                fprintf(OutputFile, XMLCellName, i->second->Name.c_str());
+            fprintf(OutputFile, XMLERow);
+            fprintf(OutputFile, XMLBRow);
+            fprintf(OutputFile, XMLCellTopThread);
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                fprintf(OutputFile, XMLCellThread, i->second->Threads);
+            fprintf(OutputFile, XMLERow);
+            fprintf(OutputFile, XMLBRow);
+            fprintf(OutputFile, XMLCellTopMode, ModeName);
+            for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                fprintf(OutputFile, XMLCellMode, i->second->Mode.c_str());
+            fprintf(OutputFile, XMLERow);
+            //-----------------
+            for (AnalysisTitles_t::iterator t = AnalysisTitles.begin(); t != AnalysisTitles.end(); t++)
+            {
+                fprintf(OutputFile, XMLBRow);
+                fprintf(OutputFile, XMLAnalysisTitle, t->c_str()+1);
+                for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                    fprintf(OutputFile, XMLCellAnalysis, i->second->Analysis.count(*t)?(i->second->Analysis[*t]).c_str():"");
+                fprintf(OutputFile, XMLERow);
+            }
+            size_t formula_row = 0;  // index of the formula row being written
+            for (Formulas_t::iterator t = Formulas.begin(); t != Formulas.end(); t++, formula_row++)
+            {
+                fprintf(OutputFile, XMLBRow);
+                fprintf(OutputFile, XMLAnalysisTitle, t->first.c_str()+1);
+                const size_t place = Formulas.size()-formula_row;  // each later formula row is one row closer to the data rows
+                for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                    fprintf(OutputFile, XMLCellAnalysis, ExcelFormula(t->second, place, i->second->Results.size(), false).c_str());
+                fprintf(OutputFile, XMLERow);
+            }
+            //--------------------------------------
+            fprintf(OutputFile, XMLBRow);
+            fprintf(OutputFile, XMLCellEmptyWhite, "Result");
+            fprintf(OutputFile, XMLERow);
+
+            for (size_t k = 0; k < RoundTitles.size(); k++)
+            {
+                fprintf(OutputFile, XMLBRow);
+                fprintf(OutputFile, XMLAnalysisTitle, RoundTitles[k].c_str());
+                for (Statistics_t::iterator i = Statistics.begin(); i != Statistics.end(); i++)
+                    if(i->second->Results.size() > k)
+                        fprintf(OutputFile, XMLCellData, i->second->Results[k]);
+                    else
+                        fprintf(OutputFile, XMLCellEmptyWhite, "");
+                fprintf(OutputFile, XMLERow);
+            }
+            fprintf(OutputFile, XMLEndTable);
+            //----------------------------------------
+            fprintf(OutputFile, XMLWorkSheetProperties, int(Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+2), int(Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+2),1,1,6);
+            fprintf(OutputFile, XMLAutoFilter, int(Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+2),2, int(Formulas.size()+AnalysisTitles.size()+COUNT_PARAMETERS+2), int(Statistics.size()+1));
+            //----------------------------------------
+            fprintf(OutputFile, XMLEndWorkSheet);
+            fprintf(OutputFile, XMLEndWorkbook);
+            fclose(OutputFile);
+        }
+    }
+}
diff --git a/dep/tbb/src/perf/statistics.h b/dep/tbb/src/perf/statistics.h
new file mode 100644
index 000000000..3066190d0
--- /dev/null
+++ b/dep/tbb/src/perf/statistics.h
@@ -0,0 +1,188 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// Internal Intel tool
+
+#ifndef __STATISTICS_H__
+#define __STATISTICS_H__
+
+#define _CRT_SECURE_NO_DEPRECATE 1
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <vector>
+#include <map>
+#include <set>
+#include <string>
+#include <time.h>
+
+using namespace std;
+typedef double value_t;
+
+/*
+   Statistical collector class.
+  
+   Resulting table output:
+        +---------------------------------------------------------------------------+
+        | [Date] <Title>...                                                         |
+        +----------+----v----+--v---+----------------+------------+-..-+------------+
+        | TestName | Threads | Mode | Rounds results | Stat_type1 | .. | Stat_typeN |
+        +----------+---------+------+-+-+-+-..-+-+-+-+------------+-..-+------------+
+        |          |         |      | | | | .. | | | |            |    |            |
+        ..        ...       ...     ..................            ......           ..
+        |          |         |      | | | | .. | | | |            |    |            |
+        +----------+---------+------+-+-+-+-..-+-+-+-+------------+-..-+------------+
+
+   Iterating table output:
+        +---------------------------------------------------------------------------+
+        | [Date] <TestName>, Threads: <N>, Mode: <M>; for <Title>...                |
+        +----------+----v----+--v---+----------------+------------+-..-+------------+
+        
+*/
+
+class StatisticsCollector // stores round results and analysis strings per test case; Print() is the output entry point
+{
+public:
+    typedef map<string, string> Analysis_t; // string -> string map, kept per test case as StatisticResults::Analysis
+    typedef vector<value_t> Results_t;      // round results of one test case
+
+protected:
+    StatisticsCollector(const StatisticsCollector &); // declared only, no body in this header - presumably to forbid copying (confirm in statistics.cpp)
+
+    struct StatisticResults // everything stored for one test case
+    {
+        string              Name;
+        string              Mode;
+        int                 Threads;
+        Results_t           Results;
+        Analysis_t          Analysis;
+    };
+
+    // internal members
+	//bool OpenFile;
+    StatisticResults *CurrentKey; // used by the overloads that take no TestCase; the constructor sets it to NULL
+    string Title;
+    const char /**Name,*/ *ResultsFmt; // format string given to the constructor (default "%g")
+	string Name; // tests set name given to the constructor
+    //! Data
+    typedef map<string, StatisticResults*> Statistics_t;
+    Statistics_t Statistics;
+    typedef vector<string> RoundTitles_t;
+    RoundTitles_t RoundTitles; // its size is what GetRoundsCount() reports
+    //TODO: merge those into one structure
+    typedef map<string, string> Formulas_t;
+    Formulas_t   Formulas;
+    typedef set<string> AnalysisTitles_t;
+    AnalysisTitles_t AnalysisTitles;
+
+public:
+    struct TestCase { // lightweight handle to one StatisticResults record
+        StatisticResults *access;
+        TestCase() : access(0) {} // null handle: the getters below dereference access unconditionally, so do not query it
+        TestCase(StatisticResults *link) : access(link) {}
+        const char *getName() const { return access->Name.c_str(); }
+        const char *getMode() const { return access->Mode.c_str(); }
+        int getThreads()       const { return access->Threads; }
+        const Results_t &getResults() const { return access->Results; }
+        const Analysis_t &getAnalysis() const { return access->Analysis; }
+    };
+
+    enum Sorting {
+        ByThreads, ByAlg
+    };
+
+    //! Data and output types
+    enum DataOutput {
+        // Verbosity level enumeration
+        Statistic = 1,     //< Analytical data - computed after all iterations and rounds passed
+        Result    = 2,     //< Testing data    - collected after all iterations passed
+        Iteration = 3,     //< Verbose data    - collected at each iteration (for each size - in case of containers)
+        // ExtraVerbose is not applicable yet :) be happy, but flexibility is always welcome
+
+        // Next constants are bit-fields
+        Stdout   = 1<<8,    //< Output to the console
+        TextFile = 1<<9,    //< Output to plain text file "name.txt" (delimiter is TAB by default)
+        ExcelXML = 1<<10,   //< Output to Excel-readable XML-file "name.xml"
+        HTMLFile = 1<<11    //< Output to HTML file "name.html"
+    };
+
+    //! Constructor. Specify the tests set name, which is used as the name of the output files
+    StatisticsCollector(const char *name, Sorting mode = ByThreads, const char *fmt = "%g")
+        :  CurrentKey(NULL), ResultsFmt(fmt), Name(name), SortMode(mode) {} // name must not be NULL: it initialises a std::string
+
+    ~StatisticsCollector();
+
+    //! Set tests set title, supporting printf-like arguments
+    void SetTitle(const char *fmt, ...);
+
+    //! Specify next test key
+    TestCase SetTestCase(const char *name, const char *mode, int threads);
+    //! Specify next test key
+    void SetTestCase(const TestCase &t) { SetTestCase(t.getName(), t.getMode(), t.getThreads()); }
+    //! Reserve specified number of rounds. Use for efficiency. Used mostly internally
+    void ReserveRounds(size_t index);
+    //! Add result of the measure
+    void AddRoundResult(const TestCase &, value_t v);
+    //! Add result of the current measure
+    void AddRoundResult(value_t v) { if(CurrentKey) AddRoundResult(TestCase(CurrentKey), v); } // silently ignored while CurrentKey is NULL
+    //! Add title of round
+    void SetRoundTitle(size_t index, const char *fmt, ...);
+    //! Add numbered title of round
+    void SetRoundTitle(size_t index, int num) { SetRoundTitle(index, "%d", num); }
+    //! Get number of rounds
+    size_t GetRoundsCount() const { return RoundTitles.size(); }
+    // Set statistic value for the test
+    void AddStatisticValue(const TestCase &, const char *type, const char *fmt, ...);
+    // Set statistic value for the current test
+    void AddStatisticValue(const char *type, const char *fmt, ...);
+    //! Add Excel-processing formulas. @arg formula can contain more than one instance of
+    //! ROUNDS template which transforms into the range of cells with result values
+    //TODO://! #1 .. #n templates represent data cells from the first to the last
+    //TODO: merge with Analysis
+    void SetStatisticFormula(const char *name, const char *formula);
+
+    //! Data output
+    void Print(int dataOutput, const char *ModeName = "Mode"); // dataOutput is built from DataOutput values (a verbosity level plus output bit-flags)
+
+private:
+    Sorting SortMode; // sorting mode chosen at construction
+};
+
+//! using: Func(const char *fmt, ...) { vargf2buff(buff, 128, fmt);... -- declares a zero-filled char name[size], formats the arguments following fmt into it (always NUL-terminated, truncated to fit) and then releases the va_list
+#define vargf2buff(name, size, fmt) char name[size]; memset(name, 0, size); va_list args; va_start(args, fmt); vsnprintf( name, size-1, fmt, args); va_end(args)
+
+inline std::string Format(const char *fmt, ...) { // printf-style formatting into a std::string
+    vargf2buff(text, 1024, fmt); // macro defined just above: declares and fills the local 1024-byte buffer `text`
+    return text; // converted to std::string on return
+}
+
+#ifdef STATISTICS_INLINE
+#include "statistics.cpp"
+#endif
+#endif //__STATISTICS_H__
diff --git a/dep/tbb/src/perf/statistics_xml.h b/dep/tbb/src/perf/statistics_xml.h
new file mode 100644
index 000000000..7521825be
--- /dev/null
+++ b/dep/tbb/src/perf/statistics_xml.h
@@ -0,0 +1,208 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+const char XMLBRow[]= // opening tag of a table row
+"   <Row>\n";
+
+const char XMLERow[]= // closing tag of a table row
+"   </Row>\n";
+
+const char XMLHead[]= // workbook prologue; printf template with two %s: the <Author> text, then the <Created> text
+"<?xml version=\"1.0\"?>\n"
+"<?mso-application progid=\"Excel.Sheet\"?>\n\
+<Workbook xmlns=\"urn:schemas-microsoft-com:office:spreadsheet\"\n\
+ xmlns:o=\"urn:schemas-microsoft-com:office:office\"\n\
+ xmlns:x=\"urn:schemas-microsoft-com:office:excel\"\n\
+ xmlns:ss=\"urn:schemas-microsoft-com:office:spreadsheet\"\n\
+ xmlns:html=\"http://www.w3.org/TR/REC-html40\">\n\
+ <DocumentProperties xmlns=\"urn:schemas-microsoft-com:office:office\">\n\
+  <Author>%s</Author>\n\
+  <Created>%s</Created>\n\
+  <Company>Intel Corporation</Company>\n\
+ </DocumentProperties>\n\
+ <ExcelWorkbook xmlns=\"urn:schemas-microsoft-com:office:excel\">\n\
+  <RefModeR1C1/>\n\
+ </ExcelWorkbook>\n";
+ 
+ const char XMLStyles[]= // style table referenced through ss:StyleID by the cell templates below: s23 borders only; s24, s25, s26 borders plus a solid fill colour
+ " <Styles>\n\
+  <Style ss:ID=\"Default\" ss:Name=\"Normal\">\n\
+   <Alignment ss:Vertical=\"Bottom\" ss:Horizontal=\"Left\" ss:WrapText=\"0\"/>\n\
+  </Style>\n\
+  <Style ss:ID=\"s26\">\n\
+   <Alignment ss:Vertical=\"Top\"  ss:Horizontal=\"Left\" ss:WrapText=\"0\"/>\n\
+   <Borders>\n\
+    <Border ss:Position=\"Bottom\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Left\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Right\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Top\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+   </Borders>\n\
+   <Interior ss:Color=\"#FFFF99\" ss:Pattern=\"Solid\"/>\n\
+  </Style>\n\
+  <Style ss:ID=\"s25\">\n\
+   <Alignment ss:Vertical=\"Top\"  ss:Horizontal=\"Left\" ss:WrapText=\"0\"/>\n\
+   <Borders>\n\
+    <Border ss:Position=\"Bottom\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Left\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Right\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Top\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+   </Borders>\n\
+   <Interior ss:Color=\"#CCFFFF\" ss:Pattern=\"Solid\"/>\n\
+  </Style>\n\
+  <Style ss:ID=\"s24\">\n\
+   <Alignment ss:Vertical=\"Top\"  ss:Horizontal=\"Left\" ss:WrapText=\"0\"/>\n\
+   <Borders>\n\
+    <Border ss:Position=\"Bottom\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Left\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Right\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Top\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+   </Borders>\n\
+   <Interior ss:Color=\"#CCFFCC\" ss:Pattern=\"Solid\"/>\n\
+  </Style>\n\
+  <Style ss:ID=\"s23\">\n\
+   <Alignment ss:Vertical=\"Top\"  ss:Horizontal=\"Left\" ss:WrapText=\"0\"/>\n\
+   <Borders>\n\
+    <Border ss:Position=\"Bottom\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Left\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Right\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+    <Border ss:Position=\"Top\" ss:LineStyle=\"Continuous\" ss:Weight=\"1\"/>\n\
+   </Borders>\n\
+  </Style>\n\
+ </Styles>\n";
+
+const char XMLBeginSheet[]= // opens a worksheet; %s is the ss:Name attribute
+" <Worksheet ss:Name=\"%s\">\n";
+
+const char XMLNames[]= // hidden _FilterDatabase named range; four %d fill the R1C1 reference R%dC%d:R%dC%d
+"  <Names>\n\
+   <NamedRange ss:Name=\"_FilterDatabase\" ss:RefersTo=\"R%dC%d:R%dC%d\" ss:Hidden=\"1\"/>\n\
+  </Names>\n";
+
+const char XMLBeginTable[]= // opens the table; %d ExpandedColumnCount, %d ExpandedRowCount
+"  <Table ss:ExpandedColumnCount=\"%d\" ss:ExpandedRowCount=\"%d\" x:FullColumns=\"1\"\n\
+   x:FullRows=\"1\">\n";
+   
+const char XMLColumsHorizontalTable[]= // fixed-width first column, then %d = ss:Index and %d = ss:Span of the following columns
+"   <Column ss:Index=\"1\" ss:Width=\"108.75\"/>\n\
+   <Column ss:Index=\"%d\" ss:Width=\"77.25\" ss:Span=\"%d\"/>\n";
+ 
+const char XMLColumsVerticalTable[]= // one column definition starting at index 1; %d = ss:Span
+"   <Column ss:Index=\"1\" ss:Width=\"77.25\" ss:Span=\"%d\"/>\n";
+
+const char XMLNameAndTime[]= // three unstyled String cells; takes three %s
+"    <Cell><Data ss:Type=\"String\">%s</Data></Cell>\n\
+    <Cell><Data ss:Type=\"String\">%s</Data></Cell>\n\
+    <Cell><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLTableParamAndTitle[]= // three Number cells (%d each) followed by one String cell (%s)
+"    <Cell><Data ss:Type=\"Number\">%d</Data></Cell>\n\
+    <Cell><Data ss:Type=\"Number\">%d</Data></Cell>\n\
+    <Cell><Data ss:Type=\"Number\">%d</Data></Cell>\n\
+    <Cell><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+//--------------
+const char XMLCellTopName[]= // header cell (style s25) with the fixed text "Name"; no format arguments
+"   <Cell ss:StyleID=\"s25\"><Data ss:Type=\"String\">Name</Data></Cell>\n";
+const char XMLCellTopThread[]= // header cell (style s25) with the fixed text "Threads"; no format arguments
+"   <Cell ss:StyleID=\"s25\"><Data ss:Type=\"String\">Threads</Data></Cell>\n";
+const char XMLCellTopMode[]= // header cell (style s25); %s is the cell text
+"   <Cell ss:StyleID=\"s25\"><Data ss:Type=\"String\">%s</Data></Cell>\n";
+//---------------------
+const char XMLAnalysisTitle[]= // title cell (style s25); %s is the cell text
+"   <Cell ss:StyleID=\"s25\"><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLCellName[]= // String cell, style s24; %s
+"    <Cell ss:StyleID=\"s24\"><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLCellThread[]= // Number cell, style s24; %d
+"    <Cell ss:StyleID=\"s24\"><Data ss:Type=\"Number\">%d</Data></Cell>\n";
+
+const char XMLCellMode[]= // String cell, style s24; %s
+"    <Cell ss:StyleID=\"s24\"><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLCellAnalysis[]= // String cell, style s26; %s
+"    <Cell ss:StyleID=\"s26\"><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLCellFormula[]= // formula cell, style s26; %s is the ss:Formula attribute, the Data element is left empty
+"    <Cell ss:StyleID=\"s26\" ss:Formula=\"%s\"><Data ss:Type=\"Number\"></Data></Cell>\n";
+
+const char XMLCellData[]= // Number cell, style s23; takes one floating-point argument (%g)
+"    <Cell ss:StyleID=\"s23\"><Data ss:Type=\"Number\">%g</Data></Cell>\n";
+
+const char XMLMergeRow[]= // empty cell, style s23, merged across %d further columns
+"   <Cell ss:StyleID=\"s23\" ss:MergeAcross=\"%d\" ><Data ss:Type=\"String\"></Data></Cell>\n";
+
+const char XMLCellEmptyWhite[]= // unstyled String cell; despite the name it takes one %s (callers may pass "")
+"    <Cell><Data ss:Type=\"String\">%s</Data></Cell>\n";
+
+const char XMLCellEmptyTitle[]= // empty cell, style s25; no format arguments
+"    <Cell ss:StyleID=\"s25\"><Data ss:Type=\"String\"></Data></Cell>\n";
+
+const char XMLEndTable[]= // closes the <Table> element
+"  </Table>\n";
+
+const char XMLAutoFilter[]= // auto-filter element; four %d fill the R1C1 range R%dC%d:R%dC%d
+"  <AutoFilter x:Range=\"R%dC%d:R%dC%d\" xmlns=\"urn:schemas-microsoft-com:office:excel\">\n\
+  </AutoFilter>\n";
+
+const char XMLEndWorkSheet[]= // closes the <Worksheet> element
+ " </Worksheet>\n";
+
+const char XMLWorkSheetProperties[]= // frozen-pane options; five %d in order: SplitHorizontal, TopRowBottomPane, SplitVertical, LeftColumnRightPane, ActiveCol
+"  <WorksheetOptions xmlns=\"urn:schemas-microsoft-com:office:excel\">\n\
+   <Unsynced/>\n\
+   <Selected/>\n\
+   <FreezePanes/>\n\
+   <FrozenNoSplit/>\n\
+   <SplitHorizontal>%d</SplitHorizontal>\n\
+   <TopRowBottomPane>%d</TopRowBottomPane>\n\
+   <SplitVertical>%d</SplitVertical>\n\
+   <LeftColumnRightPane>%d</LeftColumnRightPane>\n\
+   <ActivePane>0</ActivePane>\n\
+   <Panes>\n\
+    <Pane>\n\
+     <Number>3</Number>\n\
+    </Pane>\n\
+    <Pane>\n\
+     <Number>1</Number>\n\
+    </Pane>\n\
+    <Pane>\n\
+     <Number>2</Number>\n\
+    </Pane>\n\
+    <Pane>\n\
+     <Number>0</Number>\n\
+     <ActiveRow>0</ActiveRow>\n\
+     <ActiveCol>%d</ActiveCol>\n\
+    </Pane>\n\
+   </Panes>\n\
+   <ProtectObjects>False</ProtectObjects>\n\
+   <ProtectScenarios>False</ProtectScenarios>\n\
+  </WorksheetOptions>\n";
+
+const char XMLEndWorkbook[]= // closes the <Workbook> element opened by XMLHead
+ "</Workbook>\n";
diff --git a/dep/tbb/src/perf/time_base.cpp b/dep/tbb/src/perf/time_base.cpp
new file mode 100644
index 000000000..78cbef2e1
--- /dev/null
+++ b/dep/tbb/src/perf/time_base.cpp
@@ -0,0 +1,262 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "tbb/blocked_range.h"
+#include "tbb/parallel_for.h"
+#include "tbb/parallel_reduce.h"
+
+
+#define NRUNS               10
+#define ONE_TEST_DURATION   0.01
+
+#include "perf_util.h"
+
+
+#define NUM_CHILD_TASKS     128
+#define NUM_ROOT_TASKS      16
+
+#define N               1000000
+#define FINE_GRAIN      50
+#define MED_GRAIN       500
+#define COARSE_GRAIN    10000
+
+
+typedef ANCHOR_TYPE count_type; // ANCHOR_TYPE is not defined in this file; presumably supplied by perf_util.h or the build - confirm
+typedef tbb::blocked_range<count_type> range_type;
+
+const count_type NUM_leaf_tasks = NUM_CHILD_TASKS * NUM_ROOT_TASKS; // 128 * 16 leaves per task-tree test
+
+const count_type N_finest = (count_type)(N/log((double)N)/10); // N / ln(N) / 10, converted to count_type
+const count_type N_fine = N_finest * 10;
+
+
+class static_task_holder { // holds a pointer to one leaf task that Work1() executes directly
+public:
+    tbb::task   *my_simple_leaf_task_ptr; // assigned by the constructor
+    
+    static_task_holder (); // defined further down, after simple_leaf_task is complete
+};
+
+static static_task_holder s_tasks; // its constructor (defined later in this file) fills in the leaf-task pointer
+
+
+static size_t s_num_iterations = 0; // additions per simple_leaf_task::execute(); set by Test() before each task-tree workload
+
+
+class simple_leaf_task : public tbb::task
+{
+    task* execute () { // leaf workload: adds 0 .. s_num_iterations-1 to util::anchor, one addition per step
+        size_t step = 0;
+        while ( step < s_num_iterations )
+            util::anchor += step++;
+        return NULL;
+    }
+};
+
+class simple_root_task : public tbb::task
+{
+    task* execute () { // spawns NUM_leaf_tasks leaves as children of this task and blocks until they finish
+        set_ref_count(NUM_leaf_tasks + 1); // children plus one extra count, as the TBB task API expects before wait_for_all()
+        size_t remaining = NUM_leaf_tasks;
+        while ( remaining-- > 0 ) {
+            spawn( *new( allocate_child() ) simple_leaf_task );
+        }
+        wait_for_all();
+        return NULL;
+    }
+};
+
+void Work1 () { // sequential baseline: executes the shared leaf task NUM_leaf_tasks times on the calling thread
+    tbb::task *leaf = s_tasks.my_simple_leaf_task_ptr;
+    for ( size_t run = 0; run < NUM_leaf_tasks; ++run ) leaf->execute();
+}
+
+void Test1_1 () { // flat fan-out: an empty root task directly spawns NUM_leaf_tasks leaves from the calling thread
+    tbb::empty_task *root = new( tbb::task::allocate_root() ) tbb::empty_task;
+    root->set_ref_count(NUM_leaf_tasks + 1);
+    for ( size_t spawned = 0; spawned < NUM_leaf_tasks; ++spawned ) {
+        tbb::task *leaf = new( root->allocate_child() ) simple_leaf_task;
+        root->spawn(*leaf);
+    }
+    root->wait_for_all();
+    root->destroy(*root); // the root itself is never spawned here, so it is released explicitly
+}
+
+void Test1_2 ()
+{   // the leaves are spawned from inside simple_root_task::execute() rather than from the caller
+    tbb::task *root = new( tbb::task::allocate_root() ) simple_root_task;
+    tbb::task::spawn_root_and_wait(*root);
+}
+
+
+class children_launcher_task : public tbb::task
+{
+    task* execute () { // spawns NUM_CHILD_TASKS leaves as children of this task and blocks until they finish
+        set_ref_count(NUM_CHILD_TASKS + 1);
+        size_t to_spawn = NUM_CHILD_TASKS;
+        while ( to_spawn-- > 0 ) {
+            spawn( *new( allocate_child() ) simple_leaf_task );
+        }
+        wait_for_all();
+        return NULL;
+    }
+};
+
+class root_launcher_task : public tbb::task
+{
+    // Runs one children_launcher_task as a separate root task and waits for it.
+    task* execute () {
+        spawn_root_and_wait( *new( allocate_root() ) children_launcher_task );
+        return NULL;
+    }
+};
+
+class hierarchy_root_task : public tbb::task
+{
+    // Collects NUM_ROOT_TASKS root_launcher_task roots in a task_list and runs the whole list to completion.
+    task* execute () {
+        tbb::task_list  launchers;
+        size_t to_create = NUM_ROOT_TASKS;
+        while ( to_create-- > 0 )
+            launchers.push_back( *new( allocate_root() ) root_launcher_task );
+        spawn_root_and_wait(launchers);
+        return NULL;
+    }
+};
+
+void Test1_3 ()
+{   // three-level tree: hierarchy_root_task -> root_launcher_task -> children_launcher_task -> leaves
+    tbb::task *root = new( tbb::task::allocate_root() ) hierarchy_root_task;
+    tbb::task::spawn_root_and_wait(*root);
+}
+
+
+static size_t   s_range = N, // upper bound of the iteration range used by Work2/Test2/Work3/Test3; reassigned in Test()
+                s_grain = 1; // grain size passed to blocked_range by the same functions; reassigned in Test()
+
+class simple_pfor_body {
+public:
+    void operator()( const range_type& r ) const { // adds every index of the sub-range r to util::anchor
+        count_type idx = r.begin();
+        for( const count_type stop = r.end(); idx < stop; ++idx )
+            util::anchor += idx;
+    }
+};
+
+void Work2 () {
+    // Sequential baseline for Test2: a single body call over the whole [0, s_range) range.
+    const range_type whole(0, s_range, s_grain);
+    simple_pfor_body()(whole);
+}
+
+void Test2 () { // parallel_for over [0, s_range) with grain size s_grain; the body is the one Work2 runs sequentially
+    tbb::parallel_for( range_type(0, s_range, s_grain), simple_pfor_body() );
+}
+
+void Test2_0 () { // parallel_for over an empty range; its only call in Test() is commented out
+    volatile count_type zero = 0; // volatile: presumably keeps the compiler from folding the empty range away - confirm
+    tbb::parallel_for( range_type(0, zero, 1), simple_pfor_body() );
+}
+
+
+class simple_preduce_body { // parallel_reduce body: same additions as simple_pfor_body, plus a per-body my_sum
+public:
+    count_type my_sum;
+    simple_preduce_body () : my_sum(0) {}
+    simple_preduce_body ( simple_preduce_body&, tbb::split ) : my_sum(0) {} // splitting constructor: the new body starts from zero
+    void join( simple_preduce_body& rhs ) { my_sum += rhs.my_sum;}
+    void operator()( const range_type& r ) {
+        count_type end = r.end();
+        for( count_type i = r.begin(); i < end; ++i )
+            util::anchor += i;
+        my_sum = util::anchor; // overwrites (does not accumulate into) my_sum with the current anchor value
+    }
+};
+
+void Work3 () { // sequential baseline for Test3: a single body call over the whole [0, s_range) range
+    const range_type whole(0, s_range, s_grain);
+    simple_preduce_body reducer;
+    reducer(whole);
+}
+
+void Test3 () { // parallel_reduce over [0, s_range) with grain size s_grain; the reduced my_sum is not used afterwards
+    simple_preduce_body body;
+    tbb::parallel_reduce( range_type(0, s_range, s_grain), body );
+}
+
+void Test3_0 () { // parallel_reduce over an empty range; its only call in Test() is commented out
+    volatile count_type zero = 0; // volatile: presumably keeps the compiler from folding the empty range away - confirm
+    simple_preduce_body body;
+    tbb::parallel_reduce( range_type(0, zero, 1), body );
+}
+
+
+static_task_holder::static_task_holder () { // points the holder at a function-local static leaf task
+    static simple_leaf_task s_t1; // a plain static object, not created through a TBB allocate_*() call
+    my_simple_leaf_task_ptr = &s_t1;
+}
+
+void Test () { // runs the three task-tree tests for four workloads, then pfor and preduce for four range/grain pairs
+    const size_t num_task_tree_workloads = 4;
+    size_t task_tree_workloads[num_task_tree_workloads] = {0, 50, 500, 10000}; // additions per leaf task
+    for (size_t i = 0; i < num_task_tree_workloads; ++i ) {
+        size_t n = task_tree_workloads[i];
+        s_num_iterations = n;
+        CalcSequentialTime(Work1); // declared outside this file; called before each group of RunTest calls
+        RunTest ("Bunch of leaves: %d adds/task", n, Test1_1); // NOTE(review): n is size_t but the title uses %d - confirm how RunTest formats it
+        RunTest ("Simple task tree: %d adds/task", n, Test1_2);
+        RunTest ("Complex task tree: %d adds/task", n, Test1_3);
+    }
+
+    // Using N_fine constant in the body of this function results in incorrect code
+    // generation by icl 10.1.014
+    const size_t num_alg_workloads = 4;
+    size_t alg_ranges[num_alg_workloads] = {N_fine/10, N_fine, N, N}; // range size for workload i
+    size_t alg_grains[num_alg_workloads] = {1, FINE_GRAIN, MED_GRAIN, COARSE_GRAIN}; // grain size for workload i
+    
+    //RunTest ("Empty pfor", 0, Test2_0);
+    for (size_t i = 0; i < num_alg_workloads; ++i ) {
+        s_range = alg_ranges[i];
+        s_grain = alg_grains[i];
+        CalcSequentialTime(Work2);
+        RunTest ("pfor: %d adds/body", s_grain, Test2); // the title reports the grain size
+    }
+
+    //RunTest ("Empty preduce", Test3_0);
+    for (size_t i = 0; i < num_alg_workloads; ++i ) {
+        s_range = alg_ranges[i];
+        s_grain = alg_grains[i];
+        CalcSequentialTime(Work3);
+        RunTest ("preduce: %d adds/body", s_grain, Test3);
+    }
+}
+
+int main( int argc, char* argv[] ) { // forwards the command line to test_main(), which is declared outside this file
+    test_main(argc, argv);
+    return 0; // always reports success
+}
diff --git a/dep/tbb/src/perf/time_framework.h b/dep/tbb/src/perf/time_framework.h
new file mode 100644
index 000000000..d301c2b3d
--- /dev/null
+++ b/dep/tbb/src/perf/time_framework.h
@@ -0,0 +1,343 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TIME_FRAMEWORK_H__
+#define __TIME_FRAMEWORK_H__
+
+#include <cstdlib>
+#include <math.h>
+#include <vector>
+#include <string>
+#include <sstream>
+#include "tbb/tbb_stddef.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/tick_count.h"
+#include "../test/harness.h"
+#include "../test/harness_barrier.h"
+#define STATISTICS_INLINE
+#include "statistics.h"
+
+#ifndef ARG_TYPE
+typedef intptr_t arg_t;
+#else
+typedef ARG_TYPE arg_t;
+#endif
+
+class Timer {
+    tbb::tick_count tick; // reference point: construction time, or the point set by the last mark_time() call
+public:
+    Timer() : tick(tbb::tick_count::now()) {}
+    double get_time()  { const tbb::tick_count current(tbb::tick_count::now()); return (current - tick).seconds(); } // seconds since the reference point
+    double diff_time(const Timer &newer) { return (newer.tick - tick).seconds(); } // seconds from this timer's reference point to newer's
+    double mark_time() { const tbb::tick_count previous(tick); tick = tbb::tick_count::now(); return (tick - previous).seconds(); } // like get_time(), then restarts from now
+    double mark_time(const Timer &newer) { const double elapsed = (newer.tick - tick).seconds(); tick = newer.tick; return elapsed; } // like diff_time(), then adopts newer's reference point
+};
+
+class TesterBase /*: public tbb::internal::no_copy*/ { // base class of user testers; driven by TestRunner through the virtual hooks below
+protected:
+    friend class TestProcessor;
+    friend class TestRunner;
+
+    //! it is barrier for synchronizing between threads
+    Harness::SpinBarrier *barrier; // NULL until base_init() is called
+    
+    //! number of tests per this tester
+    const int tests_count;
+    
+    //! number of threads to operate
+    int threads_count; // not set by the constructor; assigned in base_init()
+
+    //! some value for tester
+    arg_t value; // not set by the constructor; assigned in base_init()
+
+    // avoid false sharing
+    char pad[128 - sizeof(arg_t) - sizeof(int)*2 - sizeof(void*) ];
+
+public:
+    //! init tester base. @arg ntests is number of embedded tests in this tester.
+    TesterBase(int ntests)
+        : barrier(NULL), tests_count(ntests)
+    {}
+    virtual ~TesterBase() {}
+
+    //! internal function
+    void base_init(arg_t v, int t, Harness::SpinBarrier &b) { // stores value, thread count and barrier, then calls the init() hook
+        threads_count = t;
+        barrier = &b;
+        value = v;
+        init();
+    }
+
+    //! optionally override to init after value and threads count were set.
+    virtual void init() { }
+
+    //! Override to provide your names
+    virtual std::string get_name(int testn) {
+        return Format("test %d", testn);
+    }
+
+    //! optionally override to init test mode just before execution for a given thread number.
+    virtual void test_prefix(int testn, int threadn) { }
+
+    //! Override to provide main test's entry function returns a value to record
+    virtual value_t test(int testn, int threadn) = 0;
+
+    //! Type of aggregation from results of threads
+    enum result_t {
+        SUM, AVG, MIN, MAX
+    };
+
+    //! Override to change result type for the test. Return postfix for test name or 0 if result type is not needed.
+    virtual const char *get_result_type(int /*testn*/, result_t type) const {
+        return type == AVG ? "" : 0; // only average result by default
+    }
+};
+
+/*****
+a user's tester concept:
+
+class tester: public TesterBase {
+public:
+    //! init tester with known amount of work
+    tester() : TesterBase(<user-specified tests count>) { ... }
+
+    //! run a test with sequential number @arg test_number for @arg thread.
+    / *override* / value_t test(int test_number, int thread);
+};
+
+******/
+
+template<typename Tester, int scale = 1>
+class TimeTest : public Tester {
+    /*override*/ value_t test(int testn, int threadn) { // result: wall time of Tester::test() in seconds, times scale
+        Timer stopwatch;
+        Tester::test(testn, threadn); // the wrapped tester's own return value is discarded
+        return double(scale) * stopwatch.get_time();
+    }
+};
+
+template<typename Tester>
+class NanosecPerValue : public Tester {
+    /*override*/ value_t test(int testn, int threadn) { // result: nanoseconds spent per unit of Tester::value
+        Timer timer;
+        Tester::test(testn, threadn); // the wrapped tester's own return value is discarded
+        // get_time() is in seconds; seconds -> nanoseconds is a factor of 1e9
+        return timer.get_time()*1e9/double(Tester::value);
+    }
+};
+
+template<typename Tester, int scale = 1>
+class ValuePerSecond : public Tester {
+    /*override*/ value_t test(int testn, int threadn) { // result: Tester::value divided by (elapsed seconds * scale)
+        Timer stopwatch;
+        Tester::test(testn, threadn);
+        const double scaled_seconds = stopwatch.get_time()*scale;
+        return double(Tester::value)/scaled_seconds;
+    }
+};
+
+// operate with single tester
+class TestRunner {
+    friend class TestProcessor;
+    friend struct RunArgsBody;
+    TestRunner(const TestRunner &); // don't copy
+
+    const char *tester_name;   // label given to the constructor; used as the test-case name
+    StatisticsCollector *stat; // collector of per-thread results; NULL until init()
+    std::vector<std::vector<StatisticsCollector::TestCase> > keys; // keys[testn][threadn], filled by init()
+
+public:
+    TesterBase &tester;
+
+    //! Takes ownership of @arg test: the destructor deletes it.
+    template<typename Test>
+    TestRunner(const char *name, Test *test)
+        : tester_name(name), stat(NULL), tester(*static_cast<TesterBase*>(test))
+    {}
+
+    ~TestRunner() { delete &tester; }
+
+    //! Configure the tester and register one test case per (test, thread) pair in @arg s.
+    void init(arg_t value, int threads, Harness::SpinBarrier &barrier, StatisticsCollector *s) {
+        tester.base_init(value, threads, barrier);
+        stat = s;
+        keys.resize(tester.tests_count);
+        for(int testn = 0; testn < tester.tests_count; testn++) {
+            keys[testn].resize(threads);
+            std::string test_name(tester.get_name(testn));
+            for(int threadn = 0; threadn < threads; threadn++)
+                keys[testn][threadn] = stat->SetTestCase(tester_name, test_name.c_str(), threadn);
+        }
+    }
+
+    //! Run every test of the tester for thread @arg threadn and record each result. Requires a prior init().
+    void run_test(int threadn) {
+        for(int testn = 0; testn < tester.tests_count; testn++) {
+            tester.test_prefix(testn, threadn);
+            tester.barrier->wait();                                 // <<<<<<<<<<<<<<<<< Barrier before running test mode
+            value_t result = tester.test(testn, threadn);
+            stat->AddRoundResult(keys[testn][threadn], result);
+        }
+    }
+
+    //! Aggregate the most recent per-thread result of each test (sum, min, average, max)
+    //! and add the kinds requested by the tester to @arg report.
+    void post_process(StatisticsCollector &report) {
+        static const TesterBase::result_t kinds[] = { TesterBase::SUM, TesterBase::MIN, TesterBase::AVG, TesterBase::MAX };
+        const int threads = tester.threads_count;
+        if( threads < 1 ) return; // init() registered no per-thread keys, so there is nothing to aggregate
+        for(int testn = 0; testn < tester.tests_count; testn++) {
+            const StatisticsCollector::Results_t &first = keys[testn][0].getResults();
+            if( first.empty() ) continue; // no round recorded yet: size()-1 would wrap around
+            size_t coln = first.size()-1; // index of the latest round
+            value_t rsum = first[coln];
+            value_t rmin = rsum, rmax = rsum;
+            for(int threadn = 1; threadn < threads; threadn++) {
+                value_t result = keys[testn][threadn].getResults()[coln];
+                rsum += result; // for both SUM or AVG
+                if(rmin > result) rmin = result;
+                if(rmax < result) rmax = result;
+            }
+            // one aggregated value per entry of kinds[], in the same order
+            const value_t aggregated[] = { rsum, rmin, rsum / threads, rmax };
+            std::string test_name(tester.get_name(testn));
+            for(size_t k = 0; k < sizeof(kinds)/sizeof(kinds[0]); k++) {
+                // get_result_type() returns the name postfix, or 0 when this kind is not wanted
+                const char *rname = tester.get_result_type(testn, kinds[k]);
+                if( !rname ) continue;
+                report.SetTestCase(tester_name, (test_name+rname).c_str(), threads);
+                report.AddRoundResult(aggregated[k]);
+            }
+        }
+    }
+};
+
+struct RunArgsBody { // parallel loop body: makes one thread index run every queued TestRunner in turn
+    const vector<TestRunner*> &run_list; // runners to execute; held by reference, not owned
+    RunArgsBody(const vector<TestRunner*> &a) : run_list(a) { }
+#ifndef __TBB_parallel_for_H // macro undefined (presumably tbb/parallel_for.h not included): plain thread index
+    void operator()(int thread) const {
+#else // macro defined: invoked with a blocked_range that must hold a single index
+    void operator()(const tbb::blocked_range<int> &r) const {
+        ASSERT( r.begin() + 1 == r.end(), 0); // exactly one thread index per range
+        int thread = r.begin();
+#endif // both signatures share the body below
+        for(size_t i = 0; i < run_list.size(); i++)
+            run_list[i]->run_test(thread);
+    }
+};
+
+//! Main test processor.
+/** Override or use like this:
+ class MyTestCollection : public TestProcessor {
+    void factory(arg_t value, int threads) {
+        process( value, threads,
+            run("my1", new tester<my1>() ),
+            run("my2", new tester<my2>() ),
+        end );
+        if(value == threads)
+            stat->Print();
+    }
+};
+*/
+
+class TestProcessor { // drives rounds of tests: builds runners, executes them on N threads, collects statistics
+    friend class TesterBase;
+
+    // <threads, collector>
+    typedef std::map<int, StatisticsCollector *> statistics_collection;
+    statistics_collection stat_by_threads; // owning: entries are created in process() and deleted in the destructor
+
+protected:
+    // Members
+    const char *collection_name; // name given to the constructor; the pointer is stored, not copied
+    // current stat
+    StatisticsCollector *stat; // collector for the thread count of the latest process() call
+    // token
+    size_t end; // initialised to 0; passed as the last variadic argument of process() to terminate the list
+
+public:
+    StatisticsCollector report; // summary rows, filled by TestRunner::post_process()
+
+    // token of tests list
+    template<typename Test>
+    TestRunner *run(const char *name, Test *test) { // wraps a tester for process(), which deletes the returned runner
+        return new TestRunner(name, test);
+    }
+
+    // iteration processing
+    void process(arg_t value, int threads, ...) { // variadic tail: TestRunner* items followed by a null terminator
+        // prepare items
+        stat = stat_by_threads[threads]; // operator[] inserts a null entry for a thread count not seen before
+        if(!stat) {
+            stat_by_threads[threads] = stat = new StatisticsCollector((collection_name + Format("@%d", threads)).c_str(), StatisticsCollector::ByAlg);
+            stat->SetTitle("Detailed log of %s running with %d threads.", collection_name, threads);
+        }
+        Harness::SpinBarrier barrier(threads); // one barrier shared by all runners of this round
+        // init args
+        va_list args; va_start(args, threads);
+        vector<TestRunner*> run_list; run_list.reserve(16);
+        while(true) {
+            TestRunner *item = va_arg(args, TestRunner*); // NOTE(review): callers pass the size_t member `end` as terminator but it is read as a pointer; assumes both have the same size and representation
+            if( !item ) break;
+            item->init(value, threads, barrier, stat);
+            run_list.push_back(item);
+        }
+        va_end(args);
+        std::ostringstream buf;
+        buf << value; // the round is titled with the textual form of value
+        const size_t round_number = stat->GetRoundsCount();
+        stat->SetRoundTitle(round_number, buf.str().c_str());
+        report.SetRoundTitle(round_number, buf.str().c_str());
+        // run them
+#ifndef __TBB_parallel_for_H
+        NativeParallelFor(threads, RunArgsBody(run_list));
+#else
+        tbb::parallel_for(tbb::blocked_range<int>(0,threads,1), RunArgsBody(run_list));
+#endif
+        // destroy args
+        for(size_t i = 0; i < run_list.size(); i++) {
+            run_list[i]->post_process(report); // summarise before the runner and its tester are destroyed
+            delete run_list[i];
+        }
+    }
+
+public:
+    TestProcessor(const char *name, StatisticsCollector::Sorting sort_by = StatisticsCollector::ByAlg)
+        : collection_name(name), stat(NULL), end(0), report(collection_name, sort_by) // collection_name is declared before report, so it is already set here
+    { }
+
+    ~TestProcessor() { // releases every per-thread-count collector created by process()
+        for(statistics_collection::iterator i = stat_by_threads.begin(); i != stat_by_threads.end(); i++)
+            delete i->second;
+    }
+};
+
+#endif// __TIME_FRAMEWORK_H__
diff --git a/dep/tbb/src/perf/time_hash_map.cpp b/dep/tbb/src/perf/time_hash_map.cpp
new file mode 100644
index 000000000..a72cf48a0
--- /dev/null
+++ b/dep/tbb/src/perf/time_hash_map.cpp
@@ -0,0 +1,366 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// configuration:
+
+//! enable/disable std::map tests
+#define STDTABLE 1
+
+//! enable/disable old implementation tests (correct include file also)
+#define OLDTABLE 0
+#define OLDTABLEHEADER "tbb/concurrent_hash_map-4078.h"//-4329
+
+//! enable/disable experimental implementation tests (correct include file also)
+
+#define TESTTABLE 0
+#define TESTTABLEHEADER "tbb/concurrent_unordered_map.h"
+
+//////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdlib>
+#include <math.h>
+#include "tbb/tbb_stddef.h"
+#include <vector>
+#include <map>
+// needed by hash_maps
+#include <stdexcept>
+#include <iterator>
+#include <algorithm>                 // std::swap
+#include <utility>      // Need std::pair from here
+#include "tbb/cache_aligned_allocator.h"
+#include "tbb/tbb_allocator.h"
+#include "tbb/spin_rw_mutex.h"
+#include "tbb/aligned_space.h"
+#include "tbb/atomic.h"
+// for test
+#include "tbb/spin_mutex.h"
+#include "time_framework.h"
+
+
+using namespace tbb;
+using namespace tbb::internal;
+
+struct IntHashCompare { // identity hash and equality for int keys, offered both as call operators and as hash()/equal()
+    static long hash( int x ) { return static_cast<long>(x); }
+    bool equal( int x, int y ) const { return !(x != y); }
+    size_t operator() ( int x ) const { return static_cast<size_t>(x); }
+    bool operator() ( int x, int y ) const { return equal(x, y); }
+};
+
+namespace version_current {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+    #include "tbb/concurrent_hash_map.h"
+}
+typedef version_current::tbb::concurrent_hash_map<int,int,IntHashCompare> IntTable;
+
+#if OLDTABLE
+#undef __TBB_concurrent_hash_map_H
+namespace version_base {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+    #include OLDTABLEHEADER
+}
+typedef version_base::tbb::concurrent_hash_map<int,int,IntHashCompare> OldTable;
+#endif
+
+#if TESTTABLE
+#undef __TBB_concurrent_hash_map_H
+namespace version_new {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+    #include TESTTABLEHEADER
+}
+typedef version_new::tbb::concurrent_unordered_map<int,int,IntHashCompare,IntHashCompare> TestTable;
+#define TESTTABLE 1
+#endif
+
+///////////////////////////////////////
+
+static const char *map_testnames[] = {
+    "1.insert", "2.count(w/rehash)", "3.find/wr", "4.erase"
+};
+
+template<typename TableType>
+struct TestTBBMap : TesterBase { // four-mode benchmark (insert, count, find+write, erase) over a table with accessor-style access
+    typedef typename TableType::accessor accessor;
+    typedef typename TableType::const_accessor const_accessor;
+    TableType Table;
+    int n_items; // number of keys handled by each thread
+
+    TestTBBMap() : TesterBase(4) {} // 4 test modes, one per map_testnames entry
+    void init() { n_items = value/threads_count; } // integer division: any remainder of value is not exercised
+
+    std::string get_name(int testn) {
+        return std::string(map_testnames[testn]);
+    }
+
+    double test(int test, int t) // runs mode `test` for thread `t` over keys [t*n_items, (t+1)*n_items); always returns 0
+    {
+        switch(test) {
+          case 0: // fill
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                accessor a;
+                Table.insert( a, i );
+                a->second = 0; // mode 2 asserts that it still finds this zero
+            }
+            break;
+          case 1: // work1
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                size_t c = Table.count( i );
+                ASSERT( c == 1, NULL); // every key inserted in mode 0 must be present exactly once
+            }
+            break;
+          case 2: // work2
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                accessor a;
+                Table.find( a, i ); // result ignored: the key is expected to exist after mode 0
+                ASSERT( !a->second, "A key should be incremented only once");
+                a->second += 1;
+            }
+            break;
+          case 3: // clean
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                ASSERT( Table.erase( i ), NULL); // NOTE(review): the erase itself lives inside ASSERT; relies on ASSERT always evaluating its argument
+            }
+        }
+        return 0;
+    }
+};
+
+template<typename M>
+struct TestSTLMap : TesterBase { // four modes (insert, count, increment, erase) on a std::map guarded by one mutex of type M
+    std::map<int, int> Table;
+    M mutex;
+
+    int n_items;
+    TestSTLMap() : TesterBase(4) {}
+    void init() { n_items = value/threads_count; }
+
+    std::string get_name(int testn) {
+        return std::string(map_testnames[testn]);
+    }
+
+    double test(int test, int t)
+    {
+        switch(test) {
+          case 0: // fill
+            for(int key = t*n_items, stop = key+n_items; key < stop; ++key) {
+                typename M::scoped_lock guard(mutex);
+                Table[key] = 0;
+            }
+            break;
+          case 1: // work1
+            for(int key = t*n_items, stop = key+n_items; key < stop; ++key) {
+                typename M::scoped_lock guard(mutex);
+                size_t c = Table.count(key);
+                ASSERT( c == 1, NULL);
+            }
+            break;
+          case 2: // work2
+            for(int key = t*n_items, stop = key+n_items; key < stop; ++key) {
+                typename M::scoped_lock guard(mutex);
+                ++Table[key];
+            }
+            break;
+          case 3: // clean
+            for(int key = t*n_items, stop = key+n_items; key < stop; ++key) {
+                typename M::scoped_lock guard(mutex);
+                Table.erase(key);
+            }
+        }
+        return 0;
+    }
+};
+
+class fake_mutex { // stand-in exposing the scoped_lock interface; it provides no mutual exclusion
+    int a; // dummy state, written when an attached lock is destroyed
+public:
+    class scoped_lock {
+        fake_mutex *p; // mutex this lock is attached to, or null when none
+
+    public:
+        scoped_lock() : p(0) {} // starts unattached, so the destructor has nothing to touch
+        scoped_lock( fake_mutex &m ) : p(&m) {}
+        ~scoped_lock() { if( p ) p->a = 0; } // write only through a valid pointer
+        void acquire( fake_mutex &m ) { p = &m; }
+        void release() { }
+    };
+};
+
+class test_hash_map : public TestProcessor { // test collection running the enabled map implementations through the map_testnames modes
+public:
+    test_hash_map() : TestProcessor("test_hash_map") {}
+    void factory(int value, int threads) { // one round: `value` items in total, `threads` worker threads
+        if(Verbose) printf("Processing with %d threads: %d...\n", threads, value);
+        process( value, threads,
+#if STDTABLE
+            run("std::map ", new NanosecPerValue<TestSTLMap<spin_mutex> >() ),
+#endif
+#if OLDTABLE
+            run("old::hmap", new NanosecPerValue<TestTBBMap<OldTable> >() ),
+#endif
+            run("tbb::hmap", new NanosecPerValue<TestTBBMap<IntTable> >() ),
+#if TESTTABLE
+            run("new::hmap", new NanosecPerValue<TestTBBMap<TestTable> >() ),
+#endif
+        end ); // `end` terminates the variadic runner list
+        //stat->Print(StatisticsCollector::Stdout);
+        if(value >= 2097152) stat->Print(StatisticsCollector::HTMLFile); // 2097152 is the largest size main() reaches, so the detailed log is written once per thread count
+    }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+template<typename TableType>
+struct TestHashMapFind : TesterBase {
+    typedef typename TableType::accessor accessor;
+    typedef typename TableType::const_accessor const_accessor;
+    TableType Table;
+    int n_items;
+
+    std::string get_name(int testn) {
+        return std::string(!testn?"find":"insert");
+    }
+
+    TestHashMapFind() : TesterBase(2) {}
+    void init() {
+        n_items = value/threads_count;
+        for(int i = 0; i < value; i++) {
+            accessor a; Table.insert( a, i );
+        }
+    }
+
+    double test(int test, int t)
+    {
+        switch(test) {
+          case 0: // find
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                accessor a;
+                Table.find( a, i );
+                a->second = i;
+            }
+            break;
+          case 1: // insert
+            for(int i = t*n_items, e = (t+1)*n_items; i < e; i++) {
+                accessor a;
+                Table.insert( a, i );
+                a->second = -i;
+            }
+            break;
+        }
+        return 0;
+    }
+};
+
+const int test2_size = 65536;
+int Data[test2_size];
+
+template<typename TableType>
+struct TestHashCountStrings : TesterBase { // inserts, then looks up, the keys stored in the global Data array
+    typedef typename TableType::accessor accessor;
+    typedef typename TableType::const_accessor const_accessor;
+    TableType Table;
+    int n_items;
+
+    std::string get_name(int testn) {
+        return testn ? "find" : "insert";
+    }
+
+    TestHashCountStrings() : TesterBase(2) {}
+    void init() { n_items = value/threads_count; }
+
+    // Mode 0 inserts this thread's keys; any other mode looks them up again.
+    double test(int testn, int t)
+    {
+        const int first = t*n_items, stop = first+n_items; // this thread's slice of Data
+        if( testn == 0 ) { // insert
+            for(int pos = first; pos < stop; ++pos) {
+                accessor a; Table.insert( a, Data[pos] );
+            }
+        } else { // find
+            for(int pos = first; pos < stop; ++pos) {
+                accessor a; Table.find( a, Data[pos] );
+            }
+        }
+        return 0;
+    }
+};
+
+class test_hash_map_find : public TestProcessor { // test collection for the pre-filled find/insert test and the Data-driven insert/find test
+public:
+    test_hash_map_find() : TestProcessor("test_hash_map_find") {}
+    void factory(int value, int threads) { // one round: `value` items in total, `threads` worker threads
+        if(Verbose) printf("Processing with %d threads: %d...\n", threads, value);
+        process( value, threads,
+#if OLDTABLE
+            run("Filled old::hashmap", new NanosecPerValue<TestHashMapFind<OldTable> >() ),
+#endif
+            run("Filled tbb::hashmap", new NanosecPerValue<TestHashMapFind<IntTable> >() ),
+#if TESTTABLE
+            run("Filled new::hashmap", new NanosecPerValue<TestHashMapFind<TestTable> >() ),
+#endif
+#if OLDTABLE
+            run("CountStr old::hashmap", new TimeTest<TestHashCountStrings<OldTable> >() ),
+#endif
+            run("CountStr tbb::hashmap", new TimeTest<TestHashCountStrings<IntTable> >() ),
+#if TESTTABLE
+            run("CountStr new::hashmap", new TimeTest<TestHashCountStrings<TestTable> >() ),
+#endif
+        end ); // `end` terminates the variadic runner list
+        //stat->Print(StatisticsCollector::HTMLFile);
+    }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char* argv[]) { // runs the find benchmark for every thread count, then the four-mode map benchmark for doubling thread counts and sizes
+    if(argc>1) Verbose = true;
+    MinThread = 1; MaxThread = task_scheduler_init::default_num_threads();
+    ParseCommandLine( argc, argv );
+    if( MinThread < 1 ) MinThread = 1; // a request for 0 threads would stall the t*=2 loop below (0*2 == 0) and divide by zero in init()
+
+    ASSERT(tbb_allocator<int>::allocator_type() == tbb_allocator<int>::scalable, "expecting scalable allocator library to be loaded. Please build it by:\n\t\tmake tbbmalloc");
+
+    {
+        test_hash_map_find test_find; int o = test2_size;
+        for(int i = 0; i < o; i++)
+            Data[i] = i%60; // only 60 distinct keys, repeated across the whole array
+        for( int t=MinThread; t <= MaxThread; t++)
+            test_find.factory(o, t);
+        test_find.report.SetTitle("Nanoseconds per operation of finding operation (Mode) for %d items", o);
+        test_find.report.Print(StatisticsCollector::HTMLFile|StatisticsCollector::ExcelXML);
+    }
+    {
+        test_hash_map the_test;
+        for( int t=MinThread; t <= MaxThread; t*=2) // thread counts double each step
+            for( int o=/*2048*/(1<<8)*8; o<2200000; o*=2 ) // sizes 2048, 4096, ... 2097152
+                the_test.factory(o, t);
+        the_test.report.SetTitle("Nanoseconds per operation of (Mode) for N items in container (Name)");
+        the_test.report.SetStatisticFormula("1AVG per size", "=AVERAGE(ROUNDS)");
+        the_test.report.Print(StatisticsCollector::HTMLFile|StatisticsCollector::ExcelXML);
+    }
+    return 0;
+}
+
diff --git a/dep/tbb/src/perf/time_hash_map_fill.cpp b/dep/tbb/src/perf/time_hash_map_fill.cpp
new file mode 100644
index 000000000..1b9644724
--- /dev/null
+++ b/dep/tbb/src/perf/time_hash_map_fill.cpp
@@ -0,0 +1,155 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// configuration:
+
+// Size of input array
+const int INPUT_SIZE = 2000000;
+// Specify list of unique percents to test against. Max - 10
+#define SOURCE_ARRAY UNIQUE_PERCENT(5); UNIQUE_PERCENT(10); UNIQUE_PERCENT(20); UNIQUE_PERCENT(40)
+
+// enable/disable tests for:
+#define BOX1 "TBB"
+#define BOX1TEST ValuePerSecond<Uniques<tbb::concurrent_hash_map<int,int> >, 1000000/*ns*/>
+#define BOX1HEADER "tbb/concurrent_hash_map.h"
+
+// enable/disable tests for:
+#define BOX2 "OLD"
+#define BOX2TEST ValuePerSecond<Uniques<tbb::concurrent_hash_map<int,int> >, 1000000/*ns*/>
+#define BOX2HEADER "tbb/concurrent_hash_map-5468.h"
+
+#define TBB_USE_THREADING_TOOLS 0
+//////////////////////////////////////////////////////////////////////////////////
+
+#include <cstdlib>
+#include <math.h>
+#include "tbb/tbb_stddef.h"
+#include <vector>
+#include <map>
+// needed by hash_maps
+#include <stdexcept>
+#include <iterator>
+#include <algorithm>                 // std::swap
+#include <utility>      // Need std::pair
+#include <cstring>      // Need std::memset
+#include <typeinfo>
+#include "tbb/cache_aligned_allocator.h"
+#include "tbb/tbb_allocator.h"
+#include "tbb/spin_rw_mutex.h"
+#include "tbb/aligned_space.h"
+#include "tbb/atomic.h"
+// for test
+#include "tbb/spin_mutex.h"
+#include "time_framework.h"
+
+
+using namespace tbb;
+using namespace tbb::internal;
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Input data built for SOURCE_ARRAY settings
+int Mixtures = 0;
+int Percents[10];
+int *Data[10];
+
+// Main test class used to run the timing tests. All overridden methods are called by the framework
+template<typename TableType>
+struct Uniques : TesterBase {
+    typedef typename TableType::accessor accessor;
+    typedef typename TableType::const_accessor const_accessor;
+    TableType *Table;
+    int n_items;
+
+    // Name of test mode `testn`: the percentage recorded for its input array
+    /*override*/ std::string get_name(int testn) {
+        return Format("%d%% uniques", Percents[testn]);
+    }
+
+    // One test mode per prepared input mixture; the table itself is created in test_prefix()
+    Uniques() : TesterBase(Mixtures), Table(0) {}
+    ~Uniques() { delete Table; }
+    
+    // Called once value and threads_count are known: splits the input evenly between threads
+    /*override*/ void init() {
+        n_items = value/threads_count;
+    }
+
+    // Called by every thread before a test mode starts; thread 0 swaps in a fresh table
+    /*override*/ void test_prefix(int testn, int t) {
+        barrier->wait();
+        if( t != 0 ) return;
+        delete Table;
+        Table = new TableType(MaxThread*4);
+    }
+
+    // Runs mode `testn` for thread `t`: inserts that thread's slice of the input; always returns 0
+    /*override*/ double test(int testn, int t)
+    {
+        const int *const keys = Data[testn];
+        for(int pos = t*n_items, stop = pos+n_items; pos < stop; ++pos)
+            Table->insert( std::make_pair(keys[pos],t) );
+        return 0;
+    }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+// Using BOX declarations from configuration
+#include "time_sandbox.h"
+
+// Prepares the input data for given unique percent
+inline void UNIQUE_PERCENT(int p) { // fills one more input array with values drawn from a key range of p percent of INPUT_SIZE
+    const int uniques = INPUT_SIZE/100*p; // size of the key range
+    ASSERT( Mixtures < 10 && uniques > 0, "too many UNIQUE_PERCENT entries (max 10) or a percent that yields no keys" );
+    int *const keys = Data[Mixtures] = new int[INPUT_SIZE]; // never freed in this file; lives until the process exits
+    srand(10101); // same seed for every mixture, so the input is repeatable
+    for(int i = 0; i < INPUT_SIZE; i++)
+        keys[i] = rand()%uniques;
+    Percents[Mixtures++] = p; // register the mixture once its data is complete
+}
+
+int main(int argc, char* argv[]) { // builds the input mixtures, then runs the configured BOX tests for every thread count
+    if(argc>1) Verbose = true;
+    MinThread = 1; MaxThread = task_scheduler_init::default_num_threads();
+    ParseCommandLine( argc, argv );
+    if( MinThread < 1 ) MinThread = 1; // a request for 0 threads would make Uniques::init() divide by zero
+
+    ASSERT(tbb_allocator<int>::allocator_type() == tbb_allocator<int>::scalable, "expecting scalable allocator library to be loaded. Please build it by:\n\t\tmake tbbmalloc");
+    SOURCE_ARRAY; // prepare source array
+
+    {
+        // Declares test processor
+        TEST_PROCESSOR_NAME the_test("time_hash_map_fill"/*, StatisticsCollector::ByThreads*/);
+        for( int t=MinThread; t <= MaxThread; t++)
+            the_test.factory(INPUT_SIZE, t); // executes the tests specified in BOX-es for given 'value' and threads
+        the_test.report.SetTitle("Operations per nanosecond", INPUT_SIZE); // NOTE(review): INPUT_SIZE is passed but the title has no conversion for it
+        the_test.report.Print(StatisticsCollector::HTMLFile|StatisticsCollector::ExcelXML); // Write files
+    }
+    return 0;
+}
+
diff --git a/dep/tbb/src/perf/time_locked_work.cpp b/dep/tbb/src/perf/time_locked_work.cpp
new file mode 100644
index 000000000..62b9f38a6
--- /dev/null
+++ b/dep/tbb/src/perf/time_locked_work.cpp
@@ -0,0 +1,174 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+////// Test configuration ////////////////////////////////////////////////////
+#define SECONDS_RATIO 1000000 // microseconds
+
+#ifndef REPEAT_K
+#define REPEAT_K 50 // repeat coefficient
+#endif
+
+int outer_work[] = {/*256,*/ 64, 16, 4, 0};
+int inner_work[] = {32, 8, 0 };
+
+// keep it to calibrate the time of work without synchronization
+#define BOX1 "baseline"
+#define BOX1TEST TimeTest< TBB_Mutex<tbb::null_mutex>, SECONDS_RATIO >
+
+// enable/disable tests for:
+#define BOX2 "spin_mutex"
+#define BOX2TEST TimeTest< TBB_Mutex<tbb::spin_mutex>, SECONDS_RATIO >
+
+// enable/disable tests for:
+#define BOX3 "spin_rw_mutex"
+#define BOX3TEST TimeTest< TBB_Mutex<tbb::spin_rw_mutex>, SECONDS_RATIO >
+
+// enable/disable tests for:
+#define BOX4 "queuing_mutex"
+#define BOX4TEST TimeTest< TBB_Mutex<tbb::queuing_mutex>, SECONDS_RATIO >
+
+// enable/disable tests for:
+//#define BOX5 "queuing_rw_mutex"
+#define BOX5TEST TimeTest< TBB_Mutex<tbb::queuing_rw_mutex>, SECONDS_RATIO >
+
+//////////////////////////////////////////////////////////////////////////////
+
+#include <cstdlib>
+#include <math.h>
+#include <algorithm>                 // std::swap
+#include <utility>      // Need std::pair from here
+#include <sstream>
+#include "tbb/tbb_stddef.h"
+#include "tbb/null_mutex.h"
+#include "tbb/spin_rw_mutex.h"
+#include "tbb/spin_mutex.h"
+#include "tbb/queuing_mutex.h"
+#include "tbb/queuing_rw_mutex.h"
+#include "tbb/mutex.h"
+
+#if INTEL_TRIAL==2
+#include "tbb/parallel_for.h" // enable threading by TBB scheduler
+#include "tbb/task_scheduler_init.h"
+#include "tbb/blocked_range.h" 
+#endif
+// for test
+#include "time_framework.h"
+
+using namespace tbb;
+using namespace tbb::internal;
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+//! base class for tests family
+struct TestLocks : TesterBase { // shared base of the lock tests: one test mode per outer_work entry
+    // Inherits "value", "threads_count", and other variables
+    TestLocks() : TesterBase(/*number of modes*/sizeof(outer_work)/sizeof(int)) {}
+    //! returns name of test part/mode
+    /*override*/std::string get_name(int testn) {
+        std::ostringstream buf;
+        buf.width(4); buf.fill('0'); // zero-pad the number to four characters
+        buf << outer_work[testn]; // the mode is named after its outer work amount
+        return buf.str();
+    }
+    //! enables result types and returns their suffixes
+    /*override*/const char *get_result_type(int, result_t type) const {
+        switch(type) {
+            case MIN: return " min";
+            case MAX: return " max";
+            default: return 0; // null: no row is produced for the other result types
+        }
+    }
+    //! repeats count
+    int repeat_until(int /*test_n*/) const {
+        return REPEAT_K*100;//TODO: suggest better?
+    }
+    //! fake work
+    void do_work(int work) volatile { // spins for `work` iterations; the computed value is discarded
+        for(int i = 0; i < work; i++) {
+            volatile int x = i; // volatile presumably keeps the compiler from removing the loop
+            __TBB_Pause(0); // just to call inline assembler
+            x *= work/threads_count;
+        }
+    }
+};
+
+//! template test unit for any of TBB mutexes
+template<typename M>
+struct TBB_Mutex : TestLocks { // lock test for any mutex type M that provides M::scoped_lock
+    M mutex; // the single mutex all threads contend on
+
+    double test(int testn, int /*threadn*/) // repeats unlocked outer work followed by locked inner work; always returns 0
+    {
+        for(int r = 0; r < repeat_until(testn); ++r) {
+            do_work(outer_work[testn]); // work done without holding the lock
+            {
+                typename M::scoped_lock with(mutex); // held until the end of this inner scope
+                do_work(/*inner work*/value);
+            }
+        }
+        return 0;
+    }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+//Using BOX declarations
+#include "time_sandbox.h"
+
+// run tests for each of inner work value
+void RunLoops(test_sandbox &the_test, int thread) { // one round per inner_work value, each with `thread` threads
+    int *const stop = inner_work + sizeof(inner_work)/sizeof(inner_work[0]);
+    for( int *work = inner_work; work != stop; ++work ) the_test.factory(*work, thread);
+}
+
+int main(int argc, char* argv[]) { // runs the lock tests undersubscribed, at the default thread count, and oversubscribed
+    if(argc>1) Verbose = true;
+    int DefThread = task_scheduler_init::default_num_threads();
+    MinThread = 1; MaxThread = DefThread+1; // by default also try one thread more than the default count
+    ParseCommandLine( argc, argv );
+    ASSERT(1 <= MinThread && MinThread <= MaxThread, "thread range must start at 1 or above and must not be backwards"); // a start of 0 would stall the t *= 2 loop below
+#if INTEL_TRIAL && defined(__TBB_parallel_for_H)
+    task_scheduler_init me(MaxThread);
+#endif
+    {
+        test_sandbox the_test("time_locked_work", StatisticsCollector::ByThreads);
+        //TODO: refactor this out as RunThreads(test&)
+        for( int t = MinThread; t < DefThread && t <= MaxThread; t *= 2)
+            RunLoops( the_test, t ); // execute undersubscribed threads
+        if( DefThread >= MinThread && DefThread <= MaxThread ) // >= so the run is not skipped when MinThread equals DefThread; the loop above never reaches DefThread itself
+            RunLoops( the_test, DefThread ); // execute on all hw threads
+        if( DefThread < MaxThread)
+            RunLoops( the_test, MaxThread ); // execute requested oversubscribed threads
+
+        the_test.report.SetTitle("Time of lock/unlock for mutex Name with Outer and Inner work");
+        //the_test.report.SetStatisticFormula("1AVG per size", "=AVERAGE(ROUNDS)");
+        the_test.report.Print(StatisticsCollector::HTMLFile|StatisticsCollector::ExcelXML, /*ModeName*/ "Outer work");
+    }
+    return 0;
+}
+
diff --git a/dep/tbb/src/perf/time_sandbox.h b/dep/tbb/src/perf/time_sandbox.h
new file mode 100644
index 000000000..158ebe746
--- /dev/null
+++ b/dep/tbb/src/perf/time_sandbox.h
@@ -0,0 +1,168 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#ifndef __TIME_FRAMEWORK_H__
+#error time_framework.h must be included
+#endif
+
+#ifdef BOX1 // sandbox 1 exists only when BOX1 is defined
+namespace sandbox1 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } } // inside the sandbox, tbb:: and tbb::internal:: also find the global TBB names
+#   ifdef BOX1HEADER
+#   include BOX1HEADER // optional header, included inside namespace sandbox1
+#   endif
+    typedef ::BOX1TEST testbox; // the global-scope type named by BOX1TEST becomes sandbox1::testbox
+}
+#endif
+#ifdef BOX2 // sandboxes 2..9 repeat the pattern of sandbox 1
+namespace sandbox2 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX2HEADER
+#   include BOX2HEADER
+#   endif
+    typedef ::BOX2TEST testbox;
+}
+#endif
+#ifdef BOX3
+namespace sandbox3 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX3HEADER
+#   include BOX3HEADER
+#   endif
+    typedef ::BOX3TEST testbox;
+}
+#endif
+#ifdef BOX4
+namespace sandbox4 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX4HEADER
+#   include BOX4HEADER
+#   endif
+    typedef ::BOX4TEST testbox;
+}
+#endif
+#ifdef BOX5
+namespace sandbox5 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX5HEADER
+#   include BOX5HEADER
+#   endif
+    typedef ::BOX5TEST testbox;
+}
+#endif
+#ifdef BOX6
+namespace sandbox6 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX6HEADER
+#   include BOX6HEADER
+#   endif
+    typedef ::BOX6TEST testbox;
+}
+#endif
+#ifdef BOX7
+namespace sandbox7 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX7HEADER
+#   include BOX7HEADER
+#   endif
+    typedef ::BOX7TEST testbox;
+}
+#endif
+#ifdef BOX8
+namespace sandbox8 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX8HEADER
+#   include BOX8HEADER
+#   endif
+    typedef ::BOX8TEST testbox;
+}
+#endif
+#ifdef BOX9
+namespace sandbox9 {
+    namespace tbb { using namespace ::tbb; namespace internal { using namespace ::tbb::internal; } }
+#   ifdef BOX9HEADER
+#   include BOX9HEADER
+#   endif
+    typedef ::BOX9TEST testbox;
+}
+#endif
+
+// Default TEST_PREFIX (a verbose progress message), provided only when ASSERT is defined - taken as the sign that harness.h is included - and command-line parsing is not disabled
+#if defined(ASSERT) && !HARNESS_NO_PARSE_COMMAND_LINE
+#ifndef TEST_PREFIX
+#define TEST_PREFIX if(Verbose) printf("Processing with %d threads: %ld...\n", threads, value);
+#endif
+#endif//harness included
+
+#ifndef TEST_PROCESSOR_NAME // name of the test class defined below; test_sandbox unless overridden before inclusion
+#define TEST_PROCESSOR_NAME test_sandbox
+#endif
+
+class TEST_PROCESSOR_NAME : public TestProcessor { // test driver that runs every sandbox enabled through BOX1..BOX9
+public:
+    TEST_PROCESSOR_NAME(const char *name, StatisticsCollector::Sorting sort_by = StatisticsCollector::ByAlg)
+        : TestProcessor(name, sort_by) {}
+    void factory(arg_t value, int threads) { // passes one newly allocated testbox per enabled sandbox to process(); process, run and end are presumably inherited from TestProcessor
+#ifdef TEST_PREFIX
+        TEST_PREFIX
+#endif
+        process( value, threads,
+#define RUNBOX(n) run(#n "." BOX##n, new sandbox##n::testbox() ) /* name is "<n>.<BOXn text>"; the spaces keep "." and BOXn separate tokens, since C++11 lexes "."BOX as a literal with a suffix */
+#ifdef BOX1
+        RUNBOX(1),
+#endif
+#ifdef BOX2
+        RUNBOX(2),
+#endif
+#ifdef BOX3
+        RUNBOX(3),
+#endif
+#ifdef BOX4
+        RUNBOX(4),
+#endif
+#ifdef BOX5
+        RUNBOX(5),
+#endif
+#ifdef BOX6
+        RUNBOX(6),
+#endif
+#ifdef BOX7
+        RUNBOX(7),
+#endif
+#ifdef BOX8
+        RUNBOX(8),
+#endif
+#ifdef BOX9
+        RUNBOX(9),
+#endif
+        end ); // terminates the list of runs
+#ifdef TEST_POSTFIX
+        TEST_POSTFIX
+#endif
+    }
+};
diff --git a/dep/tbb/src/perf/time_unit.cpp b/dep/tbb/src/perf/time_unit.cpp
new file mode 100644
index 000000000..8d8162dc4
--- /dev/null
+++ b/dep/tbb/src/perf/time_unit.cpp
@@ -0,0 +1,291 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "tbb/task_scheduler_init.h"
+#include "tbb/tick_count.h"
+#include <cmath>
+#include <cstdlib>
+#include <cerrno>
+#include <cfloat>
+#include <vector>
+#include <algorithm>
+
+#include "../src/test/harness.h"
+
+#if  __linux__ || __APPLE__ || __FreeBSD__
+    #include <sys/resource.h>
+#endif /* __linux__ || __APPLE__ || __FreeBSD__ */
+
+// The code, performance of which is to be measured, is surrounded by the StartSimpleTiming
+// and StopSimpleTiming macros. It is called "target code" or "code of interest" hereafter.
+//
+// The target code is executed inside the nested loop. Nesting is necessary to allow
+// measurements on arrays that fit cache of a particular level, while making the load
+// big enough to eliminate the influence of random deviations.
+//
+// Macro StartSimpleTiming defines reduction variable "util::anchor", which may be modified (usually 
+// by adding to) by the target code. This can be necessary to prevent optimizing compilers 
+// from throwing out the code of interest. Besides, if the target code is complex enough, 
+// make sure that all its branches contribute (directly or indirectly) to the value 
+// being added to the "util::anchor" variable.
+//
+// To factor out the overhead introduced by the measurement infra code it is recommended to make
+// a calibration run with the target code replaced by a no-op (but still modifying "util::anchor"),
+// and to store the resulting time in the "util::base" variable.
+//
+// A generally good approach is to make the target code use elements of a previously
+// initialized array. Then for the calibration run you just need to add the array elements
+// to the "util::anchor" variable. To get rid of memory access delays make the array small
+// enough to fit L2 or L1 cache (play with StartSimpleTiming arguments if necessary).
+//
+// Macro CalibrateSimpleTiming performs default calibration using "util::anchor += i;" operation.
+//
+// Macro ANCHOR_TYPE defines the type of the reduction variable. If it was not 
+// defined  before including this header, it is defined as size_t. Depending on 
+// the target code modern super scalar architectures may blend reduction operation
+// and instructions of interest differently for different target alternatives. So
+// you may play with the type to minimize out-of-order and parallel execution impact
+// on the calibration time veracity. You may even end up with different reduction 
+// variable types (and different calibration times) for different measurements.
+
+
+namespace util {
+
+typedef std::vector<double>    durations_t;
+
+    // Writes a 100-bucket histogram of the samples in t to the named file, or to stdout when the name is NULL.
+    void trace_histogram ( const durations_t& t, char* histogramFileName ) {
+        if ( t.empty() ) return;    // min_element/max_element would return end(), which must not be dereferenced
+        FILE* f = histogramFileName ? fopen(histogramFileName, "wt") : stdout;
+        if ( !f ) return;           // the file could not be opened, there is nothing to write to
+        const size_t n = t.size(), num_buckets = 100;
+        const double min_val = *std::min_element(t.begin(), t.end()), max_val = *std::max_element(t.begin(), t.end());
+        const double bucket_size = (max_val - min_val) / num_buckets;
+        std::vector<size_t> hist(num_buckets + 1, 0);   // one extra bucket receives the maximal value
+        for ( size_t i = 0; i < n; ++i )    // all samples equal gives bucket_size 0: count them all in bucket 0
+            ++hist[bucket_size > 0 ? size_t((t[i]-min_val)/bucket_size) : 0];
+        fprintf (f, "Histogram: nvals = %u, min = %g, max = %g, nbuckets = %u\n", (unsigned)n, min_val, max_val, (unsigned)num_buckets);
+        double bucket = min_val;
+        for ( size_t i = 0; i <= num_buckets; ++i, bucket+=bucket_size )
+            fprintf (f, "%12g\t%u\n", bucket, (unsigned)hist[i]);   // line format: bucket start, number of samples
+        if ( f != stdout ) fclose(f);   // stdout is not ours to close
+    }
+
+    // Returns the mean of the samples in d; if there are more than 5 of them, one smallest and one largest
+    // sample are dropped first. Also reports (max-min) and the standard deviation, both as percent of the mean.
+    double average ( const durations_t& d, double& variation_percent, double& std_dev_percent ) {
+        variation_percent = std_dev_percent = 0;
+        if ( d.empty() )
+            return 0;           // no samples: dereferencing end() and dividing by zero must be avoided
+        durations_t t = d;      // trim a copy, the caller's samples stay untouched
+        if ( t.size() > 5 ) {
+            t.erase(std::min_element(t.begin(), t.end()));
+            t.erase(std::max_element(t.begin(), t.end()));
+        }
+        const size_t n = t.size();
+        const double min_val = *std::min_element(t.begin(), t.end()),
+                     max_val = *std::max_element(t.begin(), t.end());
+        double sum = 0, sq_dev = 0;
+        for ( size_t i = 0; i < n; ++i )
+            sum += t[i];
+        const double avg = sum / n;
+        for ( size_t i = 0; i < n; ++i )
+            sq_dev += (t[i] - avg) * (t[i] - avg);
+        std_dev_percent = sqrt(sq_dev / n) / avg * 100;     // deviation over all n samples (divides by n, not n-1)
+        variation_percent = 100 * (max_val - min_val) / avg;
+        return avg;
+    }
+
+    static int num_threads;             // thread count of the current run; divisor for the sequential baseline
+
+    static double   base = 0,           // calibration time; subtracted from measured times
+                    base_dev = 0,
+                    base_dev_percent = 0;
+
+    static const char *empty_fmt = "";  // const: a string literal must not be bound to a plain char*
+    static int rate_field_len = 11;     // width of the "Rate" column in the printed table
+
+#if !defined(ANCHOR_TYPE)
+    #define ANCHOR_TYPE size_t
+#endif
+
+    static ANCHOR_TYPE anchor = 0;      // reduction variable updated inside the timed loops
+    
+    static double sequential_time = 0;  // baseline for the overhead column; 0 means "not set"
+
+// Single-shot timing: the statements between StartSimpleTiming and StopSimpleTiming run nOuter*nInner times (loop indices l and i).
+#define StartSimpleTiming(nOuter, nInner) {             \
+    tbb::tick_count t1, t0 = tbb::tick_count::now();    \
+    for ( size_t l = 0; l < nOuter; ++l ) {             \
+        for ( size_t i = 0; i < nInner; ++i ) {
+// Closes the loops opened by StartSimpleTiming; res receives the elapsed seconds minus util::base.
+#define StopSimpleTiming(res)                   \
+        }                                       \
+        util::anchor += (ANCHOR_TYPE)l;         \
+    }                                           \
+    t1 = tbb::tick_count::now();                \
+    printf (util::empty_fmt, util::anchor);     \
+    res = (t1-t0).seconds() - util::base;       \
+}
+// Calibration run with the default target code "util::anchor += i"; the result is stored in util::base (T is unused).
+#define CalibrateSimpleTiming(T, nOuter, nInner)    \
+    StartSimpleTiming(nOuter, nInner);              \
+        util::anchor += (ANCHOR_TYPE)i;             \
+    StopSimpleTiming(util::base);
+
+// Multi-run timing. Shared loop nest: nRuns timed runs (index k), each executing the target code nOuter*nInner times (indices l and i).
+#define StartTimingImpl(nRuns, nOuter, nInner)      \
+    tbb::tick_count t1, t0;                         \
+    for ( size_t k = 0; k < nRuns; ++k )  {         \
+        t0 = tbb::tick_count::now();                \
+        for ( size_t l = 0; l < nOuter; ++l ) {     \
+            for ( size_t i = 0; i < nInner; ++i ) {
+// Opens a timed section that keeps the per-run durations in a local vector t_.
+#define StartTiming(nRuns, nOuter, nInner) {        \
+    util::durations_t  t_(nRuns);                   \
+    StartTimingImpl(nRuns, nOuter, nInner)
+// Same as StartTiming, but the per-run durations are stored in the caller's vector vDurations (resized to nRuns).
+#define StartTimingEx(vDurations, nRuns, nOuter, nInner) {  \
+    util::durations_t  &t_ = vDurations;                    \
+    vDurations.resize(nRuns);                               \
+    StartTimingImpl(nRuns, nOuter, nInner)
+// Closes a timed section and averages the runs. NOTE: divides by nrep, which is not a macro parameter and must be a variable visible where the macro is used.
+#define StopTiming(Avg, StdDev, StdDevPercent)      \
+            }                                       \
+            util::anchor += (ANCHOR_TYPE)l;         \
+        }                                           \
+        t1 = tbb::tick_count::now();                \
+        t_[k] = (t1 - t0).seconds()/nrep;           \
+    }                                               \
+    printf (util::empty_fmt, util::anchor);         \
+    Avg = util::average(t_, StdDev, StdDevPercent); \
+}
+// Calibration run with the default target code "util::anchor += i"; the average is stored in util::base.
+#define CalibrateTiming(nRuns, nOuter, nInner)      \
+    StartTiming(nRuns, nOuter, nInner);             \
+        util::anchor += (ANCHOR_TYPE)i;             \
+    StopTiming(util::base, util::base_dev, util::base_dev_percent);
+
+} // namespace util
+
+
+#ifndef NRUNS // number of timed runs per measurement
+    #define NRUNS               7
+#endif
+
+#ifndef ONE_TEST_DURATION // target duration of one timed run, in seconds; used to scale the repeat count
+    #define ONE_TEST_DURATION   0.01
+#endif
+
+#define no_histogram  ((char*)-1) /* sentinel file name meaning "no histogram"; NULL instead selects console output */
+
+inline // Measures the average time of one pfn() call, prints a result line when title is not NULL, and returns that time minus the calibrated overhead.
+double RunTestImpl ( const char* title, void (*pfn)(), char* histogramFileName = no_histogram ) {
+    double  time = 0, variation = 0, deviation = 0;
+    size_t nrep = 1;    // repetitions per timed run; also read by the StopTiming macro
+    while (true) {      // double nrep until the net time of one repetition exceeds 1e-6 s
+        CalibrateTiming(NRUNS, 1, nrep);
+        StartTiming(NRUNS, 1, nrep);
+        pfn();
+        StopTiming(time, variation, deviation);
+        time -= util::base;
+        if ( time > 1e-6 )
+            break;
+        nrep *= 2;
+    }
+    nrep *= (size_t)ceil(ONE_TEST_DURATION/time);   // scale so that one timed run lasts about ONE_TEST_DURATION seconds
+    CalibrateTiming(NRUNS, 1, nrep);    // sets util::base
+    util::durations_t  t;               // per-run durations, kept for the optional histogram
+    StartTimingEx(t, NRUNS, 1, nrep);
+        pfn();
+    StopTiming(time, variation, deviation);
+    if ( histogramFileName != (char*)-1 )   // (char*)-1 is the no_histogram sentinel
+        util::trace_histogram(t, histogramFileName);
+    double clean_time = time - util::base;
+    if ( title ) {
+        // Deviation (in percent) is calculated for the Gross time
+        printf ("\n%-34s %.2e  %5.1f      ", title, clean_time, deviation);
+        if ( util::sequential_time != 0  )
+            //printf ("% .2e  ", clean_time - util::sequential_time);
+            printf ("% 10.1f      ", 100*(clean_time - util::sequential_time)/util::sequential_time);
+        else
+            printf ("%*s ", util::rate_field_len, "");  // no baseline: leave the overhead column blank
+        printf ("%-9u %1.6f    |", (unsigned)nrep, time * nrep);
+    }
+    return clean_time;
+}
+
+
+/// Formats the test title, then measures the test function and prints the result line.
+/** title_fmt is a printf-style format that receives workload_param converted to long.
+    If histogramFileName is a string, the histogram of individual runs is stored in a file with that name;
+    if it is NULL the histogram is printed on the console; by default no histogram is generated.
+    The histogram format is: "rate bucket start" "number of tests in this bucket". **/
+inline 
+void RunTest ( const char* title_fmt, size_t workload_param, void (*pfn_test)(), char* histogramFileName = no_histogram ) {
+    char formatted_title[1024];     // NOTE(review): sprintf is unbounded, the expanded title must stay below 1024 characters
+    const long param = (long)workload_param; sprintf(formatted_title, title_fmt, param);
+    RunTestImpl(formatted_title, pfn_test, histogramFileName);
+}
+
+inline void CalcSequentialTime ( void (*pfn)() ) {  // stores the measured time of pfn divided by the current thread count
+    const double measured = RunTestImpl(NULL, pfn);    // NULL title: no result line is printed
+    util::sequential_time = measured / util::num_threads;
+}
+
+// Clears the sequential baseline; RunTestImpl then leaves the overhead column blank.
+inline void ResetSequentialTime () {
+    util::sequential_time = 0.0;
+}
+
+
+// Prints the header line of the results table (without a trailing newline).
+inline void PrintTitle() {
+    const int rate_width = util::rate_field_len;    // width of the "Rate" column
+    printf ("%-34s %-*s Std Dev,%%  Par.overhead,%%  Repeats   Gross time  | Nruns %u, Nthreads %d", 
+            "Test name", rate_width, "Rate", NRUNS, util::num_threads);
+}
+
+void Test();
+
+inline int test_main( int argc, char* argv[] ) { // runs Test() once for every thread count from MinThread to MaxThread
+    ParseCommandLine( argc, argv );
+    ASSERT (MinThread>=2, "Minimal number of threads must be 2 or more");
+    char sample[128];   // scratch buffer: only the length of the formatted sample matters
+    const int sample_len = sprintf(sample, "%.1e", 1.1);
+    util::rate_field_len = 2 + sample_len;
+    for ( int nthreads = MinThread; nthreads <= MaxThread; ++nthreads ) {
+        tbb::task_scheduler_init init (nthreads);
+        util::num_threads = nthreads;
+        PrintTitle();
+        Test();
+        printf("\n");
+    }
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/perf/time_vector.cpp b/dep/tbb/src/perf/time_vector.cpp
new file mode 100644
index 000000000..f2d86725d
--- /dev/null
+++ b/dep/tbb/src/perf/time_vector.cpp
@@ -0,0 +1,256 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+//#define DO_SCALABLEALLOC
+
+#include <cstdlib>
+#include <cmath>
+#include <vector>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include "tbb/tbb_stddef.h"
+#include "tbb/spin_mutex.h"
+#ifdef DO_SCALABLEALLOC
+#include "tbb/scalable_allocator.h"
+#endif
+#include "tbb/concurrent_vector.h"
+#include "tbb/tbb_allocator.h"
+#include "tbb/cache_aligned_allocator.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/parallel_for.h"
+#include "tbb/tick_count.h"
+#include "tbb/blocked_range.h"
+#include "../test/harness.h"
+//#include "harness_barrier.h"
+#include "../test/harness_allocator.h"
+#define STATISTICS_INLINE
+#include "statistics.h"
+
+using namespace tbb;
+bool ExtraVerbose = false;
+
+class Timer {   // a stored time stamp plus helpers that measure intervals in seconds
+    tbb::tick_count tick;
+public:
+    Timer() : tick(tbb::tick_count::now()) {}
+    double get_time()  { const tbb::tick_count current = tbb::tick_count::now(); return (current - tick).seconds(); }
+    double diff_time(const Timer &newer) { return (newer.tick - tick).seconds(); }
+    double mark_time() { const tbb::tick_count previous = tick; tick = tbb::tick_count::now(); return (tick - previous).seconds(); }
+    double mark_time(const Timer &newer) { const tbb::tick_count previous = tick; tick = newer.tick; return (tick - previous).seconds(); }
+};
+
+/************************************************************************/
+/* TEST1                                                                */
+/************************************************************************/
+#define mk_vector_test1(v, a) vector_test1<v<Timer, static_counting_allocator<a<Timer> > >, v<double, static_counting_allocator<a<double> > > >
+template<class timers_vector_t, class values_vector_t>
+class vector_test1 { // times the creation of a vector of Timers and of a vector of doubles filled by push_back
+    const char *mode;
+    StatisticsCollector &stat;
+    StatisticsCollector::TestCase key[16]; // one handle per entry of test_names (three are used)
+
+public:
+    vector_test1(const char *m, StatisticsCollector &s)  :  mode(m), stat(s) {}
+
+    vector_test1 &operator()(size_t len) { // runs the test for vectors of len elements; returns *this
+        if(Verbose) printf("test1<%s>(%u): collecting timing statistics\n", mode, unsigned(len));
+        __TBB_ASSERT(sizeof(Timer) == sizeof(double), NULL);
+        static const char *test_names[] = {
+            "b)creation wholly",
+            "a)creation by push",
+            "c)operation time per item",
+            0 };
+        for(int i = 0; test_names[i]; ++i) key[i] = stat.SetTestCase(test_names[i], mode, len);
+
+        Timer timer0; timers_vector_t::allocator_type::init_counters();
+        timers_vector_t tv(len); // every element's constructor records its own creation time
+        Timer timer1; values_vector_t::allocator_type::init_counters();
+        values_vector_t dv;
+        for (size_t i = 0; i < len; ++i)
+            dv.push_back( i );
+        Timer timer2;
+        for (size_t i = 0; i < len; ++i)
+        {
+            dv[len-i-1] = timer0.diff_time(tv[i]); // seconds from timer0 to the creation of element i, stored in reverse order
+            tv[i].mark_time(); // re-stamp element i with the current time
+        }
+        stat.AddStatisticValue( key[2], "1total, ms", "%.3f", timer2.get_time()*1000.0 );
+        stat.AddStatisticValue( key[1], "1total, ms", "%.3f", timer1.diff_time(timer2)*1000.0 );
+        stat.AddStatisticValue( key[0], "1total, ms", "%.3f", timer0.diff_time(timer1)*1000.0 );
+        //allocator statistics
+        stat.AddStatisticValue( key[0], "2total allocations", "%d", int(timers_vector_t::allocator_type::allocations) );
+        stat.AddStatisticValue( key[1], "2total allocations", "%d", int(values_vector_t::allocator_type::allocations) );
+        stat.AddStatisticValue( key[2], "2total allocations", "%d",  0);
+        stat.AddStatisticValue( key[0], "3total alloc#items", "%d", int(timers_vector_t::allocator_type::items_allocated) );
+        stat.AddStatisticValue( key[1], "3total alloc#items", "%d", int(values_vector_t::allocator_type::items_allocated) );
+        stat.AddStatisticValue( key[2], "3total alloc#items", "%d",  0);
+        //remarks
+        stat.AddStatisticValue( key[0], "9note", "segment creation time, ns:"); // NOTE(review): the rounds below are seconds*1e6, which looks like microseconds rather than ns - confirm
+        stat.AddStatisticValue( key[2], "9note", "average op-time per item, ns:");
+        Timer last_timer(timer2); double last_value = 0;
+        for (size_t j = 0, i = 2; i < len; i *= 2, j++) { // one round per power of two below len; the round is titled with i
+            stat.AddRoundResult( key[0], (dv[len-i-1]-last_value)*1000000.0 );
+            last_value = dv[len-i-1];
+            stat.AddRoundResult( key[2], last_timer.diff_time(tv[i])/double(i)*1000000.0 );
+            last_timer = tv[i];
+            stat.SetRoundTitle(j, i);
+        }
+        tv.clear(); dv.clear();
+        //__TBB_ASSERT(timers_vector_t::allocator_type::items_allocated == timers_vector_t::allocator_type::items_freed, NULL);
+        //__TBB_ASSERT(values_vector_t::allocator_type::items_allocated == values_vector_t::allocator_type::items_freed, NULL);
+    	return *this;
+    }
+};
+
+/************************************************************************/
+/* TEST2                                                                */
+/************************************************************************/
+#define mk_vector_test2(v, a) vector_test2<v<size_t, a<size_t> > >
+template<class vector_t>
+class vector_test2 { // applies a fixed sequence of standard algorithms to two vectors and times each phase
+    const char *mode;
+    static const int ntrial = 10; // number of timed rounds per call
+    StatisticsCollector &stat;
+
+public:
+    vector_test2(const char *m, StatisticsCollector &s)  :  mode(m), stat(s) {}
+
+    vector_test2 &operator()(size_t len) { // runs ntrial rounds on vectors of len elements; returns *this
+        if(Verbose) printf("test2<%s>(%u): performing standard transformation sequence on vector\n", mode, unsigned(len));
+        StatisticsCollector::TestCase init_key = stat.SetTestCase("allocate", mode, len);
+        StatisticsCollector::TestCase fill_key = stat.SetTestCase("fill", mode, len);
+        StatisticsCollector::TestCase proc_key = stat.SetTestCase("process", mode, len);
+        StatisticsCollector::TestCase full_key = stat.SetTestCase("total time", mode, len);
+        for (int i = 0; i < ntrial; i++) {
+            Timer timer0;
+            vector_t v1(len);
+            vector_t v2(len);
+            Timer timer1; // end of the "allocate" phase
+            std::generate(v1.begin(), v1.end(), values(0));
+            std::generate(v2.begin(), v2.end(), values(size_t(-len)));
+            Timer timer2; // end of the "fill" phase
+            std::reverse(v1.rbegin(), v1.rend());
+            std::inner_product(v1.begin(), v1.end(), v2.rbegin(), 1); // result intentionally unused, only the work is timed
+            std::sort(v1.rbegin(), v1.rend());
+            std::sort(v2.rbegin(), v2.rend());
+            std::set_intersection(v1.begin(), v1.end(), v2.rbegin(), v2.rend(), v1.begin()); // NOTE(review): v1 is descending in forward order here and the output overlaps the input, both outside set_intersection's documented preconditions; kept to preserve the measured workload
+            Timer timer3; // end of the "process" phase
+            stat.AddRoundResult( proc_key, timer2.diff_time(timer3)*1000.0 );
+            stat.AddRoundResult( fill_key, timer1.diff_time(timer2)*1000.0 );
+            stat.AddRoundResult( init_key, timer0.diff_time(timer1)*1000.0 );
+            stat.AddRoundResult( full_key, timer0.diff_time(timer3)*1000.0 );
+        }
+        stat.SetStatisticFormula("1Average", "=AVERAGE(ROUNDS)");
+        stat.SetStatisticFormula("2+/-", "=(MAX(ROUNDS)-MIN(ROUNDS))/2");
+        return *this;
+    }
+
+    class values // generator functor passed to std::generate
+    {
+        size_t value;
+    public:
+        values(size_t i) : value(i) {}
+        size_t operator()() {
+            const size_t cur = value++; return cur%(1|(cur^55)); // read the counter once: "value++ % f(value)" read and modified value without sequencing
+        }
+    };
+};
+
+/************************************************************************/
+/* TEST3                                                                */
+/************************************************************************/
+#define mk_vector_test3(v, a) vector_test3<v<char, local_counting_allocator<a<char>, size_t > > >
+template<class vector_t>
+class vector_test3 { // records allocator counters of vectors filled with 0, 1, 2, 4, ... elements
+    const char *mode;
+    StatisticsCollector &stat;
+
+public:
+    vector_test3(const char *m, StatisticsCollector &s)  :  mode(m), stat(s) {}
+
+    vector_test3 &operator()(size_t len) { // len is only used to label the test cases; returns *this
+        if(Verbose) printf("test3<%s>(%u): collecting allocator statistics\n", mode, unsigned(len));
+        static const size_t sz = 1024;
+        vector_t V[sz]; // the vectors are split into groups of 512, 256, ..., 1
+        StatisticsCollector::TestCase vinst_key = stat.SetTestCase("instances number", mode, len);
+        StatisticsCollector::TestCase count_key = stat.SetTestCase("allocations count", mode, len);
+        StatisticsCollector::TestCase items_key = stat.SetTestCase("allocated items", mode, len);
+        //stat.ReserveRounds(sz-1);
+        for (size_t c = 0, i = 0, s = sz/2; s >= 1 && i < sz; s /= 2, c++) // c: round number, s: group size, i: next unused vector
+        {
+            const size_t count = c? 1<<(c-1) : 0; // elements pushed into each vector of this group
+            for (size_t e = i+s; i < e; i++) { // advances the shared index i over the group
+                //if(count >= 16) V[i].reserve(count);
+                for (size_t j = 0; j < count; j++)
+                    V[i].push_back(j);
+            }
+            stat.SetRoundTitle ( c, count );
+            stat.AddRoundResult( vinst_key, s );
+            stat.AddRoundResult( count_key, V[i-1].get_allocator().allocations ); // counters of the last vector of the group
+            stat.AddRoundResult( items_key, V[i-1].get_allocator().items_allocated );
+        }
+        return *this;
+    }
+};
+
+/************************************************************************/
+/* TYPES SET FOR TESTS                                                  */
+/************************************************************************/
+#define types_set(n, title, op) /* runs vector test #n, applying op to it, for six container/allocator pairs and prints the collected report */ { StatisticsCollector Collector("time_vector"#n); Collector.SetTitle title; \
+    {mk_vector_test##n(tbb::concurrent_vector, tbb::cache_aligned_allocator) ("TBB:NFS", Collector)op;} \
+    {mk_vector_test##n(tbb::concurrent_vector, tbb::tbb_allocator)           ("TBB:TBB", Collector)op;} \
+    {mk_vector_test##n(tbb::concurrent_vector, std::allocator)               ("TBB:STD", Collector)op;} \
+    {mk_vector_test##n(std::vector, tbb::cache_aligned_allocator)            ("STL:NFS", Collector)op;} \
+    {mk_vector_test##n(std::vector, tbb::tbb_allocator)                      ("STL:TBB", Collector)op;} \
+    {mk_vector_test##n(std::vector, std::allocator)                          ("STL:STD", Collector)op;} \
+    Collector.Print(StatisticsCollector::Stdout|StatisticsCollector::HTMLFile|StatisticsCollector::ExcelXML); }
+
+
+/************************************************************************/
+/* MAIN DRIVER                                                          */
+/************************************************************************/
+int main(int argc, char* argv[]) {
+    if( argc > 1 ) Verbose = true;      // any argument turns verbose output on
+    if( argc > 2 ) ExtraVerbose = true; // a second argument turns extra verbosity on
+    MinThread = 0; MaxThread = 500000; // use in another meaning - test#:problem size
+    ParseCommandLine( argc, argv );
+    const bool run_all = (MinThread == 0);  // test number 0 selects every test
+    ASSERT(tbb_allocator<int>::allocator_type() == tbb_allocator<int>::scalable, "expecting scalable allocator library to be loaded");
+
+    if( run_all || MinThread == 1 )
+        types_set(1, ("Vectors performance test #1 for %d", MaxThread), (MaxThread) )
+    if( run_all || MinThread == 2 )
+        types_set(2, ("Vectors performance test #2 for %d", MaxThread), (MaxThread) )
+    if( run_all || MinThread == 3 )
+        types_set(3, ("Vectors performance test #3 for %d", MaxThread), (MaxThread) )
+
+    if(!Verbose) printf("done\n");
+    return 0;
+}
+
diff --git a/dep/tbb/src/rml/perfor/omp_nested.cpp b/dep/tbb/src/rml/perfor/omp_nested.cpp
new file mode 100644
index 000000000..b63358cd3
--- /dev/null
+++ b/dep/tbb/src/rml/perfor/omp_nested.cpp
@@ -0,0 +1,135 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <float.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <omp.h>
+#include <assert.h>
+
+#include "thread_level.h"
+
+using namespace std;
+using namespace tbb;
+
+// Algorithm parameters: default team-size limits used when no -o/-i option overrides them.
+const int Max_OMP_Outer_Threads = 16;
+const int Max_OMP_Inner_Threads = 16;
+
+// Global variables: effective limits. main() hands their addresses to process_args and then
+// uses them in the num_threads clauses of the outer and inner parallel regions.
+int max_outer_threads = Max_OMP_Outer_Threads;
+int max_inner_threads = Max_OMP_Inner_Threads;
+
+// Write the usage summary for this program to stderr.
+// prog_name is printed in the heading line (callers pass argv[0]).
+void help_message(char *prog_name) {
+  // The option list contains no conversions, so it is emitted as plain text.
+  static const char option_summary[] =
+      "  Parameters:\n"
+      "    -o<num> : max # of threads OMP should use at outer level\n"
+      "    -i<num> : max # of threads OMP should use at inner level\n"
+      "\n  Help:\n"
+      "    -h : print this help message\n";
+
+  fprintf(stderr, "\n%s usage:\n", prog_name);
+  fputs(option_summary, stderr);
+}
+
+// Process command-line arguments.
+//
+//   -i<num>  store <num> in *max_inner_t
+//   -o<num>  store <num> in *max_outer_t
+//   -h       print the usage text and exit(0)
+//
+// <num> must parse as an integer >= 1. A rejected value is reported on stderr
+// together with the usage text and the corresponding output is left unchanged,
+// so the caller never ends up with a zero or negative thread count.
+// Anything else on the command line is reported and ignored.
+void process_args(int argc, char *argv[], int *max_outer_t, int *max_inner_t) {
+  for (int i=1; i<argc; ++i) {
+    if (argv[i][0] == '-') {
+      switch (argv[i][1]) {
+      case 'i': { // set max_inner_threads
+        // Parse into a temporary: writing through max_inner_t directly would
+        // keep a value that is rejected just below.
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_inner_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -i option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'o': { // set max_outer_threads
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_outer_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -o option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'h': // print help message
+        help_message(argv[0]);
+        exit(0);
+        break;
+      default:
+        fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+        help_message(argv[0]);
+        break;
+      }
+    } else {
+      fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+      help_message(argv[0]);
+    }
+  }
+}
+
+// Driver: opens an outer OpenMP parallel region in which thread 0 opens a second, nested
+// region, while TotalThreadLevel logs the thread counts each region reports.
+// Prints the elapsed wall-clock time and writes the recorded log via TotalThreadLevel.dump().
+int main(int argc, char *argv[]) { 
+  process_args(argc, argv, &max_outer_threads, &max_inner_threads);
+  TotalThreadLevel.init();
+
+  double start, end;
+  start = omp_get_wtime( );
+  
+#pragma omp parallel num_threads(max_outer_threads)
+  {
+    int omp_thread = omp_get_thread_num();
+    // Thread 0 accounts for the whole outer team in one step.
+    if (omp_thread == 0)
+      TotalThreadLevel.change_level(omp_get_num_threads(), omp_outer);
+    if (omp_thread == 0) {
+      sleep(3);
+      // Take one thread off the outer count for the duration of the nested region.
+      TotalThreadLevel.change_level(-1, omp_outer);
+      // NOTE(review): nothing in this function enables nested parallelism; presumably it is
+      // switched on through the environment - confirm.
+#pragma omp parallel num_threads(max_inner_threads)
+      {
+	int my_omp_thread = omp_get_thread_num();
+	// Inner thread 0 adds the inner team size before the print and removes it afterwards.
+	if (my_omp_thread == 0)
+	  TotalThreadLevel.change_level(omp_get_num_threads(), omp_inner);
+	printf("Inner thread %d nested inside outer thread %d\n", my_omp_thread, omp_thread);
+	if (my_omp_thread == 0)
+	  TotalThreadLevel.change_level(-omp_get_num_threads(), omp_inner);
+      }
+      TotalThreadLevel.change_level(1, omp_outer);
+    }
+    else {
+      // Every other outer thread just sleeps, longer than thread 0 did before its nested region.
+      sleep(6);
+    }
+    // Thread 0 removes the whole outer team from the count again.
+    if (omp_thread == 0)
+      TotalThreadLevel.change_level(-omp_get_num_threads(), omp_outer);
+  }
+  end = omp_get_wtime( );
+  printf("Simple test of nested OMP (%d outer threads max, %d inner threads max) took: %6.6f\n",
+	 max_outer_threads, max_inner_threads, end-start);
+  TotalThreadLevel.dump();
+  return 0;
+}
diff --git a/dep/tbb/src/rml/perfor/omp_simple.cpp b/dep/tbb/src/rml/perfor/omp_simple.cpp
new file mode 100644
index 000000000..34367780f
--- /dev/null
+++ b/dep/tbb/src/rml/perfor/omp_simple.cpp
@@ -0,0 +1,159 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <float.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <omp.h>
+#include <assert.h>
+
+#include "thread_level.h"
+#define LOG_THREADS
+
+#include "tbb/task.h"
+#include "tbb/tick_count.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/scalable_allocator.h"
+#include "tbb/parallel_for.h"
+#include "tbb/blocked_range.h"
+
+using namespace std;
+using namespace tbb;
+
+// Algorithm parameters: default thread limits used when no -t/-o option overrides them.
+const int Max_TBB_Threads = 16;
+const int Max_OMP_Threads = 16;
+
+// Global variables: effective limits. main() hands their addresses to process_args, then passes
+// max_omp_threads to the num_threads clause and max_tbb_threads to task_scheduler_init.
+int max_tbb_threads = Max_TBB_Threads;
+int max_omp_threads = Max_OMP_Threads;
+
+// Write the usage summary for this program to stderr.
+// prog_name is printed in the heading line (callers pass argv[0]).
+void help_message(char *prog_name) {
+  // The option list contains no conversions, so it is emitted as plain text.
+  static const char option_summary[] =
+      "  Parameters:\n"
+      "    -t<num> : max # of threads TBB should use\n"
+      "    -o<num> : max # of threads OMP should use\n"
+      "\n  Help:\n"
+      "    -h : print this help message\n";
+
+  fprintf(stderr, "\n%s usage:\n", prog_name);
+  fputs(option_summary, stderr);
+}
+
+// Process command-line arguments.
+//
+//   -t<num>  store <num> in *max_tbb_t
+//   -o<num>  store <num> in *max_omp_t
+//   -h       print the usage text and exit(0)
+//
+// <num> must parse as an integer >= 1. A rejected value is reported on stderr
+// together with the usage text and the corresponding output is left unchanged,
+// so the caller never ends up with a zero or negative thread count.
+// Anything else on the command line is reported and ignored.
+void process_args(int argc, char *argv[], int *max_tbb_t, int *max_omp_t) {
+  for (int i=1; i<argc; ++i) {
+    if (argv[i][0] == '-') {
+      switch (argv[i][1]) {
+      case 't': { // set max_tbb_threads
+        // Parse into a temporary: writing through max_tbb_t directly would
+        // keep a value that is rejected just below.
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_tbb_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -t option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'o': { // set max_omp_threads
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_omp_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -o option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'h': // print help message
+        help_message(argv[0]);
+        exit(0);
+        break;
+      default:
+        fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+        help_message(argv[0]);
+        break;
+      }
+    } else {
+      fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+      help_message(argv[0]);
+    }
+  }
+}
+
+// Driver: opens an OpenMP parallel region in which every thread creates a TBB scheduler and
+// thread 0 additionally runs a TBB parallel_for over [0, 1000).
+// Thread-count bookkeeping through TotalThreadLevel is compiled in only when LOG_THREADS is defined.
+// Prints the elapsed time measured with tbb::tick_count.
+int main(int argc, char *argv[]) { 
+  process_args(argc, argv, &max_tbb_threads, &max_omp_threads);
+  TotalThreadLevel.init();
+
+  tick_count start, end;
+  start = tick_count::now();
+  
+#pragma omp parallel num_threads(max_omp_threads)
+  {
+    int omp_thread = omp_get_thread_num();
+#ifdef LOG_THREADS
+    // Thread 0 accounts for the whole OpenMP team in one step.
+    if (omp_thread == 0)
+      TotalThreadLevel.change_level(omp_get_num_threads(), omp_outer);
+#endif
+    // Each OpenMP thread constructs its own task_scheduler_init object.
+    task_scheduler_init phase(max_tbb_threads);
+    if (omp_thread == 0) {
+      sleep(3);
+#ifdef LOG_THREADS
+      // Take one thread off the OpenMP count while the TBB loop runs.
+      TotalThreadLevel.change_level(-1, omp_outer);
+#endif
+      parallel_for(blocked_range<size_t>(0, 1000), 
+		   [=](const blocked_range<size_t>& range) {
+#ifdef LOG_THREADS
+	// Each invocation of the body counts as one tbb_inner thread while it runs.
+	TotalThreadLevel.change_level(1, tbb_inner);
+#endif
+#pragma ivdep
+	for (size_t i=range.begin(); i!=range.end(); ++i) {
+	  // Only the first index of each sub-range produces output.
+	  if (i==range.begin())
+	    printf("TBB range starting at %d on OMP thread %d\n", (int)i, omp_thread);
+	}
+#ifdef LOG_THREADS
+	TotalThreadLevel.change_level(-1, tbb_inner);
+#endif
+      }, auto_partitioner());
+#ifdef LOG_THREADS
+      TotalThreadLevel.change_level(1, omp_outer);
+#endif
+    }
+    else {
+      // Every other OpenMP thread just sleeps.
+      sleep(6);
+    }
+#ifdef LOG_THREADS
+    // Thread 0 removes the whole OpenMP team from the count again.
+    if (omp_thread == 0)
+      TotalThreadLevel.change_level(-omp_get_num_threads(), omp_outer);
+#endif
+  }
+  end = tick_count::now();
+  printf("Simple test of OMP (%d threads max) with TBB (%d threads max) inside took: %6.6f\n",
+	 max_omp_threads, max_tbb_threads, (end-start).seconds());
+#ifdef LOG_THREADS
+  TotalThreadLevel.dump();
+#endif
+  return 0;
+}
diff --git a/dep/tbb/src/rml/perfor/tbb_multi_omp.cpp b/dep/tbb/src/rml/perfor/tbb_multi_omp.cpp
new file mode 100644
index 000000000..c3432f2c9
--- /dev/null
+++ b/dep/tbb/src/rml/perfor/tbb_multi_omp.cpp
@@ -0,0 +1,168 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <float.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <omp.h>
+#include <assert.h>
+
+#include "thread_level.h"
+
+#include "tbb/task.h"
+#include "tbb/tick_count.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/scalable_allocator.h"
+
+using namespace std;
+using namespace tbb;
+
+// Algorithm parameters: default thread limits used when no -t/-o option overrides them.
+const int Max_TBB_Threads = 16;
+const int Max_OMP_Threads = 16;
+
+// Global variables: effective limits. main() hands their addresses to process_args;
+// max_tbb_threads goes to task_scheduler_init and max_omp_threads to omp_set_num_threads.
+int max_tbb_threads = Max_TBB_Threads;
+int max_omp_threads = Max_OMP_Threads;
+
+// Write the usage summary for this program to stderr.
+// prog_name is printed in the heading line (callers pass argv[0]).
+void help_message(char *prog_name) {
+  // The option list contains no conversions, so it is emitted as plain text.
+  static const char option_summary[] =
+      "  Parameters:\n"
+      "    -t<num> : max # of threads TBB should use\n"
+      "    -o<num> : max # of threads OMP should use\n"
+      "\n  Help:\n"
+      "    -h : print this help message\n";
+
+  fprintf(stderr, "\n%s usage:\n", prog_name);
+  fputs(option_summary, stderr);
+}
+
+// Process command-line arguments.
+//
+//   -t<num>  store <num> in *max_tbb_t
+//   -o<num>  store <num> in *max_omp_t
+//   -h       print the usage text and exit(0)
+//
+// <num> must parse as an integer >= 1. A rejected value is reported on stderr
+// together with the usage text and the corresponding output is left unchanged,
+// so the caller never ends up with a zero or negative thread count.
+// Anything else on the command line is reported and ignored.
+void process_args(int argc, char *argv[], int *max_tbb_t, int *max_omp_t) {
+  for (int i=1; i<argc; ++i) {
+    if (argv[i][0] == '-') {
+      switch (argv[i][1]) {
+      case 't': { // set max_tbb_threads
+        // Parse into a temporary: writing through max_tbb_t directly would
+        // keep a value that is rejected just below.
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_tbb_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -t option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'o': { // set max_omp_threads
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_omp_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -o option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'h': // print help message
+        help_message(argv[0]);
+        exit(0);
+        break;
+      default:
+        fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+        help_message(argv[0]);
+        break;
+      }
+    } else {
+      fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+      help_message(argv[0]);
+    }
+  }
+}
+
+// TBB task used by this benchmark.
+// A non-leaf task spawns a fixed number of leaf children and waits for them.
+// A leaf with an even id sleeps and then opens an OpenMP parallel region; a leaf with
+// an odd id only sleeps. Every phase is reported to TotalThreadLevel.
+class SimpleTask : public task {
+  bool isLeaf;
+  int myId;
+public:
+  SimpleTask(bool isLeaf_, int myId_) : isLeaf(isLeaf_), myId(myId_) {}
+  task* execute() {
+    TotalThreadLevel.change_level(1, tbb_outer);
+    omp_set_num_threads(max_omp_threads);
+    if (!isLeaf) {
+      // Number of leaf tasks to spawn; the reference count is one higher
+      // because wait_for_all() needs an extra reference for the waiting task.
+      const int n_children = 64;
+      set_ref_count(n_children+1);
+      for (int i=0; i<n_children; ++i) {
+	SimpleTask& st = *new(allocate_child()) SimpleTask(true, i);
+	spawn(st);
+      }
+      // This task is not counted as active while it waits for its children.
+      TotalThreadLevel.change_level(-1, tbb_outer);
+      wait_for_all();
+      TotalThreadLevel.change_level(1, tbb_outer);
+    }
+    else {
+      if (myId%2 == 0) {
+	sleep(3);
+	// Hand this task's slot over to the OpenMP team for the duration of the region.
+	TotalThreadLevel.change_level(-1, tbb_outer);
+#pragma omp parallel
+	{
+	  if (omp_get_thread_num() == 0) {
+	    TotalThreadLevel.change_level(omp_get_num_threads(), omp_inner);
+	  }
+	  // Report the id of the task actually running the region: every
+	  // even-numbered leaf gets here, not just task 0.
+	  printf("In OMP parallel region on TBB task with myId=%d: thread %d of %d\n", 
+		 myId, omp_get_thread_num(), omp_get_num_threads());
+	  if (omp_get_thread_num() == 0) {
+	    TotalThreadLevel.change_level(-omp_get_num_threads(), omp_inner);
+	  }
+	}
+	TotalThreadLevel.change_level(1, tbb_outer);
+      }
+      else {
+	sleep(6);
+      }
+    }
+    TotalThreadLevel.change_level(-1, tbb_outer);
+    return NULL;
+  }
+};
+
+
+// Driver: runs one root SimpleTask (which spawns the leaf tasks) under a TBB scheduler
+// limited to max_tbb_threads, prints the elapsed time and writes the thread-level log.
+int main(int argc, char *argv[]) { 
+  TotalThreadLevel.init();
+  // The main thread counts as one tbb_outer thread except while it waits on the root task.
+  TotalThreadLevel.change_level(1, tbb_outer);
+  process_args(argc, argv, &max_tbb_threads, &max_omp_threads);
+
+  task_scheduler_init phase(max_tbb_threads);
+  tick_count start, end;
+  start = tick_count::now();
+  SimpleTask& st = *new(task::allocate_root()) SimpleTask(false, -1);
+  TotalThreadLevel.change_level(-1, tbb_outer);
+  task::spawn_root_and_wait(st);
+  TotalThreadLevel.change_level(1, tbb_outer);
+  end = tick_count::now();
+  printf("Simple Test of TBB (%d threads max) with OMP (%d threads max) inside took: %6.6f\n", 
+	 max_tbb_threads, max_omp_threads, (end-start).seconds());
+
+  TotalThreadLevel.change_level(-1, tbb_outer);
+  TotalThreadLevel.dump();
+  return 0;
+}
diff --git a/dep/tbb/src/rml/perfor/tbb_simple.cpp b/dep/tbb/src/rml/perfor/tbb_simple.cpp
new file mode 100644
index 000000000..a72ed0db2
--- /dev/null
+++ b/dep/tbb/src/rml/perfor/tbb_simple.cpp
@@ -0,0 +1,167 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <float.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <omp.h>
+#include <assert.h>
+
+#include "thread_level.h"
+
+#include "tbb/task.h"
+#include "tbb/tick_count.h"
+#include "tbb/task_scheduler_init.h"
+#include "tbb/scalable_allocator.h"
+
+using namespace std;
+using namespace tbb;
+
+// Algorithm parameters: default thread limits used when no -t/-o option overrides them.
+const int Max_TBB_Threads = 16;
+const int Max_OMP_Threads = 16;
+
+// Global variables: effective limits. main() hands their addresses to process_args;
+// max_tbb_threads goes to task_scheduler_init and max_omp_threads to omp_set_num_threads.
+int max_tbb_threads = Max_TBB_Threads;
+int max_omp_threads = Max_OMP_Threads;
+
+// Write the usage summary for this program to stderr.
+// prog_name is printed in the heading line (callers pass argv[0]).
+void help_message(char *prog_name) {
+  // The option list contains no conversions, so it is emitted as plain text.
+  static const char option_summary[] =
+      "  Parameters:\n"
+      "    -t<num> : max # of threads TBB should use\n"
+      "    -o<num> : max # of threads OMP should use\n"
+      "\n  Help:\n"
+      "    -h : print this help message\n";
+
+  fprintf(stderr, "\n%s usage:\n", prog_name);
+  fputs(option_summary, stderr);
+}
+
+// Process command-line arguments.
+//
+//   -t<num>  store <num> in *max_tbb_t
+//   -o<num>  store <num> in *max_omp_t
+//   -h       print the usage text and exit(0)
+//
+// <num> must parse as an integer >= 1. A rejected value is reported on stderr
+// together with the usage text and the corresponding output is left unchanged,
+// so the caller never ends up with a zero or negative thread count.
+// Anything else on the command line is reported and ignored.
+void process_args(int argc, char *argv[], int *max_tbb_t, int *max_omp_t) {
+  for (int i=1; i<argc; ++i) {
+    if (argv[i][0] == '-') {
+      switch (argv[i][1]) {
+      case 't': { // set max_tbb_threads
+        // Parse into a temporary: writing through max_tbb_t directly would
+        // keep a value that is rejected just below.
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_tbb_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -t option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'o': { // set max_omp_threads
+        int value = 0;
+        if (sscanf(&argv[i][2], "%d", &value) == 1 && value >= 1) {
+          *max_omp_t = value;
+        } else {
+          fprintf(stderr, "%s Warning: argument of -o option unacceptable: %s\n", argv[0], &argv[i][2]);
+          help_message(argv[0]);
+        }
+        break;
+      }
+      case 'h': // print help message
+        help_message(argv[0]);
+        exit(0);
+        break;
+      default:
+        fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+        help_message(argv[0]);
+        break;
+      }
+    } else {
+      fprintf(stderr, "%s: Warning: command-line option ignored: %s\n", argv[0], argv[i]);
+      help_message(argv[0]);
+    }
+  }
+}
+
+// TBB task used by this benchmark.
+// The root (non-leaf) task spawns the leaf tasks and waits for them. Leaf 0 sleeps and
+// then opens an OpenMP parallel region; every other leaf only sleeps.
+// Every phase is reported to TotalThreadLevel.
+class SimpleTask : public task {
+  bool isLeaf;
+  int myId;
+public:
+  SimpleTask(bool isLeaf_, int myId_) : isLeaf(isLeaf_), myId(myId_) {}
+  task* execute() {
+    TotalThreadLevel.change_level(1, tbb_outer);
+    omp_set_num_threads(max_omp_threads);
+    if (isLeaf) {
+      if (myId != 0) {
+        sleep(6);
+      } else {
+        sleep(3);
+        // Give up this task's slot while the OpenMP team is active.
+        TotalThreadLevel.change_level(-1, tbb_outer);
+#pragma omp parallel
+        {
+          // Team thread 0 adds the team size first ...
+          if (omp_get_thread_num() == 0) {
+            TotalThreadLevel.change_level(omp_get_num_threads(), omp_inner);
+          }
+          printf("In OMP parallel region on TBB task with myId=0: thread %d of %d\n",
+                 omp_get_thread_num(), omp_get_num_threads());
+          // ... and removes it again after its own print.
+          if (omp_get_thread_num() == 0) {
+            TotalThreadLevel.change_level(-omp_get_num_threads(), omp_inner);
+          }
+        }
+        TotalThreadLevel.change_level(1, tbb_outer);
+      }
+    } else {
+      // 16 children plus one extra reference for wait_for_all().
+      set_ref_count(17);
+      for (int child=0; child<16; ++child) {
+        spawn(*new(allocate_child()) SimpleTask(true, child));
+      }
+      // Not counted as active while blocked on the children.
+      TotalThreadLevel.change_level(-1, tbb_outer);
+      wait_for_all();
+      TotalThreadLevel.change_level(1, tbb_outer);
+    }
+    TotalThreadLevel.change_level(-1, tbb_outer);
+    return NULL;
+  }
+};
+
+
+// Driver: runs one root SimpleTask under a TBB scheduler limited to max_tbb_threads,
+// prints the elapsed time and writes the thread-level log.
+int main(int argc, char *argv[]) {
+  TotalThreadLevel.init();
+  // The main thread counts as one tbb_outer thread except while it waits on the root task.
+  TotalThreadLevel.change_level(1, tbb_outer);
+  process_args(argc, argv, &max_tbb_threads, &max_omp_threads);
+
+  task_scheduler_init phase(max_tbb_threads);
+  const tick_count t_begin = tick_count::now();
+  SimpleTask& root = *new(task::allocate_root()) SimpleTask(false, -1);
+  TotalThreadLevel.change_level(-1, tbb_outer);
+  task::spawn_root_and_wait(root);
+  TotalThreadLevel.change_level(1, tbb_outer);
+  const tick_count t_end = tick_count::now();
+
+  const double elapsed_seconds = (t_end-t_begin).seconds();
+  printf("Simple Test of TBB (%d threads max) with OMP (%d threads max) inside took: %6.6f\n",
+         max_tbb_threads, max_omp_threads, elapsed_seconds);
+
+  TotalThreadLevel.change_level(-1, tbb_outer);
+  TotalThreadLevel.dump();
+  return 0;
+}
diff --git a/dep/tbb/src/rml/perfor/thread_level.h b/dep/tbb/src/rml/perfor/thread_level.h
new file mode 100644
index 000000000..a73afa81a
--- /dev/null
+++ b/dep/tbb/src/rml/perfor/thread_level.h
@@ -0,0 +1,140 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// Thread level recorder
+#ifndef __THREAD_LEVEL_H
+#define __THREAD_LEVEL_H
+#include <cstdio>
+#include <omp.h>
+#include "tbb/atomic.h"
+#include "tbb/tick_count.h"
+#include "../src/test/harness.h"
+
+//#define LOG_THREADS // use this to ifdef out calls to this class 
+
+using namespace tbb;
+
+// Selects which of the four thread counters a ThreadLevelRecorder::change_level call adjusts.
+typedef enum {tbb_outer, tbb_inner, omp_outer, omp_inner} client_t;
+
+// Keeps one counter per client_t value and logs a timestamped snapshot of all four
+// counters on every change_level call. dump() writes the log to a text file.
+class ThreadLevelRecorder {
+  // Current counts, one per client_t value; adjusted atomically by change_level.
+  tbb::atomic<int> tbb_outer_level;
+  tbb::atomic<int> tbb_inner_level;
+  tbb::atomic<int> omp_outer_level;
+  tbb::atomic<int> omp_inner_level;
+  // One log entry: the time of the change and the four counts as seen by that call.
+  struct record {
+    tbb::tick_count time;
+    int n_tbb_outer_thread;
+    int n_tbb_inner_thread;
+    int n_omp_outer_thread;
+    int n_omp_inner_thread;
+  };
+  // Index of the next log slot; incremented on every change_level call, even once the log is full.
+  tbb::atomic<unsigned> next;
+  /** Must be power of two */
+  static const unsigned max_record_count = 1<<20;
+  // Fixed-capacity log; calls beyond max_record_count are not recorded.
+  record array[max_record_count];
+  // Limit the summed counts are checked against; set by init().
+  int max_threads;
+  // Cleared by init(); change_level may set it when the limit is exceeded; reported by dump().
+  bool fail;
+ public:
+  void change_level(int delta, client_t whichClient);
+  void dump();
+  void init();
+};
+
+// Add delta to the counter selected by whichClient, append a snapshot of all four
+// counters to the log, and check the summed counts against max_threads.
+//
+// An unknown whichClient prints a warning and changes nothing.
+// When the total exceeds max_threads:
+//   - with NO_BAIL_OUT defined, the first violation is printed and execution continues;
+//   - otherwise the log is dumped and the ASSERT fires.
+// In both cases the recorder is marked as failed so that dump() reports it.
+void ThreadLevelRecorder::change_level(int delta, client_t whichClient) {
+  // Snapshot all counters, then replace the one being changed with its updated value.
+  int tox=tbb_outer_level, tix=tbb_inner_level, oox=omp_outer_level, oix=omp_inner_level;
+  if (whichClient == tbb_outer) {
+    tox = tbb_outer_level+=delta;
+  } else if (whichClient == tbb_inner) {
+    tix = tbb_inner_level+=delta;
+  } else if (whichClient == omp_outer) {
+    oox = omp_outer_level+=delta;
+  } else if (whichClient == omp_inner) {
+    oix = omp_inner_level+=delta;
+  } else {
+    printf("WARNING: Bad client type; ignoring.\n");
+    return;
+  }
+  // log non-negative entries
+  tbb::tick_count t = tbb::tick_count::now();
+  unsigned k = next++;
+  if (k<max_record_count) {
+    record& r = array[k];
+    r.time = t;
+    r.n_tbb_outer_thread = tox>=0?tox:0;
+    r.n_omp_outer_thread = oox>=0?oox:0;
+    r.n_tbb_inner_thread = tix>=0?tix:0;
+    r.n_omp_inner_thread = oix>=0?oix:0;
+  }
+  int tot_threads = tox+tix+oox+oix;
+  if (tot_threads > max_threads) {
+    // Build the diagnostic only when it is needed, and bound the write: six
+    // worst-case ints plus the fixed text do not fit in 100 bytes.
+    char errStr[128];
+    snprintf(errStr, sizeof(errStr),
+	     "ERROR: Number of threads (%d+%d+%d+%d=%d) in use exceeds maximum (%d).\n", 
+	     tox, tix, oox, oix, tot_threads, max_threads);
+#ifdef NO_BAIL_OUT
+    if (!fail) {
+      printf("%sContinuing...\n", errStr);
+      fail = true;
+    }
+#else
+    // Mark the failure first; otherwise dump() would print "INFO: Passed."
+    // immediately before the assertion fails.
+    fail = true;
+    dump();
+    ASSERT(tot_threads <= max_threads, errStr);
+#endif
+  }
+}
+
+// Write the recorded log to "time.txt" in the current directory and print the verdict.
+// Each output line holds the time in seconds relative to the first record, followed by the
+// tbb_outer, tbb_inner, omp_outer and omp_inner counts, separated by tabs.
+// Exits with status 1 if the file cannot be opened.
+void ThreadLevelRecorder::dump() {
+  FILE* f = fopen("time.txt","w");
+  if (!f) {
+    // perror appends ": <reason>" and a newline itself, so the prefix carries none.
+    perror("fopen(time.txt)");
+    exit(1);
+  }
+  unsigned limit = next;
+  if (limit>max_record_count) { // Clip
+    limit = max_record_count;
+  }
+  for (unsigned i=0; i<limit; ++i) {
+    fprintf(f,"%f\t%d\t%d\t%d\t%d\n",(array[i].time-array[0].time).seconds(), array[i].n_tbb_outer_thread,
+	    array[i].n_tbb_inner_thread, array[i].n_omp_outer_thread, array[i].n_omp_inner_thread);
+  }
+  fclose(f);
+  if (!fail) printf("INFO: Passed.\n");
+  else printf("INFO: Failed.\n");
+}
+
+// Clear the failure flag and take the thread limit from omp_get_max_threads().
+// The counters and the log index are not touched here.
+void ThreadLevelRecorder::init() {
+  max_threads = omp_get_max_threads();
+  fail = false;
+  printf("INFO: Getting maximum hardware threads... %d.\n", max_threads);
+}
+
+// The single recorder instance used by the programs that include this header.
+// NOTE(review): this is a definition, not a declaration, so the header can be included
+// from only one translation unit per program.
+ThreadLevelRecorder TotalThreadLevel;
+#endif
diff --git a/dep/tbb/src/rml/test/rml_omp_stub.cpp b/dep/tbb/src/rml/test/rml_omp_stub.cpp
new file mode 100644
index 000000000..d9d6ba4c1
--- /dev/null
+++ b/dep/tbb/src/rml/test/rml_omp_stub.cpp
@@ -0,0 +1,66 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+// This file is compiled with C++, but linked with a program written in C.
+// The intent is to find dependencies on the C++ run-time.
+
+#include <stdlib.h>
+#define RML_PURE_VIRTUAL_HANDLER abort
+
+#if _MSC_VER==1500 && !defined(__INTEL_COMPILER)
+// VS2008/VC9 seems to report warning C4100 (unreferenced formal parameter) for rml_omp.h; silence it around the include.
+#pragma warning( push )
+#pragma warning( disable: 4100 ) 
+#endif          
+#include "rml_omp.h"
+#if _MSC_VER==1500 && !defined(__INTEL_COMPILER)
+#pragma warning( pop )
+#endif
+
+// Global that receives the result of MyClient::version() in Cplusplus().
+rml::versioned_object::version_type Version;
+
+// Minimal omp_client implementation: every override returns a constant or does nothing.
+// It exists so that this file instantiates the client interface for the linkage test.
+class MyClient: public __kmp::rml::omp_client {
+public:
+    /*override*/rml::versioned_object::version_type version() const {
+        return 0;
+    }
+    /*override*/size_type max_job_count() const {
+        return 1024;
+    }
+    /*override*/size_t min_stack_size() const {
+        return 1<<20;
+    }
+    /*override*/rml::job* create_one_job() {
+        return NULL;
+    }
+    /*override*/void acknowledge_close_connection() {
+    }
+    /*override*/void cleanup(job&) {
+    }
+    /*override*/policy_type policy() const {
+        return throughput;
+    }
+    /*override*/void process( job&, void*, __kmp::rml::omp_client::size_type ) {
+    }
+};
+
+//! Never actually set, because point of test is to find linkage issues.
+//! Declaring the pointer is enough to make this file refer to the omp_server type.
+__kmp::rml::omp_server* MyServerPtr;
+
+// Entry point with C linkage: constructs a MyClient and stores the result of its
+// version() call in the global Version.
+extern "C" void Cplusplus() {
+    Version = MyClient().version();
+}
diff --git a/dep/tbb/src/rml/test/test_job_automaton.cpp b/dep/tbb/src/rml/test/test_job_automaton.cpp
new file mode 100644
index 000000000..29fd7928f
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_job_automaton.cpp
@@ -0,0 +1,154 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "job_automaton.h"
+#define HARNESS_NO_PARSE_COMMAND_LINE 1
+#include "harness.h"
+#include "harness_barrier.h"
+
+class State {   // Fixture for one trial: a single job_automaton exercised by an owner thread and a non-owner thread.
+    Harness::SpinBarrier barrier;   // constructed for 2 participants; exercise() waits on it first
+    rml::internal::job_automaton ja;   // object under test
+    rml::job job;   // the only job ever passed to ja.set_and_release
+    tbb::atomic<int> job_created;   // incremented by the owner just before set_and_release
+    tbb::atomic<int> job_destroyed;   // incremented by the thread whose try_plug yields a non-null job
+    tbb::atomic<bool> job_received;   // set by the non-owner after wait_for_job returns
+public:
+    State() : barrier(2) {
+        job_created = 0;
+        job_destroyed = 0;
+        job_received = false;
+    }
+    void exercise( bool is_owner );
+    ~State() {
+        ASSERT( job_created==job_destroyed, "accounting error" );   // creation and destruction counts must match
+        ASSERT( job_destroyed<=1, "destroyed job twice" );
+    }
+};
+
+int DelayMask;   // Bit k (k<N) makes Cover(k) yield; bit N selects the wait_for_job path in State::exercise.
+const int N = 14;   // Size of Coverage[]; Cover() asserts that its argument is below N.
+tbb::atomic<int> Coverage[N];   // Coverage[k] counts calls to Cover(k).
+
+//! Record a visit to interval k, then yield the thread if bit k of DelayMask is set.
+/** An interval is the stretch of code between two job_automaton operations under test. */
+void Cover( int k ) {
+    ASSERT( k<N, NULL );
+    ++Coverage[k];
+    const bool delay_requested = ((DelayMask>>k)&1)!=0;
+    // The yield introduces a delay (and possibly a thread context switch).
+    if( delay_requested )
+        __TBB_Yield();
+}
+
+void State::exercise( bool is_owner ) {   // One thread's half of a trial; the Cover(k) calls mark which interleavings were reached.
+    barrier.wait();   // both threads start the race together
+    if( is_owner ) {
+        Cover(0);
+        if( ja.try_acquire() ) {
+            Cover(1);
+            ++job_created; 
+            ja.set_and_release(job);   // publish the job
+            Cover(2);
+            if( ja.try_acquire() ) {   // second acquire/release round
+                Cover(3);
+                ja.release();
+                Cover(4);
+                if( ja.try_acquire() ) {   // third acquire/release round
+                    Cover(5);
+                    ja.release();
+                }
+            }
+            Cover(6);
+        } else {
+            Cover(7);   // first try_acquire failed
+        }
+        if( DelayMask&1<<N ) {   // parsed as DelayMask & (1<<N)
+            // Non-owner is running wait_for_job in this mode; hold off plugging until it has the job.
+            while( !job_received ) 
+                __TBB_Yield();
+        }
+    } else {
+        // Using extra bit of DelayMask for choosing whether to run wait_for_job or not.
+        if( DelayMask&1<<N ) {
+            rml::job* j= &ja.wait_for_job(); 
+ if( j!=&job ) printf("%p\n",j);
+            ASSERT( j==&job, NULL );
+            job_received = true;
+        }
+        Cover(8);
+    }   
+    rml::job* j;   // receives the job (or NULL) from try_plug
+    if( ja.try_plug(j) ) {
+        ASSERT( j==&job || !j, NULL );
+        if( j ) {
+            Cover(9+is_owner);   // interval 9 for the non-owner, 10 for the owner
+            ++job_destroyed;
+        } else {
+            __TBB_ASSERT( !is_owner, "owner failed to create job but plugged self" );
+            Cover(11);
+        } 
+    } else {
+        Cover(12+is_owner);   // interval 12 for the non-owner, 13 for the owner
+    }
+}
+
+class Loop: NoAssign {   // Functor for NativeParallelFor: index 0 runs State::exercise as owner, any other index as non-owner.
+    State& state;
+public:
+    Loop(State& target) : state(target) {}
+    void operator()( int thread_index ) const {state.exercise(thread_index==0);}
+};
+
+/** Check that every interval has been visited at least min_coverage times.
+    Returns true if so; when report==true, prints one warning per under-covered interval. */
+bool CheckCoverage( bool report ) {
+    const int min_coverage = 4;
+    int shortfalls = 0;
+    for( int interval=0; interval<N; ++interval ) {
+        if( Coverage[interval]>=min_coverage )
+            continue;
+        ++shortfalls;
+        if( report )
+            printf("Warning: Coverage[%d]=%d is less than acceptable minimum of %d\n", interval, int(Coverage[interval]),min_coverage);
+    }
+    return shortfalls==0;
+}
+
+int main() {
+    const int mask_limit = 8<<N;   // same bound as before: DelayMask runs over [0, 8<<N)
+    for( DelayMask=0; DelayMask<mask_limit; ++DelayMask ) {
+        State trial;
+        NativeParallelFor( 2, Loop(trial) );
+        // Stop trying further delay masks once an acceptable coverage level is reached.
+        if( CheckCoverage(false) )
+            break;
+    }
+    CheckCoverage(true);   // final pass prints a warning for any interval still under-covered
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/rml/test/test_rml_mixed.cpp b/dep/tbb/src/rml/test/test_rml_mixed.cpp
new file mode 100644
index 000000000..b70d914ac
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_rml_mixed.cpp
@@ -0,0 +1,247 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "rml_tbb.h"
+#include "rml_omp.h"
+#include "tbb/atomic.h"
+#include "tbb/tick_count.h"
+#include "harness.h"
+
+const int OMP_ParallelRegionSize = 16;   // TBBWork() requests OMP_ParallelRegionSize-1 threads per region
+int TBB_MaxThread = 4;           // Includes master; TBB_Client::max_job_count() returns this minus one
+int OMP_MaxThread = int(~0u>>1); // Includes master; OMP_Client::max_job_count() returns this minus one
+
+template<typename Client>
+class ClientBase: public Client {   // Overrides shared by TBB_Client and OMP_Client; max_job_count() and process() are left to them.
+protected:
+    typedef typename Client::version_type version_type;
+    typedef typename Client::job job;
+    typedef typename Client::policy_type policy_type;
+
+private:
+    /*override*/version_type version() const {
+        return 0;
+    }
+    /*override*/size_t min_stack_size() const {
+        return 1<<20;
+    }
+    /*override*/job* create_one_job() {
+        return new rml::job;   // released again in cleanup()
+    }
+    /*override*/policy_type policy() const {
+        return Client::turnaround;
+    }
+    /*override*/void acknowledge_close_connection() {
+        delete this;   // the client destroys itself here; RunTime never deletes it
+    }
+    /*override*/void cleanup( job& j ) {delete &j;}   // counterpart of create_one_job()
+};
+
+//! Represents a TBB or OpenMP run-time that uses RML.
+template<typename Factory, typename Client>
+class RunTime {
+public:
+    //! Factory that run-time uses to make servers.
+    Factory factory;
+    Client* client;   // not set by the constructor; assigned in create_connection()
+    typename Factory::server_type* server;   // not set by the constructor; filled in by create_connection(), NULLed by destroy_connection()
+    RunTime() {
+        factory.open();   // return status is not checked here
+    }
+    ~RunTime() {
+        factory.close();
+    }
+    //! Create server for this run-time
+    void create_connection();
+
+    //! Destroy server for this run-time
+    void destroy_connection();
+};
+
+class ThreadLevelRecorder {   // Log of (timestamp, thread level) samples, written out by dump().
+    tbb::atomic<int> level;   // running level; change_level() adds its delta to this
+    struct record {
+        tbb::tick_count time;   // when the sample was taken
+        int nthread;   // value of level right after the change
+    };
+    tbb::atomic<unsigned> next;   // incremented on every change_level() call, even once the array is full
+    /** Must be power of two */
+    static const unsigned max_record_count = 1<<20;
+    record array[max_record_count];
+public:
+    void change_level( int delta );
+    void dump();
+};
+
+//! Add delta to the thread level and log a (timestamp, new level) sample if the log has room.
+void ThreadLevelRecorder::change_level( int delta ) {
+    const int new_level = (level += delta);
+    const tbb::tick_count stamp = tbb::tick_count::now();
+    const unsigned slot = next++;
+    if( slot>=max_record_count )
+        return;   // log is full: the sample is dropped, but next has still been advanced
+    array[slot].time = stamp;
+    array[slot].nthread = new_level;
+}
+
+void ThreadLevelRecorder::dump() {   // Write the log to time.txt, one "<seconds since first sample>\t<thread level>" line per sample; exits with status 1 if the file cannot be opened.
+    FILE* f = fopen("time.txt","w");
+    if( !f ) {
+        perror("fopen(time.txt)\n");
+        exit(1);
+    }
+    unsigned limit = next;
+    if( limit>max_record_count ) {
+        // change_level() keeps advancing next after the array fills, so clip to the array size.
+        limit = max_record_count;
+    }
+    for( unsigned i=0; i<limit; ++i ) {
+        fprintf(f,"%f\t%d\n",(array[i].time-array[0].time).seconds(),array[i].nthread);
+    }
+    fclose(f);
+}
+
+ThreadLevelRecorder TotalThreadLevel;   // Updated by both clients' process() and by TBBOutSideOpenMPInside(); dumped at the end of main().
+
+class TBB_Client: public ClientBase<tbb::internal::rml::tbb_client> {   // Client for the TBB side of the mixed test.
+    /*override*/void process( job& j );   // defined after TBBWork()
+    /*override*/size_type max_job_count() const {
+        return TBB_MaxThread-1;   // TBB_MaxThread includes the master
+    }
+};
+
+class OMP_Client: public ClientBase<__kmp::rml::omp_client> {   // Client for the OpenMP side of the mixed test.
+    /*override*/void process( job&, void* cookie, omp_client::size_type );   // defined after TBBWork()
+    /*override*/size_type max_job_count() const {
+        return OMP_MaxThread-1;   // OMP_MaxThread includes the master
+    }
+};
+
+RunTime<tbb::internal::rml::tbb_factory, TBB_Client> TBB_RunTime;   // its factory is opened by the RunTime constructor
+RunTime<__kmp::rml::omp_factory, OMP_Client> OMP_RunTime;   // its factory is opened by the RunTime constructor
+
+template<typename Factory, typename Client>
+void RunTime<Factory,Client>::create_connection() {   // Allocate a client and ask the factory for a server bound to it.
+    client = new Client;   // ClientBase deletes itself in acknowledge_close_connection()
+    typename Factory::status_type status = factory.make_server( server, *client );
+    ASSERT( status==Factory::st_success, NULL );
+}
+
+template<typename Factory, typename Client>
+void RunTime<Factory,Client>::destroy_connection() {   // Ask the server to close the connection and forget the pointer.
+    server->request_close_connection();
+    server = NULL;   // client is left as-is
+}
+
+class OMP_Team {   // Passed as the cookie to get_threads(); OMP_Client::process() casts the cookie back to OMP_Team*.
+public:
+    OMP_Team( __kmp::rml::omp_server& ) {}   // the server argument is ignored
+    tbb::atomic<unsigned> barrier;   // incremented by each worker in OMP_Client::process(); polled by the master in TBBWork()
+};
+
+tbb::atomic<int> AvailWork;   // set to the task count by TBBOutSideOpenMPInside(); decremented by TBBWork()
+tbb::atomic<int> CompletionCount;   // incremented by TBBWork(); TBBOutSideOpenMPInside() waits for tbbtasks+1
+ 
+void OMPWork() {
+    // Busy loop standing in for real work: count an atomic from 0 up to 2000000.
+    tbb::atomic<int> ticks;
+    ticks = 0;
+    while( ticks<2000000 ) ++ticks;
+}
+
+void TBBWork() {   // One unit of TBB-level work; called from TBB_Client::process() and from the master's loop.
+    if( AvailWork<0 )
+        return;   // counter already below zero: nothing left to claim
+    const int ticket = --AvailWork;
+    if( ticket==-1 ) {
+        // The thread that takes the counter to -1 retracts the job-count estimate.
+        TBB_RunTime.server->adjust_job_count_estimate(-(TBB_MaxThread-1));
+        ++CompletionCount;
+        return;
+    }
+    if( ticket<0 )
+        return;   // another thread already took the counter to -1
+    // Claimed a task: run four OpenMP-style parallel regions back to back.
+    for( int region=0; region<4; ++region ) {
+        OMP_Team team( *OMP_RunTime.server );
+        const int n_workers = OMP_RunTime.server->try_increase_load( OMP_ParallelRegionSize-1, /*strict=*/false );
+        team.barrier = 0;
+        ::rml::job* workers[OMP_ParallelRegionSize-1];
+        if( n_workers>0 )
+            OMP_RunTime.server->get_threads( n_workers, &team, workers );
+        OMPWork();   // master does work inside the parallel region too
+        while( n_workers>0 && team.barrier!=unsigned(n_workers) )   // master waits for workers to finish
+            __TBB_Yield();
+    }
+    ++CompletionCount;
+}
+
+/*override*/void TBB_Client::process( job& ) {   // Worker entry point: one TBBWork() call bracketed by thread-level bookkeeping.
+    TotalThreadLevel.change_level(1);
+    TBBWork();
+    TotalThreadLevel.change_level(-1);
+}
+
+/*override*/void OMP_Client::process( job& /* j */, void* cookie, omp_client::size_type ) {   // Worker entry point; cookie is cast to OMP_Team* below.
+    TotalThreadLevel.change_level(1);
+    ASSERT( OMP_RunTime.server, NULL );   // the connection must still be open
+    OMPWork();
+    ASSERT( OMP_RunTime.server, NULL );
+    static_cast<OMP_Team*>(cookie)->barrier+=1;   // signals the master polling team.barrier in TBBWork()
+    TotalThreadLevel.change_level(-1);
+}
+
+void TBBOutSideOpenMPInside() {   // Master thread: publish 32 tasks, raise the TBB job-count estimate, and help until all work is done.
+    const int task_count = 32;
+    // TBBWork() bumps CompletionCount once per task and once more when the counter reaches -1.
+    const int expected_completions = task_count+1;
+    TotalThreadLevel.change_level(1);
+    CompletionCount = 0;
+    AvailWork = task_count;
+    TBB_RunTime.server->adjust_job_count_estimate(TBB_MaxThread-1);
+    while( CompletionCount!=expected_completions ) TBBWork();
+    TotalThreadLevel.change_level(-1);
+}
+
+int main( int argc, char* argv[] ) {   // Run the mixed TBB/OpenMP scenario once per thread count, then dump the thread-level log.
+    // Defaults; ParseCommandLine is given the chance to override them.
+    MinThread = 4;
+    MaxThread = 4;
+    ParseCommandLine(argc,argv);
+    for( TBB_MaxThread=MinThread; TBB_MaxThread<=MaxThread; ++TBB_MaxThread ) {   // iterate the global itself: a local of the same name would hide it from the clients
+        if( Verbose ) printf("Testing with TBB_MaxThread=%d\n", TBB_MaxThread);
+        TBB_RunTime.create_connection();
+        OMP_RunTime.create_connection();
+        TBBOutSideOpenMPInside();
+        OMP_RunTime.destroy_connection();   // closed in reverse order of creation
+        TBB_RunTime.destroy_connection();
+    }
+    TotalThreadLevel.dump();   // writes time.txt
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/rml/test/test_rml_omp.cpp b/dep/tbb/src/rml/test/test_rml_omp.cpp
new file mode 100644
index 000000000..fedf851aa
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_rml_omp.cpp
@@ -0,0 +1,173 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "rml_omp.h"
+#include "test_server.h"
+#include "tbb/tbb_misc.h"
+
+typedef __kmp::rml::omp_server MyServer;   // server type used throughout this test
+typedef __kmp::rml::omp_factory MyFactory;   // factory type handed to VerifyInitialization/SimpleTest
+
+static bool StrictTeam;   // passed to try_increase_load() and returned by MyClient::is_strict(); main() runs with true, then false
+
+class MyTeam {   // Per-team bookkeeping; its address is the cookie given to get_threads() and received by MyClient::process().
+    MyTeam& operator=( const MyTeam& ) ;   // declared but not defined here: assignment is disabled
+public:
+    struct info_type {
+        rml::job* job;   // job seen by process() for this index
+        bool ran;   // set once process() has run for this index
+        info_type() : job(NULL), ran(false) {}
+    };
+    MyTeam( MyServer& /* server */, size_t max_thread_ ) :
+        max_thread(max_thread_)
+    {
+        self_ptr = this;
+        info = new info_type[max_thread];
+    }
+    ~MyTeam() {
+        delete[] info;
+    }
+    const size_t max_thread;   // number of entries in info
+    size_t n_thread;   // not set by the constructor; FireUpJobs stores the delivered count here
+    tbb::atomic<int> barrier;   // incremented at the end of MyClient::process(); not set by the constructor
+    /** Indexed with 1-origin index. NOTE(review): FireUpJobs scans info[] from 0 and the array has max_thread entries - confirm the origin. */
+    info_type* info;
+    int iteration;   // not set by the constructor
+    MyTeam* self_ptr;   // always this; process() checks it to detect a trashed cookie
+};
+
+class MyClient: public ClientBase<__kmp::rml::omp_client> {   // OpenMP-flavoured test client.
+public:
+    MyServer* server;   // assigned by FireUpJobs
+    /*override*/void process( job& j, void* cookie, size_type index ) {
+        MyTeam& t = *static_cast<MyTeam*>(cookie);   // cookie is the &team passed to get_threads()
+        ASSERT( t.self_ptr==&t, "trashed cookie" ); 
+        ASSERT( index<t.max_thread, NULL ); 
+        ASSERT( !t.info[index].ran, "duplicate index?" ); 
+        t.info[index].job = &j;
+        t.info[index].ran = true;
+        do_process(j);
+        if( index==1 && nesting.level<nesting.limit ) {
+            // The thread with index 1 opens a nested connection one level deeper.
+            DoOneConnection<MyFactory,MyClient> doc(MaxThread,Nesting(nesting.level+1,nesting.limit),0,false);
+            doc(0);
+        }
+        ++t.barrier;   // FireUpJobs waits until barrier equals the delivered thread count
+    }
+    static const bool is_omp = true;
+    bool is_strict() const {return StrictTeam;}
+};
+
+void FireUpJobs( MyServer& server, MyClient& client, int max_thread, int n_extra, Checker* checker ) {   // Run four team launches with varying sizes; checker may be NULL.
+    ASSERT( max_thread>=0, NULL );
+    client.server = &server;
+    MyTeam team(server,size_t(max_thread));
+    MyServer::size_type n_thread = 0;
+    for( int iteration=0; iteration<4; ++iteration ) {
+        for( size_t i=0; i<team.max_thread; ++i ) 
+            team.info[i].ran = false;   // reset per-index run flags for this iteration
+        switch( iteration ) {
+            default:
+                n_thread = int(max_thread);
+                break;
+            case 1:
+                // No change in number of threads
+                break;
+            case 2:
+                // Decrease number of threads.  
+                n_thread = int(max_thread)/2;
+                break;
+            // Case 3 is same code as the default, but has effect of increasing the number of threads.
+        }
+        team.barrier = 0;
+        if( Verbose ) {
+            printf("client %d: server.run with n_thread=%d\n", client.client_id(), int(n_thread) );
+        }
+        server.independent_thread_number_changed( n_extra );
+        if( checker ) {
+            // Give RML time to respond to change in number of threads.
+            MilliSleep(1);
+        }
+        int n_delivered = server.try_increase_load( n_thread, StrictTeam );
+        team.n_thread = n_delivered;
+        ::rml::job* job_array[JobArraySize];
+        job_array[n_delivered] = (::rml::job*)intptr_t(-1);   // sentinel just past the entries get_threads may write
+        server.get_threads( n_delivered, &team, job_array );
+        __TBB_ASSERT( job_array[n_delivered]== (::rml::job*)intptr_t(-1), NULL );   // sentinel must be untouched
+        for( int i=0; i<n_delivered; ++i ) {
+            MyJob* j = static_cast<MyJob*>(job_array[i]);
+            int s = j->state;
+            ASSERT( s==MyJob::idle||s==MyJob::busy, NULL );
+        }
+        server.independent_thread_number_changed( -n_extra );   // undo the earlier +n_extra
+        if( Verbose ) {
+            printf("client %d: team size is %d\n", client.client_id(), n_delivered);
+        }
+        if( checker ) {
+            checker->check_number_of_threads_delivered( n_delivered, n_thread, n_extra );
+        }      
+        // Protocol requires that master wait until workers have called "done_processing"
+        while( team.barrier!=n_delivered ) {   // each worker increments barrier at the end of MyClient::process()
+            ASSERT( team.barrier>=0, NULL );
+            ASSERT( team.barrier<=n_delivered, NULL );
+            __TBB_Yield();
+        }
+        if( Verbose ) {
+            printf("client %d: team completed\n", client.client_id() );
+        }
+        for( int i=0; i<n_delivered; ++i ) {
+            ASSERT( team.info[i].ran, "thread on team allegedly delivered, but did not run?" );
+        }
+        for( MyServer::size_type i=n_delivered; i<MyServer::size_type(max_thread); ++i ) {
+            ASSERT( !team.info[i].ran, "thread on team ran with illegal index" );
+        }
+        ASSERT( !StrictTeam || n_delivered==int(n_thread), "server failed to satisfy strict request" );
+    }
+}
+
+void DoClientSpecificVerification( MyServer& server, int /*n_thread*/ )   // OpenMP-specific check; the thread-count argument is unused.
+{
+    ASSERT( server.current_balance()==int(tbb::internal::DetectNumberOfWorkers())-1, NULL );   // balance must equal the detected worker count minus one
+}
+
+int main( int argc, char* argv[] ) {
+    // Defaults, set before the command line is parsed.
+    MinThread = 0;
+    MaxThread = 4;
+    ParseCommandLine(argc,argv);
+
+    // Run the whole suite twice: with strict teams first, then non-strict.
+    const bool strict_modes[] = { true, false };
+    for( int pass=0; pass<2; ++pass ) {
+        StrictTeam = strict_modes[pass];
+        VerifyInitialization<MyFactory,MyClient>( MaxThread );
+        SimpleTest<MyFactory,MyClient>();
+    }
+
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/rml/test/test_rml_omp_c_linkage.c b/dep/tbb/src/rml/test/test_rml_omp_c_linkage.c
new file mode 100644
index 000000000..e94790fdd
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_rml_omp_c_linkage.c
@@ -0,0 +1,37 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include <stdio.h>
+
+void Cplusplus(void);   /* Defined with C linkage in the companion C++ source file. */
+
+int main(void) {   /* Calls into the C++ object so that link-time dependencies on the C++ run-time surface. */
+    Cplusplus();
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/rml/test/test_rml_tbb.cpp b/dep/tbb/src/rml/test/test_rml_tbb.cpp
new file mode 100644
index 000000000..a3cd666ba
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_rml_tbb.cpp
@@ -0,0 +1,122 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "rml_tbb.h"
+#include "test_server.h"
+
+typedef tbb::internal::rml::tbb_server MyServer;   // server type used throughout this test
+typedef tbb::internal::rml::tbb_factory MyFactory;   // factory type handed to VerifyInitialization/SimpleTest
+
+class MyClient: public ClientBase<tbb::internal::rml::tbb_client> {   // TBB-flavoured test client.
+    tbb::atomic<int> counter;   // set to 1 by the constructor; not otherwise used in this class
+    /*override*/void process( job& j ) {
+        do_process(j);   // forwards to the ClientBase helper
+    }
+public:
+    MyClient() {counter=1;}
+    static const bool is_omp = false;
+    bool is_strict() const {return false;}
+};
+
+void FireUpJobs( MyServer& server, MyClient& client, int n_thread, int n_extra, Checker* checker ) {
+    // One round of TBB-style job processing; with a checker, also count the jobs picked up and report that count to it.
+    if( Verbose ) {
+        printf("client %d: calling adjust_job_count_estimate(%d)\n", client.client_id(),n_thread);
+    }
+    // independent_thread_number_changed is exercised even when n_extra is zero.
+    server.independent_thread_number_changed( n_extra );
+    // When oversubscribed, the main thread pauses longer (found experimentally) so that the
+    // RML worker threads get to do some work.
+    const int pause_ms = n_thread>int(server.default_concurrency()) ? 50 : 1;
+    if( checker ) {
+        MilliSleep(pause_ms);   // give RML time to respond to the change in number of threads
+        for( int i=0; i<n_thread; ++i )
+            client.job_array[i].processing_count = 0;
+    }
+    server.adjust_job_count_estimate( n_thread );
+    int n_observed = 0;   // jobs with a nonzero processing_count after the pause; stays 0 without a checker
+    if( checker ) {
+        MilliSleep(pause_ms);
+        for( int i=0; i<n_thread; ++i )
+            if( client.job_array[i].processing_count )
+                ++n_observed;
+    }
+    // The wait below presumes that jobs never starve, so undo the earlier call to
+    // independent_thread_number_changed before waiting on those jobs.
+    server.independent_thread_number_changed( -n_extra );
+    if( Verbose ) {
+        printf("client %d: wait for each job to be processed at least once\n",client.client_id());
+    }
+    const bool top_level = client.nesting.level==0;
+    // Typically n_thread jobs are expected to get threads; when nested, one fewer, since this thread cannot process the job itself.
+    const int expected = top_level ? n_thread : n_thread-1;
+    if( !top_level ) {
+        printf("testing of nested tbb execution is yet to be supported\n");
+    } else {
+        for( bool satisfied=false; !satisfied; ) {
+            int n_processed = 0;
+            for( int i=0; i<n_thread; ++i )
+                if( client.job_array[i].processing_count!=0 )
+                    ++n_processed;
+            satisfied = n_processed>=expected;
+            if( !satisfied ) server.yield();
+        }
+    }
+    server.adjust_job_count_estimate(-n_thread);
+    if( checker )
+        checker->check_number_of_threads_delivered( n_observed, n_thread, n_extra );
+}
+
+void DoClientSpecificVerification( MyServer&, int n_thread )   // TBB-specific check: a second make_server must report st_connection_exists.
+{
+    MyClient* client = new MyClient;
+    client->initialize( n_thread, Nesting(), ClientStackSize[0] );
+    MyFactory factory;
+    memset( &factory, 0, sizeof(factory) );   // NOTE(review): zero-fills a class object byte-wise; confirm MyFactory tolerates this
+    MyFactory::status_type status = factory.open();
+    ASSERT( status!=MyFactory::st_not_found, "could not find RML library" );
+    ASSERT( status!=MyFactory::st_incompatible, NULL );
+    ASSERT( status==MyFactory::st_success, NULL );
+    MyFactory::server_type* server;   // left unset on purpose: make_server is expected to fail below
+    status = factory.make_server( server, *client );
+    ASSERT( status==MyFactory::st_connection_exists, "Did the first connection get lost?" );
+    factory.close();
+    client->update(MyClient::destroyed, MyClient::live);
+    delete client;
+}
+
+int main( int argc, char* argv[] ) {   // Runs the initialization check and the shared SimpleTest with the TBB factory/client pair.
+    // Set defaults
+    MinThread = 0;
+    MaxThread = 4;
+    ParseCommandLine(argc,argv);
+    VerifyInitialization<MyFactory,MyClient>( MaxThread );
+    SimpleTest<MyFactory,MyClient>();
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/rml/test/test_server.h b/dep/tbb/src/rml/test/test_server.h
new file mode 100644
index 000000000..65e07af9c
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_server.h
@@ -0,0 +1,398 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+/* This header contains code shared by test_omp_server.cpp and test_tbb_server.cpp 
+   There is no ifndef guard - test is supposed to include this file exactly once.
+   The test is also expected to have #include of rml_omp.h or rml_tbb.h before 
+   including this header. 
+
+   This header should not use any parts of TBB that require linking in the TBB run-time. 
+   It uses a few instances of tbb::atomic<T>, all of which are completely inlined. */
+
+#include "tbb/atomic.h"
+#include "tbb/tbb_thread.h"
+#include "harness.h"
+#include "harness_memory.h"
+
+//! Define TRIVIAL as 1 to test only a single client, no nesting, no extra threads.
+#define TRIVIAL 0
+
+//! Maximum number of clients; also the number of entries in ClientStackSize.
+#if TRIVIAL 
+const size_t MaxClient = 1;
+#else
+const size_t MaxClient = 4;
+#endif
+//! Stack size handed to initialize() for the ith client.
+const size_t ClientStackSize[MaxClient] = {
+    1000000
+#if !TRIVIAL
+   ,2000000
+   ,1000000
+   ,4000000
+#endif /* !TRIVIAL */
+};
+//! Portion of a client's stack size that do_process does not pass on to UseStackSpace.
+const size_t OverheadStackSize = 500000;
+//! Number of MyJob slots that initialize() allocates for each client.
+const size_t JobArraySize = 1000;
+
+#if _WIN32||_WIN64
+#include <Windows.h> /* Need Sleep */
+#else
+#include <unistd.h>  /* Need usleep */   
+#endif
+
+void MilliSleep( unsigned milliseconds ) {
+#if !(_WIN32||_WIN64)
+    usleep( milliseconds*1000 );    // usleep counts in microseconds
+#else
+    Sleep( milliseconds );          // Sleep counts in milliseconds
+#endif /* !(_WIN32||_WIN64) */
+}
+
+class MyJob: public ::rml::job {
+public:
+    //! Stages of a job's life as tracked by the test.
+    enum state_t {
+        unallocated,    //!< Slot has not been handed out yet.
+        idle,           //!< Allocated, with no thread working on it.
+        busy,           //!< A thread is currently working on it.
+        clean           //!< client::cleanup has been called for it.
+    };
+    //! Current state_t value.
+    tbb::atomic<int> state;
+    //! Counter that starts at zero; bumped by whoever processes the job.
+    volatile int processing_count;
+    MyJob() : processing_count(0) {
+        state=unallocated;
+    }
+    ~MyJob() {
+        // Poison the whole object so that accidental use after destruction can be detected.
+        memset(this,-1,sizeof(*this));
+    }
+
+    //! Move from old_state to new_state; being in any other state is an illegal transition.
+    void update( state_t new_state, state_t old_state ) {
+        int o = state.compare_and_swap(new_state,old_state);
+        ASSERT( o==old_state, "illegal transition" );
+    }
+    //! Move to new_state from whichever of the two permitted old states is current.
+    void update_from_either( state_t new_state, state_t old_state1, state_t old_state2 ) {
+        for(;;) {
+            int snapshot = state;
+            ASSERT( snapshot==old_state1||snapshot==old_state2, "illegal transition" );
+            if( state.compare_and_swap(new_state,snapshot)==snapshot ) break;
+        }
+    }
+};
+
+static tbb::atomic<int> ClientConstructions;    //!< Incremented by every ClientBase constructor.
+static tbb::atomic<int> ClientDestructions;     //!< Incremented by every ClientBase destructor.
+
+struct Nesting {    // Nesting parameters stored by ClientBase::initialize.
+    int level;      // NOTE(review): level 0 appears to mean "not nested" - confirm against the callers.
+    int limit;      // NOTE(review): meaning is not evident from this header - confirm against the callers.
+    Nesting() : level(0), limit(0) {}
+    Nesting( int level_, int limit_ ) : level(level_), limit(limit_) {}
+};
+
+template<typename Client>
+class ClientBase: public Client {   // Test-side client: implements the Client callbacks and checks state transitions.
+protected:
+    typedef typename Client::size_type size_type;
+    typedef typename Client::version_type version_type;
+    typedef typename Client::policy_type policy_type;
+    typedef typename Client::job job;
+private:
+    size_type my_max_job_count;         //!< Returned by max_job_count(); set by initialize().
+    size_t my_stack_size;               //!< Returned by min_stack_size(); set by initialize().
+    tbb::atomic<size_t> next_job_index; //!< Index of the next job_array slot create_one_job() hands out.
+    int my_client_id;                   //!< Value of ClientConstructions when this client was constructed.
+    rml::server* my_server;             //!< Server registered through set_server(); NULL until then.
+
+public:
+    enum state_t {
+        //! Treat *this as constructed.
+        live=0x1,
+        //! Treat *this as destroyed.
+        destroyed=0xDEAD
+    };
+
+    tbb::atomic<int> state;
+    void update( state_t new_state, state_t old_state ) {   // old_state -> new_state; asserts if in any other state
+        int o = state.compare_and_swap(new_state,old_state);
+        ASSERT( o==old_state, NULL );
+    }
+    //! Must be true by the time acknowledge_close_connection() is called.
+    tbb::atomic<bool> expect_close_connection;
+    //! Array of JobArraySize jobs allocated by initialize(); NULL whenever no array is owned.
+    MyJob *job_array;
+
+    /*override*/version_type version() const {
+        ASSERT( state==live, NULL );
+        return 1;
+    }
+
+    /*override*/size_type max_job_count() const {
+        ASSERT( state==live, NULL );
+        return my_max_job_count;
+    }
+
+    /*override*/size_t min_stack_size() const {
+        ASSERT( state==live, NULL );
+        return my_stack_size;
+    }
+
+    /*override*/policy_type policy() const {return Client::throughput;} 
+    //! Checks that every job handed out is clean, releases job_array, then deletes this client.
+    /*override*/void acknowledge_close_connection() {
+        ASSERT( expect_close_connection, NULL );
+        for( size_t k=next_job_index; k>0; ) {
+            --k;
+            ASSERT( job_array[k].state==MyJob::clean, NULL );
+        }
+        delete[] job_array;
+        job_array = NULL;
+        ASSERT( my_server, NULL );
+        update( destroyed, live );
+        delete this; 
+    }
+    //! Moves job j_ from idle to clean.
+    /*override*/void cleanup( job& j_ ) {
+        if( Verbose ) 
+            printf("client %d: cleanup(%p) called\n",client_id(),&j_);
+        ASSERT( state==live, NULL );
+        MyJob& j = static_cast<MyJob&>(j_);
+        j.update(MyJob::clean,MyJob::idle);
+        if( Verbose ) 
+            printf("client %d: cleanup(%p) returns\n",client_id(),&j_);
+    }
+    //! Hands out the next unused slot of job_array as an idle job.
+    job* create_one_job();
+
+protected:
+    void do_process( job& j_ ) {    // One processing pass on j_: idle -> busy, use the stack, busy -> idle, yield.
+        ASSERT( state==live, NULL );
+        MyJob& j = static_cast<MyJob&>(j_);
+        j.update(MyJob::busy,MyJob::idle);
+        ++j.processing_count;
+        ASSERT( my_stack_size>OverheadStackSize, NULL ); 
+#ifdef __ia64__
+        // Half of the stack is reserved for RSE, so test only remaining half.
+        UseStackSpace( (my_stack_size-OverheadStackSize)/2 );
+#else
+        UseStackSpace( my_stack_size-OverheadStackSize );
+#endif 
+        j.update(MyJob::idle,MyJob::busy);
+        my_server->yield();
+    } 
+public:
+    ClientBase() : my_server(NULL), job_array(NULL) {   // job_array starts NULL so the destructor can always release it
+        my_client_id = ClientConstructions++;
+        next_job_index = 0; 
+    }
+    int client_id() const {return my_client_id;}
+
+    Nesting nesting;
+
+    void initialize( size_type max_job_count, Nesting nesting_, size_t stack_size ) {   // stores limits, allocates job_array, goes live
+        ASSERT( stack_size>0, NULL );
+        my_max_job_count = max_job_count;
+        nesting = nesting_;
+        my_stack_size = stack_size;
+        job_array = new MyJob[JobArraySize];
+        expect_close_connection = false;
+        state = live;
+    }
+
+    void set_server( rml::server* s ) {my_server=s;}
+
+    virtual ~ClientBase() {
+        ASSERT( state==destroyed, NULL );
+        delete[] job_array;     // frees the array of a client deleted directly; no-op when job_array is already NULL
+        ++ClientDestructions;
+    }
+};
+
+template<typename Client>
+typename Client::job* ClientBase<Client>::create_one_job() {
+    if( Verbose ) {
+        printf("client %d: create_one_job() called\n",client_id());
+    }
+    // Claim the next unused slot of job_array.
+    const size_t k = next_job_index++;
+    ASSERT( state==live, NULL );
+    // The first bound on k below relies on the implementation keeping every job alive until the connection
+    // is closed; if jobs are ever destroyed sooner, the test logic in this header will have to be reworked.
+    ASSERT( k<my_max_job_count, "RML allocated more than max_job_count jobs simultaneously" );
+    ASSERT( k<JobArraySize, "JobArraySize not big enough (problem is in test, not RML)" );
+    MyJob* const slot = job_array+k;
+    slot->update(MyJob::idle,MyJob::unallocated);
+    if( Verbose ) printf("client %d: create_one_job() for k=%d returns %p\n",client_id(),int(k),slot);
+    return slot;
+}
+
+class Checker {    // Compares delivered thread counts against the server's default concurrency.
+public:
+    Checker( rml::server& server ) : default_concurrency(static_cast<int>(server.default_concurrency())) {}
+    void check_number_of_threads_delivered( int n_delivered, int n_requested, int n_extra ) const;
+    int default_concurrency;    //!< server.default_concurrency() as sampled by the constructor
+};
+
+void Checker::check_number_of_threads_delivered( int n_delivered, int n_requested, int n_extra ) const {
+    // Prints a warning, and nothing more, when the delivered count differs from the estimate.
+    ASSERT( default_concurrency>=0, NULL );
+    // Start from the default concurrency, less any threads pretended to exist outside the RML (never below zero).
+    int n_avail = n_extra>0 ? default_concurrency-n_extra : default_concurrency;
+    if( n_avail<0 ) n_avail = 0;
+    // If the client asked for more threads than the hardware provides, the difference becomes private threads
+    // that are available regardless of what else is running.
+    const int n_private = n_requested>default_concurrency ? n_requested-default_concurrency : 0;
+    n_avail += n_private;
+    // Ideally the server delivers everything requested, up to what is available.
+    const int n_expected = n_requested<=n_avail ? n_requested : n_avail;
+
+    // Report the first discrepancy that applies; an exact match is silent.
+    const char* msg;
+    if( n_delivered>n_avail ) {
+        msg = "server delivered more threads than were theoretically available";
+    } else if( n_delivered>n_expected ) {
+        msg = "server delivered more threads than expected";
+    } else if( n_delivered<n_expected ) {
+        msg = "server delivered fewer threads than ideal";
+    } else {
+        return;
+    }
+    printf("Warning: %s (n_delivered=%d n_avail=%d n_requested=%d n_extra=%d default_concurrency=%d)\n",
+           msg, n_delivered, n_avail, n_requested, n_extra, default_concurrency );
+}
+
+template<typename Factory,typename Client>
+class DoOneConnection: NoAssign {
+public:
+    //! Record the parameters used by each connection this functor opens.
+    DoOneConnection( int n_thread_, Nesting nesting_, int n_extra_, bool check_delivered_ )
+        : n_thread(n_thread_), nesting(nesting_), n_extra(n_extra_), check_delivered(check_delivered_)
+    {}
+
+    //! Run the test for the ith connection.
+    void operator()( size_t i ) const;
+
+private:
+    //! How many threads to request.
+    const int n_thread;
+    //! Nesting settings passed on to the client.
+    const Nesting nesting;
+    //! How many extra threads are pretended to exist outside the RML.
+    const int n_extra;
+
+    //! Whether the number of threads actually delivered gets checked.
+    const bool check_delivered;
+};
+
+template<typename Factory,typename Client>
+void DoOneConnection<Factory,Client>::operator()( size_t i ) const {
+    // Opens a factory and a server for a fresh client, fires up jobs, then requests that the connection be closed.
+    ASSERT( i<MaxClient, NULL );
+    Client* client = new Client;
+    client->initialize( Client::is_omp ? JobArraySize : n_thread, nesting, ClientStackSize[i] );
+    Factory factory;
+    memset( &factory, 0, sizeof(factory) );
+    typename Factory::status_type status = factory.open();
+    ASSERT( status==Factory::st_success, "could not open RML factory" );
+    typename Factory::server_type* server = NULL;   // stays NULL unless make_server sets it
+    status = factory.make_server( server, *client );
+    ASSERT( status==Factory::st_success, "make_server failed" );
+    ASSERT( server, NULL );
+    if( Verbose ) 
+        printf("client %d: opened server n_thread=%d nesting=(%d,%d)\n",
+               client->client_id(), n_thread, nesting.level, nesting.limit);
+    client->set_server( server );
+    Checker checker( *server );
+    FireUpJobs( *server, *client, n_thread, n_extra, check_delivered && !client->is_strict() ? &checker : NULL );
+    // Close the connection
+    client->expect_close_connection = true;
+    if( Verbose )
+        printf("client %d: calling request_close_connection\n", client->client_id());
+    server->request_close_connection();
+    // Client deletes itself when it sees call to acknowledge_close_connection from server.
+    factory.close();
+}
+
+//! Run one connection for each thread count from MinThread through MaxThread.
+template<typename Factory, typename Client>
+void SimpleTest() {
+    for( int n_thread=MinThread; n_thread<=MaxThread; ++n_thread ) {
+        // Connection 0 with Nesting(0,1), no extra threads, delivered-thread check disabled.
+        DoOneConnection<Factory,Client>(n_thread,Nesting(0,1),0,false)(0);
+    }
+    // Let RML catch up: wait until the construction and destruction counters agree.
+    for(;;) {
+        if( ClientConstructions==ClientDestructions ) break;
+        MilliSleep(1);
+    }
+}
+
+static void check_server_info( void* arg, const char* server_info )
+{   // Callback given to call_with_server_info: arg is a C string that must occur within server_info.
+    ASSERT( strstr(server_info, (char*)arg), NULL );
+}
+
+template<typename Factory, typename Client>
+void VerifyInitialization( int n_thread ) {
+    // Step 1: a live client.  It deletes itself when it sees acknowledge_close_connection from the server.
+    Client& client = *new Client;
+    client.initialize( Client::is_omp ? JobArraySize : n_thread, Nesting(), ClientStackSize[0] );
+    // Step 2: the factory must open cleanly and report the expected server info string.
+    Factory factory;
+    memset( &factory, 0, sizeof(factory) );
+    typename Factory::status_type status = factory.open();
+    ASSERT( status!=Factory::st_not_found, "could not find RML library" );
+    ASSERT( status!=Factory::st_incompatible, NULL );
+    ASSERT( status==Factory::st_success, NULL );
+    factory.call_with_server_info( check_server_info, (void*)"Intel(R) RML library built:" );
+    // Step 3: making a server for the client must succeed and yield a non-NULL server.
+    typename Factory::server_type* server; 
+    status = factory.make_server( server, client );
+    ASSERT( status!=Factory::st_incompatible, NULL );
+    ASSERT( status!=Factory::st_not_found, NULL );
+    ASSERT( status==Factory::st_success, NULL );
+    if( Verbose ) {
+        printf("client %d: opened server n_thread=%d nesting=(%d,%d)\n", client.client_id(), n_thread, 0, 0);
+    }
+    ASSERT( server, NULL );
+    client.set_server( server );
+    // Step 4: checks specific to the server under test, run while the connection is open.
+    DoClientSpecificVerification( *server, n_thread );
+    // Step 5: request that the connection be closed, then close the factory.
+    client.expect_close_connection = true;
+    if( Verbose ) printf("client %d: calling request_close_connection\n", client.client_id());
+    server->request_close_connection();
+    factory.close();
+}
diff --git a/dep/tbb/src/rml/test/test_thread_monitor.cpp b/dep/tbb/src/rml/test/test_thread_monitor.cpp
new file mode 100644
index 000000000..89ef32554
--- /dev/null
+++ b/dep/tbb/src/rml/test/test_thread_monitor.cpp
@@ -0,0 +1,129 @@
+/*
+    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
+
+    This file is part of Threading Building Blocks.
+
+    Threading Building Blocks is free software; you can redistribute it
+    and/or modify it under the terms of the GNU General Public License
+    version 2 as published by the Free Software Foundation.
+
+    Threading Building Blocks is distributed in the hope that it will be
+    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with Threading Building Blocks; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    As a special exception, you may use this file as part of a free software
+    library without restriction.  Specifically, if other files instantiate
+    templates or use macros or inline functions from this file, or you compile
+    this file and link it with other files to produce an executable, this
+    file does not by itself cause the resulting executable to be covered by
+    the GNU General Public License.  This exception does not however
+    invalidate any other reasons why the executable file might be covered by
+    the GNU General Public License.
+*/
+
+#include "thread_monitor.h"
+#include "harness.h"
+#include "harness_memory.h"
+
+class ThreadState {     // State shared between main() and one launched thread.
+    void loop();
+public:
+    static __RML_DECL_THREAD_ROUTINE routine( void* arg ) {    // thread entry point; arg is a ThreadState*
+        static_cast<ThreadState*>(arg)->loop();
+        return 0;
+    }
+    typedef rml::internal::thread_monitor thread_monitor;
+    thread_monitor monitor;     //!< loop() waits on it; main() notifies it.
+    volatile int request;       //!< Latest request number written by main().
+    volatile int ack;           //!< Request number most recently acknowledged by loop().
+    volatile unsigned clock;    //!< Incremented on every pass of loop().
+    volatile unsigned stamp;    //!< Sample of clock taken by main().
+    ThreadState() : request(-1), ack(-1), clock(0), stamp(0) {}    // zero clock/stamp so neither is read while indeterminate
+};
+
+void ThreadState::loop() {      // Thread body: acknowledge each new request, waiting on the monitor in between.
+    for(;;) {
+        ++clock;                // main() samples clock to tell whether this thread is still making passes
+        if( ack==request ) {    // nothing new to acknowledge
+            thread_monitor::cookie c;
+            monitor.prepare_wait(c);
+            if( ack==request ) {    // re-check after prepare_wait, before committing to the wait
+                if( Verbose ) {
+                    printf("%p: request=%d ack=%d\n", this, request, ack );
+                }
+                monitor.commit_wait(c);
+            } else
+                monitor.cancel_wait();  // request changed since the first check
+        } else {
+            // Throw in delay occasionally
+            switch( request%8 ) {
+                case 0: 
+                case 1:
+                case 5:
+                    rml::internal::thread_monitor::yield();
+            }
+            int r = request;
+            ack = request;      // main() polls ack to see the response
+            if( !r ) return;    // request 0 ends the thread
+        }
+    }
+}
+
+// Linux on Itanium seems to require at least 1<<18 bytes per stack.
+const size_t MinStackSize = 1<<18;  // first stack size main() launches threads with
+const size_t MaxStackSize = 1<<22;  // main() keeps doubling the stack size up to this value
+
+int main( int argc, char* argv[] ) {
+    // Set defaults
+    MinThread = 1;
+    MaxThread = 4;
+    ParseCommandLine( argc, argv );
+    // For each thread count p and each stack size: launch p threads and walk them through requests 1000 down to 0.
+    for( int p=MinThread; p<=MaxThread; ++p ) {
+        ThreadState* t = new ThreadState[p];
+        for( size_t stack_size = MinStackSize; stack_size<=MaxStackSize; stack_size*=2 ) {
+            if( Verbose )
+                printf("launching %d threads\n",p);
+            for( int i=0; i<p; ++i )
+                rml::internal::thread_monitor::launch( ThreadState::routine, t+i, stack_size ); 
+            for( int k=1000; k>=0; --k ) {  // the final request, 0, makes each thread return from loop()
+                if( k%8==0 ) {
+                    // Wait for threads to wait.
+                    for( int i=0; i<p; ++i ) {
+                        unsigned count = 0;
+                        do {
+                            t[i].stamp = t[i].clock;
+                            rml::internal::thread_monitor::yield();
+                            if( ++count>=1000 ) {   // give up after 1000 yields and only warn
+                                printf("Warning: thread %d not waiting\n",i);
+                                break;
+                            }
+                        } while( t[i].stamp!=t[i].clock );  // clock unchanged across the yield: the thread made no pass
+                    }
+                }
+                if( Verbose ) 
+                    printf("notifying threads\n");
+                for( int i=0; i<p; ++i ) {
+                    // Change state visible to launched thread
+                    t[i].request = k;
+                    t[i].monitor.notify();
+                }
+                if( Verbose ) 
+                    printf("waiting for threads to respond\n");
+                for( int i=0; i<p; ++i ) 
+                    // Wait for thread to respond 
+                    while( t[i].ack!=k ) 
+                        rml::internal::thread_monitor::yield();
+            }
+        }
+        delete[] t;
+    }
+
+    printf("done\n");
+    return 0;
+}
diff --git a/dep/tbb/src/tbb/gate.h b/dep/tbb/src/tbb/gate.h
deleted file mode 100644
index fb1283621..000000000
--- a/dep/tbb/src/tbb/gate.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
-    Copyright 2005-2009 Intel Corporation.  All Rights Reserved.
-
-    This file is part of Threading Building Blocks.
-
-    Threading Building Blocks is free software; you can redistribute it
-    and/or modify it under the terms of the GNU General Public License
-    version 2 as published by the Free Software Foundation.
-
-    Threading Building Blocks is distributed in the hope that it will be
-    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
-    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with Threading Building Blocks; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    As a special exception, you may use this file as part of a free software
-    library without restriction.  Specifically, if other files instantiate
-    templates or use macros or inline functions from this file, or you compile
-    this file and link it with other files to produce an executable, this
-    file does not by itself cause the resulting executable to be covered by
-    the GNU General Public License.  This exception does not however
-    invalidate any other reasons why the executable file might be covered by
-    the GNU General Public License.
-*/
-
-#ifndef _TBB_Gate_H
-#define _TBB_Gate_H
-
-#include "itt_notify.h"
-
-namespace tbb {
-
-namespace internal {
-
-#if __TBB_RML
-//! Fake version of Gate for use with RML.
-/** Really just an atomic intptr_t with a compare-and-swap operation,
-    but wrapped in syntax that makes it look like a normal Gate object,
-    in order to minimize source changes for RML in task.cpp. */
-class Gate {
-public:
-    typedef intptr_t state_t;
-   
-    //! Get current state of gate
-    state_t get_state() const {
-        return state;
-    }
-
-#if defined(_MSC_VER) && defined(_Wp64)
-    // Workaround for overzealous compiler warnings in /Wp64 mode
-    #pragma warning (disable: 4244)
-#endif
-
-    bool try_update( intptr_t value, intptr_t comparand ) {
-        return state.compare_and_swap(value,comparand)==comparand;
-    }
-private:
-    atomic<state_t> state;
-};
-
-#elif __TBB_USE_FUTEX
-
-//! Implementation of Gate based on futex.
-/** Use this futex-based implementation where possible, because it is the simplest and usually fastest. */
-class Gate {
-public:
-    typedef intptr_t state_t;
-
-    Gate() {
-        ITT_SYNC_CREATE(&state, SyncType_Scheduler, SyncObj_Gate);
-    }
-
-    //! Get current state of gate
-    state_t get_state() const {
-        return state;
-    }
-    //! Update state=value if state==comparand (flip==false) or state!=comparand (flip==true)
-    void try_update( intptr_t value, intptr_t comparand, bool flip=false ) {
-        __TBB_ASSERT( comparand!=0 || value!=0, "either value or comparand must be non-zero" );
-retry:
-        state_t old_state = state;
-        // First test for condition without using atomic operation
-        if( flip ? old_state!=comparand : old_state==comparand ) {
-            // Now atomically retest condition and set.
-            state_t s = state.compare_and_swap( value, old_state );
-            if( s==old_state ) {
-                // compare_and_swap succeeded
-                if( value!=0 )   
-                    futex_wakeup_all( &state );  // Update was successful and new state is not SNAPSHOT_EMPTY
-            } else {
-                // compare_and_swap failed.  But for != case, failure may be spurious for our purposes if
-                // the value there is nonetheless not equal to value.  This is a fairly rare event, so
-                // there is no need for backoff.  In event of such a failure, we must retry.
-                if( flip && s!=value ) 
-                    goto retry;
-            }
-        }
-    }
-    //! Wait for state!=0.
-    void wait() {
-        if( state==0 )
-            futex_wait( &state, 0 );
-    }
-private:
-    atomic<state_t> state;
-};
-
-#elif USE_WINTHREAD
-
-class Gate {
-public:
-    typedef intptr_t state_t;
-private:
-    //! If state==0, then thread executing wait() suspend until state becomes non-zero.
-    state_t state;
-    CRITICAL_SECTION critical_section;
-    HANDLE event;
-public:
-    //! Initialize with count=0
-    Gate() : state(0) {
-        event = CreateEvent( NULL, true, false, NULL );
-        InitializeCriticalSection( &critical_section );
-        ITT_SYNC_CREATE(&event, SyncType_Scheduler, SyncObj_Gate);
-        ITT_SYNC_CREATE(&critical_section, SyncType_Scheduler, SyncObj_GateLock);
-    }
-    ~Gate() {
-        // Fake prepare/acquired pair for Intel(R) Parallel Amplifier to correctly attribute the operations below
-        ITT_NOTIFY( sync_prepare, &event );
-        CloseHandle( event );
-        DeleteCriticalSection( &critical_section );
-        ITT_NOTIFY( sync_acquired, &event );
-    }
-    //! Get current state of gate
-    state_t get_state() const {
-        return state;
-    }
-    //! Update state=value if state==comparand (flip==false) or state!=comparand (flip==true)
-    void try_update( intptr_t value, intptr_t comparand, bool flip=false ) {
-        __TBB_ASSERT( comparand!=0 || value!=0, "either value or comparand must be non-zero" );
-        EnterCriticalSection( &critical_section );
-        state_t old = state;
-        if( flip ? old!=comparand : old==comparand ) {
-            state = value;
-            if( !old )
-                SetEvent( event );
-            else if( !value )
-                ResetEvent( event );
-        }
-        LeaveCriticalSection( &critical_section );
-    }
-    //! Wait for state!=0.
-    void wait() {
-        if( state==0 ) {
-            WaitForSingleObject( event, INFINITE );
-        }
-    }
-};
-
-#elif USE_PTHREAD
-
-class Gate {
-public:
-    typedef intptr_t state_t;
-private:
-    //! If state==0, then thread executing wait() suspend until state becomes non-zero.
-    state_t state;
-    pthread_mutex_t mutex;
-    pthread_cond_t cond;
-public:
-    //! Initialize with count=0
-    Gate() : state(0)
-    {
-        pthread_mutex_init( &mutex, NULL );
-        pthread_cond_init( &cond, NULL);
-        ITT_SYNC_CREATE(&cond, SyncType_Scheduler, SyncObj_Gate);
-        ITT_SYNC_CREATE(&mutex, SyncType_Scheduler, SyncObj_GateLock);
-    }
-    ~Gate() {
-        pthread_cond_destroy( &cond );
-        pthread_mutex_destroy( &mutex );
-    }
-    //! Get current state of gate
-    state_t get_state() const {
-        return state;
-    }
-    //! Update state=value if state==comparand (flip==false) or state!=comparand (flip==true)
-    void try_update( intptr_t value, intptr_t comparand, bool flip=false ) {
-        __TBB_ASSERT( comparand!=0 || value!=0, "either value or comparand must be non-zero" );
-        pthread_mutex_lock( &mutex );
-        state_t old = state;
-        if( flip ? old!=comparand : old==comparand ) {
-            state = value;
-            if( !old )
-                pthread_cond_broadcast( &cond );
-        }
-        pthread_mutex_unlock( &mutex );
-    }
-    //! Wait for state!=0.
-    void wait() {
-        if( state==0 ) {
-            pthread_mutex_lock( &mutex );
-            while( state==0 ) {
-                pthread_cond_wait( &cond, &mutex );
-            }
-            pthread_mutex_unlock( &mutex );
-        }
-    }
-};
-
-#else
-#error Must define USE_PTHREAD or USE_WINTHREAD
-#endif  /* threading kind */
-
-} // namespace Internal
-
-} // namespace ThreadingBuildingBlocks
-
-#endif /* _TBB_Gate_H */
diff --git a/dep/tbb/src/tbb/private_server.cpp b/dep/tbb/src/tbb/private_server.cpp
index cda558e81..99234ea35 100644
--- a/dep/tbb/src/tbb/private_server.cpp
+++ b/dep/tbb/src/tbb/private_server.cpp
@@ -26,8 +26,8 @@
     the GNU General Public License.
 */
 
-#include "../rml/include/rml_tbb.h"
-#include "../rml/server/thread_monitor.h"
+#include "rml_tbb.h"
+#include "../server/thread_monitor.h"
 #include "tbb/atomic.h"
 #include "tbb/cache_aligned_allocator.h"
 #include "tbb/spin_mutex.h"
diff --git a/dep/tbb/src/tbb/tbb_version.h b/dep/tbb/src/tbb/tbb_version.h
index 07a91d6f5..cd86de08e 100644
--- a/dep/tbb/src/tbb/tbb_version.h
+++ b/dep/tbb/src/tbb/tbb_version.h
@@ -27,13 +27,13 @@
 */
 
 // Please define version number in the file:
-#include "../../include/tbb/tbb_stddef.h"
+#include "tbb/tbb_stddef.h"
 
 // And don't touch anything below
 #ifndef ENDL
 #define ENDL "\n"
 #endif
-#include "../../build/vsproject/version_string.tmp"
+#include "version_string.tmp"
 
 #ifndef __TBB_VERSION_STRINGS
 #pragma message("Warning: version_string.tmp isn't generated properly by version_info.sh script!")
diff --git a/dep/tbb/src/tbbmalloc/tbbmalloc.rc b/dep/tbb/src/tbbmalloc/tbbmalloc.rc
index 4e8a2ed0b..89830ebd3 100644
--- a/dep/tbb/src/tbbmalloc/tbbmalloc.rc
+++ b/dep/tbb/src/tbbmalloc/tbbmalloc.rc
@@ -43,7 +43,7 @@
 //
 #include <winresrc.h>
 #define ENDL "\r\n"
-#include "../tbb/tbb_version.h"
+#include "tbb/tbb_version.h"
 
 #define TBBMALLOC_VERNUMBERS TBB_VERSION_MAJOR, TBB_VERSION_MINOR, __TBB_VERSION_YMD
 #define TBBMALLOC_VERSION __TBB_STRING(TBBMALLOC_VERNUMBERS)
diff --git a/src/shared/revision_nr.h b/src/shared/revision_nr.h
index 9b1c09c56..bf09d4b41 100644
--- a/src/shared/revision_nr.h
+++ b/src/shared/revision_nr.h
@@ -1,4 +1,4 @@
 #ifndef __REVISION_NR_H__
 #define __REVISION_NR_H__
- #define REVISION_NR "11163"
+ #define REVISION_NR "11164"
 #endif // __REVISION_NR_H__