[haiku-commits] BRANCH pdziepak-github.scheduler [978fc08] src/system/kernel/scheduler headers/private/kernel

  • From: pdziepak-github.scheduler <community@xxxxxxxxxxxx>
  • To: haiku-commits@xxxxxxxxxxxxx
  • Date: Thu, 24 Oct 2013 02:15:32 +0200 (CEST)

added 4 changesets to branch 'refs/remotes/pdziepak-github/scheduler'
old head: 31a75d402f761a1087c449e44926a6bec0c31009
new head: 978fc080654a2367cfb75a8afa196361ab56645e
overview: https://github.com/pdziepak/Haiku/compare/31a75d4...978fc08

----------------------------------------------------------------------------

7e1ecb9: kernel: Protect scheduler_set_thread_priority() with lock

ed8627e: kernel/util: Fix MinMaxHeap::_GrowHeap()

e927edd: scheduler_affine: Disable logic not needed on current topology

978fc08: scheduler: Remove support for running different schedulers
  
  The simple scheduler behaves exactly the same as the affine scheduler on
  a single core. The affine scheduler is more complex and thus introduces
  greater overhead, but much of its multicore logic was disabled on
  single-core systems in the previous commit.

                                    [ Pawel Dziepak <pdziepak@xxxxxxxxxxx> ]

----------------------------------------------------------------------------

9 files changed, 1721 insertions(+), 2584 deletions(-)
headers/private/kernel/kscheduler.h              |  130 +-
headers/private/kernel/util/MinMaxHeap.h         |    6 +-
src/system/kernel/Jamfile                        |    2 -
src/system/kernel/scheduler/scheduler.cpp        | 1704 +++++++++++++++++-
src/system/kernel/scheduler/scheduler_affine.cpp | 1651 -----------------
src/system/kernel/scheduler/scheduler_affine.h   |   13 -
src/system/kernel/scheduler/scheduler_simple.cpp |  776 --------
src/system/kernel/scheduler/scheduler_simple.h   |   12 -
src/system/kernel/thread.cpp                     |   11 +-

############################################################################

Commit:      7e1ecb9315396949d3197916c1a3bf67a0fee22c
Author:      Pawel Dziepak <pdziepak@xxxxxxxxxxx>
Date:        Wed Oct 23 22:59:10 2013 UTC

kernel: Protect scheduler_set_thread_priority() with lock

----------------------------------------------------------------------------

diff --git a/src/system/kernel/thread.cpp b/src/system/kernel/thread.cpp
index 391a487..d13fec1 100644
--- a/src/system/kernel/thread.cpp
+++ b/src/system/kernel/thread.cpp
@@ -1919,7 +1919,10 @@ thread_exit(void)
                panic("thread_exit() called with interrupts disabled!\n");
 
        // boost our priority to get this over with
-       scheduler_set_thread_priority(thread, B_URGENT_DISPLAY_PRIORITY);
+       {
+               InterruptsSpinLocker _(gSchedulerLock);
+               scheduler_set_thread_priority(thread, B_URGENT_DISPLAY_PRIORITY);
+       }
 
        if (team != kernelTeam) {
                // Cancel previously installed alarm timer, if any. Hold the scheduler
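
The change wraps the priority boost in an unnamed InterruptsSpinLocker so the
scheduler lock is held only for that single call. As a rough standalone sketch
of the same RAII idea, using std::mutex in place of the kernel's
interrupts-disabling spinlock (the names below are illustrative, not the Haiku
kernel API):

  #include <mutex>

  static std::mutex gSchedulerLock;   // stand-in for the kernel's gSchedulerLock spinlock

  // Stand-in for scheduler_set_thread_priority(); must be called with the lock held.
  static void set_thread_priority_locked(int priority)
  {
      (void)priority;   // the real function would move the thread in the run queues
  }

  void thread_exit_sketch()
  {
      // boost our priority to get this over with
      {
          std::lock_guard<std::mutex> _(gSchedulerLock);   // lock acquired here
          set_thread_priority_locked(20 /* B_URGENT_DISPLAY_PRIORITY */);
      }   // guard destroyed: lock released at the end of the block
      // the rest of the teardown runs without the scheduler lock held
  }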

############################################################################

Commit:      ed8627e5358e6bd5b901545c79a4f58c51c838b3
Author:      Pawel Dziepak <pdziepak@xxxxxxxxxxx>
Date:        Wed Oct 23 22:59:58 2013 UTC

kernel/util: Fix MinMaxHeap::_GrowHeap()

----------------------------------------------------------------------------

diff --git a/headers/private/kernel/util/MinMaxHeap.h b/headers/private/kernel/util/MinMaxHeap.h
index 4dc8b0c..14efee5 100644
--- a/headers/private/kernel/util/MinMaxHeap.h
+++ b/headers/private/kernel/util/MinMaxHeap.h
@@ -335,8 +335,8 @@ MIN_MAX_HEAP_CLASS_NAME::_GrowHeap(int minimalSize)
        fMinElements = newBuffer;
 
        newBuffer += newSize / 2;
-       if (fMaxElements != NULL)
-               memcpy(newBuffer, fMaxElements, fSize * sizeof(Element*));
+       if (fMaxLastElement > 0)
+               memcpy(newBuffer, fMinElements + fSize, fSize * sizeof(Element*));
        fMaxElements = newBuffer;
 
        fSize = newSize / 2;
@@ -426,7 +426,7 @@ MIN_MAX_HEAP_CLASS_NAME::_ChangeTree(MinMaxHeapLink<Element, Key>* link)
 
        if (otherLastElement <= 0) {
                ASSERT(currentLastElement == 1);
-               return true;
+               return false;
        }
 
        ASSERT((link->fIndex - 1) / 2 < otherLastElement);
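
The fix also guards the copy on the element count instead of the old buffer
pointer. For orientation only, here is a minimal standalone sketch of the kind
of operation such a grow routine performs, moving a "min half" and a "max
half" into the two halves of a larger allocation, under the simplifying
assumption that both halves share one contiguous buffer (toy code, not the
Haiku MinMaxHeap implementation):

  #include <cstring>

  // Toy container that keeps a "min half" and a "max half" in one allocation.
  struct DualHalfBuffer {
      int* fMinElements = nullptr;   // first half of the allocation
      int* fMaxElements = nullptr;   // second half of the allocation
      int fSize = 0;                 // capacity of each half
      int fMinLastElement = 0;       // elements in use in the min half
      int fMaxLastElement = 0;       // elements in use in the max half

      void GrowTo(int newSize)       // newSize: new total capacity (both halves)
      {
          int* newBuffer = new int[newSize];

          // Copy each half to its new position, guarding on the element counts
          // (an empty half may still have a non-null pointer into the old buffer).
          if (fMinLastElement > 0)
              std::memcpy(newBuffer, fMinElements, fMinLastElement * sizeof(int));
          if (fMaxLastElement > 0) {
              std::memcpy(newBuffer + newSize / 2, fMaxElements,
                  fMaxLastElement * sizeof(int));
          }

          delete[] fMinElements;     // frees the whole old allocation
          fMinElements = newBuffer;
          fMaxElements = newBuffer + newSize / 2;
          fSize = newSize / 2;
      }
  };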

############################################################################

Commit:      e927edd3765865bd8e7729ca15bd0ee0602be42c
Author:      Pawel Dziepak <pdziepak@xxxxxxxxxxx>
Date:        Wed Oct 23 23:33:12 2013 UTC

scheduler_affine: Disable logic not needed on current topology

----------------------------------------------------------------------------

diff --git a/src/system/kernel/scheduler/scheduler_affine.cpp b/src/system/kernel/scheduler/scheduler_affine.cpp
index bf1652b..a0fb24e 100644
--- a/src/system/kernel/scheduler/scheduler_affine.cpp
+++ b/src/system/kernel/scheduler/scheduler_affine.cpp
@@ -53,6 +53,8 @@ const bigtime_t kCacheExpire = 100000;
 static int sDisableSmallTaskPacking;
 static int32 sSmallTaskCore;
 
+static bool sSingleCore;
+
 static scheduler_mode sSchedulerMode;
 
 static int32 (*sChooseCore)(Thread* thread);
@@ -439,6 +441,8 @@ dump_idle_cores(int argc, char** argv)
 static inline bool
 affine_has_cache_expired(Thread* thread)
 {
+       ASSERT(!sSingleCore);
+
        if (thread_is_idle_thread(thread))
                return false;
 
@@ -495,6 +499,8 @@ affine_dump_thread_data(Thread* thread)
 static void
 affine_update_thread_heaps(int32 core)
 {
+       ASSERT(!sSingleCore);
+
        CoreEntry* entry = &sCoreEntries[core];
 
        ASSERT(entry->fCPUBoundThreads >= 0
@@ -535,6 +541,8 @@ affine_update_thread_heaps(int32 core)
 static inline void
 affine_disable_small_task_packing(void)
 {
+       ASSERT(!sSingleCore);
+
        ASSERT(sDisableSmallTaskPacking == 0);
        ASSERT(sSmallTaskCore == sCPUToCore[smp_get_current_cpu()]);
 
@@ -565,7 +573,7 @@ affine_increase_penalty(Thread* thread)
                ASSERT(core >= 0);
 
                int32 additionalPenalty = 
schedulerThreadData->additional_penalty;
-               if (additionalPenalty == 0) {
+               if (additionalPenalty == 0 && !sSingleCore) {
                        sCPUBoundThreads++;
                        sCoreEntries[core].fCPUBoundThreads++;
 
@@ -573,7 +581,7 @@ affine_increase_penalty(Thread* thread)
                }
 
                const int kSmallTaskThreshold = 50;
-               if (additionalPenalty > kSmallTaskThreshold) {
+               if (additionalPenalty > kSmallTaskThreshold && !sSingleCore) {
                        if (sSmallTaskCore == core)
                                affine_disable_small_task_packing();
                }
@@ -618,6 +626,9 @@ affine_update_priority_heaps(int32 cpu, int32 priority)
 
        sCPUPriorityHeaps[core].ModifyKey(&sCPUEntries[cpu], priority);
 
+       if (sSingleCore)
+               return;
+
        int32 maxPriority
                = AffineCPUHeap::GetKey(sCPUPriorityHeaps[core].PeekMaximum());
        int32 corePriority = AffineCorePriorityHeap::GetKey(&sCoreEntries[core]);
@@ -774,6 +785,7 @@ affine_choose_core_power_saving(Thread* thread)
 static inline int32
 affine_choose_core(Thread* thread)
 {
+       ASSERT(!sSingleCore);
        return sChooseCore(thread);
 }
 
@@ -790,6 +802,8 @@ affine_choose_cpu(int32 core)
 static bool
 affine_should_rebalance(Thread* thread)
 {
+       ASSERT(!sSingleCore);
+
        if (thread_is_idle_thread(thread))
                return false;
 
@@ -843,7 +857,7 @@ affine_should_rebalance(Thread* thread)
 static void
 affine_assign_active_thread_to_core(Thread* thread)
 {
-       if (thread_is_idle_thread(thread))
+       if (thread_is_idle_thread(thread) || sSingleCore)
                return;
 
        scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
@@ -874,6 +888,12 @@ affine_thread_goes_away(Thread* thread)
        ASSERT(schedulerThreadData->previous_core >= 0);
        int32 core = schedulerThreadData->previous_core;
 
+       schedulerThreadData->went_sleep = system_time();
+       schedulerThreadData->went_sleep_active = sCoreEntries[core].fActiveTime;
+
+       if (sSingleCore)
+               return;
+
        ASSERT(sCoreEntries[core].fThreads > 0);
        ASSERT(sCoreEntries[core].fThreads > sCoreEntries[core].fCPUBoundThreads
                || (sCoreEntries[core].fThreads == sCoreEntries[core].fCPUBoundThreads
@@ -888,9 +908,6 @@ affine_thread_goes_away(Thread* thread)
        }
 
        affine_update_thread_heaps(core);
-
-       schedulerThreadData->went_sleep = system_time();
-       schedulerThreadData->went_sleep_active = sCoreEntries[core].fActiveTime;
 }
 
 
@@ -921,6 +938,11 @@ affine_enqueue(Thread* thread, bool newOne)
 
                if (newOne)
                        affine_assign_active_thread_to_core(thread);
+       } else if (sSingleCore) {
+               targetCore = 0;
+               targetCPU = affine_choose_cpu(targetCore);
+
+               schedulerThreadData->previous_core = targetCore;
        } else if (schedulerThreadData->previous_core < 0
                || (newOne && affine_has_cache_expired(thread))
                || affine_should_rebalance(thread)) {
@@ -1506,6 +1528,7 @@ scheduler_affine_init()
        if (result != B_OK)
                return result;
        sRunQueueCount = coreCount;
+       sSingleCore = coreCount == 1;
 
        // create package heap and idle package stack
        sPackageEntries = new(std::nothrow) PackageEntry[packageCount];
@@ -1633,8 +1656,10 @@ scheduler_affine_init()
        add_debugger_command_etc("cpu_heap", &dump_cpu_heap,
                "List CPUs in CPU priority heap", "\nList CPUs in CPU priority heap",
                0);
-       add_debugger_command_etc("idle_cores", &dump_idle_cores,
-               "List idle cores", "\nList idle cores", 0);
+       if (!sSingleCore) {
+               add_debugger_command_etc("idle_cores", &dump_idle_cores,
+                       "List idle cores", "\nList idle cores", 0);
+       }
 
        runQueuesDeleter.Detach();
        pinnedRunQueuesDeleter.Detach();
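
Most of the hunks above follow one pattern: a single sSingleCore flag, set at
init from the detected core count, either early-returns from or asserts
against the multicore bookkeeping. A condensed standalone sketch of that
enqueue fast path (illustrative stand-ins only; the real affine_enqueue()
does considerably more):

  #include <cassert>
  #include <cstdio>

  static bool sSingleCore = false;   // set once at init: detected core count == 1

  // Stand-ins for the real selection heuristics; only consulted on multicore.
  static int choose_core()        { assert(!sSingleCore); return 1; }
  static int choose_cpu(int core) { return core; }   // toy: one CPU per core

  // Decide where a ready thread should run.
  static void pick_target(int& targetCore, int& targetCPU)
  {
      if (sSingleCore) {
          // Only one core exists: skip cache-affinity checks, rebalancing and
          // the per-core heap bookkeeping entirely.
          targetCore = 0;
          targetCPU = choose_cpu(targetCore);
          return;
      }

      targetCore = choose_core();          // multicore load-balancing path
      targetCPU = choose_cpu(targetCore);
  }

  int main()
  {
      sSingleCore = true;
      int core = -1, cpu = -1;
      pick_target(core, cpu);
      std::printf("single core -> core %d, cpu %d\n", core, cpu);
      return 0;
  }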

############################################################################

Commit:      978fc080654a2367cfb75a8afa196361ab56645e
Author:      Pawel Dziepak <pdziepak@xxxxxxxxxxx>
Date:        Thu Oct 24 00:04:03 2013 UTC

scheduler: Remove support for running different schedulers

The simple scheduler behaves exactly the same as the affine scheduler on
a single core. The affine scheduler is more complex and thus introduces
greater overhead, but much of its multicore logic was disabled on
single-core systems in the previous commit.

----------------------------------------------------------------------------

diff --git a/headers/private/kernel/kscheduler.h b/headers/private/kernel/kscheduler.h
index 7746222..bfdac1d 100644
--- a/headers/private/kernel/kscheduler.h
+++ b/headers/private/kernel/kscheduler.h
@@ -24,86 +24,70 @@ typedef enum scheduler_mode {
        SCHEDULER_MODE_COUNT
 } scheduler_mode;
 
-struct scheduler_ops {
-       /*!     Enqueues the thread in the ready-to-run queue.
-               The caller must hold the scheduler lock (with disabled 
interrupts).
-       */
-       void (*enqueue_in_run_queue)(Thread* thread);
-
-       /*!     Selects a thread from the ready-to-run queue and, if that's not 
the
-               calling thread, switches the current CPU's context to run the 
selected
-               thread.
-               If it's the same thread, the thread will just continue to run.
-               In either case, unless the thread is dead or is sleeping/waiting
-               indefinitely, the function will eventually return.
-               The caller must hold the scheduler lock (with disabled 
interrupts).
-       */
-       void (*reschedule)(void);
-
-       /*!     Sets the given thread's priority.
-               The thread may be running or may be in the ready-to-run queue.
-               The caller must hold the scheduler lock (with disabled 
interrupts).
-       */
-       void (*set_thread_priority)(Thread* thread, int32 priority);
-       bigtime_t (*estimate_max_scheduling_latency)(Thread* thread);
-
-       /*!     Called when the Thread structure is first created.
-               Per-thread housekeeping resources can be allocated.
-               Interrupts must be enabled.
-       */
-       status_t (*on_thread_create)(Thread* thread, bool idleThread);
-
-       /*!     Called when a Thread structure is initialized and made ready for
-               use.
-               The per-thread housekeeping data structures are reset, if 
needed.
-               The caller must hold the scheduler lock (with disabled 
interrupts).
-       */
-       void (*on_thread_init)(Thread* thread);
-
-       /*!     Called when a Thread structure is freed.
-               Frees up any per-thread resources allocated on the scheduler's 
part. The
-               function may be called even if on_thread_create() failed.
-               Interrupts must be enabled.
-       */
-       void (*on_thread_destroy)(Thread* thread);
-
-       /*!     Called in the early boot process to start thread scheduling on 
the
-               current CPU.
-               The function is called once for each CPU.
-               Interrupts must be disabled, but the caller must not hold the 
scheduler
-               lock.
-       */
-       void (*start)(void);
-
-       /*! Sets scheduler operation mode.
-        */
-       status_t (*set_operation_mode)(scheduler_mode mode);
-
-       /*! Dumps scheduler specific thread information.
-       */
-       void (*dump_thread_data)(Thread* thread);
-};
-
-extern struct scheduler_ops* gScheduler;
 extern spinlock gSchedulerLock;
 
-#define scheduler_enqueue_in_run_queue(thread) \
-                               gScheduler->enqueue_in_run_queue(thread)
-#define scheduler_set_thread_priority(thread, priority)        \
-                               gScheduler->set_thread_priority(thread, 
priority)
-#define scheduler_reschedule() gScheduler->reschedule()
-#define scheduler_start()              gScheduler->start()
-#define scheduler_on_thread_create(thread, idleThread) \
-                               gScheduler->on_thread_create(thread, idleThread)
-#define scheduler_on_thread_init(thread) \
-                               gScheduler->on_thread_init(thread)
-#define scheduler_on_thread_destroy(thread) \
-                               gScheduler->on_thread_destroy(thread)
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+/*!    Enqueues the thread in the ready-to-run queue.
+       The caller must hold the scheduler lock (with disabled interrupts).
+*/
+void scheduler_enqueue_in_run_queue(Thread* thread);
+
+/*!    Selects a thread from the ready-to-run queue and, if that's not the
+       calling thread, switches the current CPU's context to run the selected
+       thread.
+       If it's the same thread, the thread will just continue to run.
+       In either case, unless the thread is dead or is sleeping/waiting
+       indefinitely, the function will eventually return.
+       The caller must hold the scheduler lock (with disabled interrupts).
+*/
+void scheduler_reschedule(void);
+
+/*!    Sets the given thread's priority.
+       The thread may be running or may be in the ready-to-run queue.
+       The caller must hold the scheduler lock (with disabled interrupts).
+*/
+void scheduler_set_thread_priority(Thread* thread, int32 priority);
+
+/*!    Called when the Thread structure is first created.
+       Per-thread housekeeping resources can be allocated.
+       Interrupts must be enabled.
+*/
+status_t scheduler_on_thread_create(Thread* thread, bool idleThread);
+
+/*!    Called when a Thread structure is initialized and made ready for
+       use.
+       The per-thread housekeeping data structures are reset, if needed.
+       The caller must hold the scheduler lock (with disabled interrupts).
+*/
+void  scheduler_on_thread_init(Thread* thread);
+
+/*!    Called when a Thread structure is freed.
+       Frees up any per-thread resources allocated on the scheduler's part. The
+       function may be called even if on_thread_create() failed.
+       Interrupts must be enabled.
+*/
+void scheduler_on_thread_destroy(Thread* thread);
+
+/*!    Called in the early boot process to start thread scheduling on the
+       current CPU.
+       The function is called once for each CPU.
+       Interrupts must be disabled, but the caller must not hold the scheduler
+       lock.
+*/
+void scheduler_start(void);
+
+/*! Sets scheduler operation mode.
+ */
+status_t scheduler_set_operation_mode(scheduler_mode mode);
+
+/*! Dumps scheduler specific thread information.
+*/
+void scheduler_dump_thread_data(Thread* thread);
+
 void scheduler_add_listener(struct SchedulerListener* listener);
 void scheduler_remove_listener(struct SchedulerListener* listener);
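
The header now declares the scheduler entry points directly instead of routing
every call through the gScheduler ops table and its forwarding macros. A small
standalone before/after sketch of that indirection being removed (toy names,
not the actual Haiku declarations):

  #include <cstdio>

  // Before: a pluggable ops table, selected at boot and reached through a
  // global pointer plus forwarding macros.
  struct SchedulerOps {
      void (*enqueue_in_run_queue)(int threadID);
  };

  static void simple_enqueue(int threadID)
  {
      std::printf("simple scheduler: enqueue thread %d\n", threadID);
  }

  static SchedulerOps sSimpleOps = { simple_enqueue };
  static SchedulerOps* gSchedulerOps = &sSimpleOps;

  // After: only one scheduler is left, so the indirection and the macros go
  // away and callers use an ordinary function.
  static void scheduler_enqueue_in_run_queue_direct(int threadID)
  {
      std::printf("unified scheduler: enqueue thread %d\n", threadID);
  }

  int main()
  {
      gSchedulerOps->enqueue_in_run_queue(1);      // old call path
      scheduler_enqueue_in_run_queue_direct(1);    // new call path
      return 0;
  }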
 
diff --git a/src/system/kernel/Jamfile b/src/system/kernel/Jamfile
index 2c0079b..9bf77eb 100644
--- a/src/system/kernel/Jamfile
+++ b/src/system/kernel/Jamfile
@@ -63,8 +63,6 @@ KernelMergeObject kernel_core.o :
 
        # scheduler
        scheduler.cpp
-       scheduler_affine.cpp
-       scheduler_simple.cpp
        scheduler_tracing.cpp
        scheduling_analysis.cpp
 
diff --git a/src/system/kernel/scheduler/scheduler.cpp b/src/system/kernel/scheduler/scheduler.cpp
index fd84311..18357d2 100644
--- a/src/system/kernel/scheduler/scheduler.cpp
+++ b/src/system/kernel/scheduler/scheduler.cpp
@@ -1,78 +1,1659 @@
 /*
+ * Copyright 2013, Paweł Dziepak, pdziepak@xxxxxxxxxxx.
+ * Copyright 2009, Rene Gollent, rene@xxxxxxxxxxx.
  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@xxxxxx.
- * Copyright 2010, Axel Dörfler, axeld@xxxxxxxxxxxxxxxx.
+ * Copyright 2002-2010, Axel Dörfler, axeld@xxxxxxxxxxxxxxxx.
+ * Copyright 2002, Angelo Mottola, a.mottola@xxxxxxxxx.
  * Distributed under the terms of the MIT License.
+ *
+ * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
+ * Distributed under the terms of the NewOS License.
  */
 
 
+/*! The thread scheduler */
+
+
+#include <OS.h>
+
+#include <AutoDeleter.h>
+#include <cpu.h>
+#include <debug.h>
+#include <int.h>
+#include <kernel.h>
 #include <kscheduler.h>
 #include <listeners.h>
+#include <scheduler_defs.h>
 #include <smp.h>
+#include <thread.h>
+#include <timer.h>
+#include <util/Heap.h>
+#include <util/MinMaxHeap.h>
+
+#include "RunQueue.h"
+#include "scheduler_common.h"
+#include "scheduler_tracing.h"
+
+
+//#define TRACE_SCHEDULER
+#ifdef TRACE_SCHEDULER
+#      define TRACE(...) dprintf_no_syslog(__VA_ARGS__)
+#else
+#      define TRACE(...) do { } while (false)
+#endif
+
+
+spinlock gSchedulerLock = B_SPINLOCK_INITIALIZER;
+SchedulerListenerList gSchedulerListeners;
+
+bool sSchedulerEnabled;
+
+const bigtime_t kThreadQuantum = 1000;
+const bigtime_t kMinThreadQuantum = 3000;
+const bigtime_t kMaxThreadQuantum = 10000;
+
+const bigtime_t kCacheExpire = 100000;
+
+static int sDisableSmallTaskPacking;
+static int32 sSmallTaskCore;
+
+static bool sSingleCore;
+
+static scheduler_mode sSchedulerMode;
+
+static int32 (*sChooseCore)(Thread* thread);
+
+
+// Heaps in sCPUPriorityHeaps are used for load balancing on a core the logical
+// processors in the heap belong to. Since there are no cache affinity issues
+// at this level and the run queue is shared among all logical processors on
+// the core the only real concern is to make lower priority threads give way to
+// the higher priority threads.
+struct CPUEntry : public MinMaxHeapLinkImpl<CPUEntry, int32> {
+       int32           fCPUNumber;
+};
+typedef MinMaxHeap<CPUEntry, int32> CPUHeap;
+static CPUEntry* sCPUEntries;
+static CPUHeap* sCPUPriorityHeaps;
+
+struct CoreEntry : public DoublyLinkedListLinkImpl<CoreEntry> {
+       HeapLink<CoreEntry, int32>      fPriorityHeapLink;
+       MinMaxHeapLink<CoreEntry, int32>        fThreadHeapLink;
+
+       int32           fCoreID;
+
+       bigtime_t       fActiveTime;
+
+       int32           fCPUBoundThreads;
+       int32           fThreads;
+};
+
+static CoreEntry* sCoreEntries;
+typedef Heap<CoreEntry, int32, HeapLesserCompare<int32>,
+               HeapMemberGetLink<CoreEntry, int32, 
&CoreEntry::fPriorityHeapLink> >
+       CorePriorityHeap;
+static CorePriorityHeap* sCorePriorityHeap;
+
+typedef MinMaxHeap<CoreEntry, int32, MinMaxHeapCompare<int32>,
+               MinMaxHeapMemberGetLink<CoreEntry, int32, 
&CoreEntry::fThreadHeapLink> >
+       CoreThreadHeap;
+static CoreThreadHeap* sCoreThreadHeap;
+static CoreThreadHeap* sCoreCPUBoundThreadHeap;
+
+static int32 sCPUBoundThreads;
+static int32 sAssignedThreads;
+
+// sPackageUsageHeap is used to decide which core should be woken up from the
+// idle state. When aiming for performance we should use as many packages as
+// possible with as little cores active in each package as possible (so that 
the
+// package can enter any boost mode if it has one and the active core have more
+// of the shared cache for themselves. If power saving is the main priority we
+// should keep active cores on as little packages as possible (so that other
+// packages can go to the deep state of sleep). The heap stores only packages
+// with at least one core active and one core idle. The packages with all cores
+// idle are stored in sPackageIdleList (in LIFO manner).
+struct PackageEntry : public MinMaxHeapLinkImpl<PackageEntry, int32>,
+       DoublyLinkedListLinkImpl<PackageEntry> {
+       int32                                           fPackageID;
+
+       DoublyLinkedList<CoreEntry>     fIdleCores;
+       int32                                           fIdleCoreCount;
+
+       int32                                           fCoreCount;
+};
+typedef MinMaxHeap<PackageEntry, int32> PackageHeap;
+typedef DoublyLinkedList<PackageEntry> IdlePackageList;
+
+static PackageEntry* sPackageEntries;
+static PackageHeap* sPackageUsageHeap;
+static IdlePackageList* sIdlePackageList;
+
+// The run queues. Holds the threads ready to run ordered by priority.
+// One queue per schedulable target per core. Additionally, each
+// logical processor has its sPinnedRunQueues used for scheduling
+// pinned threads.
+typedef RunQueue<Thread, THREAD_MAX_SET_PRIORITY> ThreadRunQueue;
+static ThreadRunQueue* sRunQueues;
+static ThreadRunQueue* sPinnedRunQueues;
+static int32 sRunQueueCount;
+
+// Since CPU IDs used internally by the kernel bear no relation to the actual
+// CPU topology the following arrays are used to efficiently get the core
+// and the package that CPU in question belongs to.
+static int32* sCPUToCore;
+static int32* sCPUToPackage;
+
+
+struct scheduler_thread_data {
+                                               scheduler_thread_data() { 
Init(); }
+       inline  void            Init();
+
+                       int32           priority_penalty;
+                       int32           additional_penalty;
+
+                       bool            lost_cpu;
+                       bool            cpu_bound;
+
+                       bigtime_t       time_left;
+                       bigtime_t       stolen_time;
+                       bigtime_t       quantum_start;
+
+                       bigtime_t       went_sleep;
+                       bigtime_t       went_sleep_active;
+
+                       int32           previous_core;
+};
+
+
+void
+scheduler_thread_data::Init()
+{
+       priority_penalty = 0;
+       additional_penalty = 0;
+
+       time_left = 0;
+       stolen_time = 0;
+
+       went_sleep = 0;
+       went_sleep_active = 0;
+
+       lost_cpu = false;
+       cpu_bound = true;
+
+       previous_core = -1;
+}
+
+
+static inline int
+get_minimal_priority(Thread* thread)
+{
+       return min_c(thread->priority, 25) / 5;
+}
+
+
+static inline int32
+get_thread_penalty(Thread* thread)
+{
+       int32 penalty = thread->scheduler_data->priority_penalty;
+
+       const int kMinimalPriority = get_minimal_priority(thread);
+       if (kMinimalPriority > 0) {
+               penalty
+                       += thread->scheduler_data->additional_penalty % 
kMinimalPriority;
+       }
+
+       return penalty;
+}
+
+
+static inline int32
+get_effective_priority(Thread* thread)
+{
+       if (thread->priority == B_IDLE_PRIORITY)
+               return thread->priority;
+       if (thread->priority >= B_FIRST_REAL_TIME_PRIORITY)
+               return thread->priority;
+
+       int32 effectivePriority = thread->priority;
+       effectivePriority -= get_thread_penalty(thread);
+
+       ASSERT(effectivePriority < B_FIRST_REAL_TIME_PRIORITY);
+       ASSERT(effectivePriority >= B_LOWEST_ACTIVE_PRIORITY);
+
+       return effectivePriority;
+}
+
+
+static void
+dump_queue(ThreadRunQueue::ConstIterator& iterator)
+{
+       if (!iterator.HasNext())
+               kprintf("Run queue is empty.\n");
+       else {
+               kprintf("thread      id      priority penalty  name\n");
+               while (iterator.HasNext()) {
+                       Thread* thread = iterator.Next();
+                       kprintf("%p  %-7" B_PRId32 " %-8" B_PRId32 " %-8" 
B_PRId32 " %s\n",
+                               thread, thread->id, thread->priority,
+                               get_thread_penalty(thread), thread->name);
+               }
+       }
+}
+
+
+static int
+dump_run_queue(int argc, char **argv)
+{
+       int32 cpuCount = smp_get_num_cpus();
+       int32 coreCount = 0;
+       for (int32 i = 0; i < cpuCount; i++) {
+               if (gCPU[i].topology_id[CPU_TOPOLOGY_SMT] == 0)
+                       sCPUToCore[i] = coreCount++;
+       }
+
+       ThreadRunQueue::ConstIterator iterator;
+       for (int32 i = 0; i < coreCount; i++) {
+               kprintf("\nCore %" B_PRId32 " run queue:\n", i);
+               iterator = sRunQueues[i].GetConstIterator();
+               dump_queue(iterator);
+       }
+
+       for (int32 i = 0; i < cpuCount; i++) {
+               iterator = sPinnedRunQueues[i].GetConstIterator();
+
+               if (iterator.HasNext()) {
+                       kprintf("\nCPU %" B_PRId32 " run queue:\n", i);
+                       dump_queue(iterator);
+               }
+       }
+
+       return 0;
+}
+
+
+static void
+dump_heap(CPUHeap* heap)
+{
+       CPUHeap temp(smp_get_num_cpus());
+
+       kprintf("cpu priority actual priority\n");
+       CPUEntry* entry = heap->PeekMinimum();
+       while (entry) {
+               int32 cpu = entry->fCPUNumber;
+               int32 key = CPUHeap::GetKey(entry);
+               kprintf("%3" B_PRId32 " %8" B_PRId32 " %15" B_PRId32 "\n", cpu, 
key,
+                       get_effective_priority(gCPU[cpu].running_thread));
+
+               heap->RemoveMinimum();
+               temp.Insert(entry, key);
+
+               entry = heap->PeekMinimum();
+       }
+
+       entry = temp.PeekMinimum();
+       while (entry) {
+               int32 key = CPUHeap::GetKey(entry);
+               temp.RemoveMinimum();
+               heap->Insert(entry, key);
+               entry = temp.PeekMinimum();
+       }
+}
+
+
+static void
+dump_core_thread_heap(CoreThreadHeap* heap)
+{
+       CoreThreadHeap temp(sRunQueueCount);
+
+       CoreEntry* entry = heap->PeekMinimum();
+       while (entry) {
+               int32 key = CoreThreadHeap::GetKey(entry);
+               kprintf("%4" B_PRId32 " %6" B_PRId32 " %7" B_PRId32 " %9" 
B_PRId32 "\n",
+                       entry->fCoreID, key, entry->fThreads, 
entry->fCPUBoundThreads);
+
+               heap->RemoveMinimum();
+               temp.Insert(entry, key);
+
+               entry = heap->PeekMinimum();
+       }
+
+       entry = temp.PeekMinimum();
+       while (entry) {
+               int32 key = CoreThreadHeap::GetKey(entry);
+               temp.RemoveMinimum();
+               heap->Insert(entry, key);
+               entry = temp.PeekMinimum();
+       }
+}
+
+
+static int
+dump_cpu_heap(int argc, char** argv)
+{
+       CorePriorityHeap temp(sRunQueueCount);
+
+       CoreEntry* entry = sCorePriorityHeap->PeekRoot();
+       if (entry != NULL)
+               kprintf("core priority\n");
+       else
+               kprintf("No active cores.\n");
+
+       while (entry) {
+               int32 core = entry->fCoreID;
+               int32 key = CorePriorityHeap::GetKey(entry);
+               kprintf("%4" B_PRId32 " %8" B_PRId32 "\n", core, key);
+
+               sCorePriorityHeap->RemoveRoot();
+               temp.Insert(entry, key);
+
+               entry = sCorePriorityHeap->PeekRoot();
+       }
+
+       entry = temp.PeekRoot();
+       while (entry) {
+               int32 key = CorePriorityHeap::GetKey(entry);
+               temp.RemoveRoot();
+               sCorePriorityHeap->Insert(entry, key);
+               entry = temp.PeekRoot();
+       }
+
+       kprintf("\ncore    key threads cpu-bound\n");
+       dump_core_thread_heap(sCoreThreadHeap);
+       dump_core_thread_heap(sCoreCPUBoundThreadHeap);
+
+       for (int32 i = 0; i < sRunQueueCount; i++) {
+               kprintf("\nCore %" B_PRId32 " heap:\n", i);
+               dump_heap(&sCPUPriorityHeaps[i]);
+       }
+
+       return 0;
+}
+
+
+static int
+dump_idle_cores(int argc, char** argv)
+{
+       kprintf("Idle packages:\n");
+       IdlePackageList::ReverseIterator idleIterator
+               = sIdlePackageList->GetReverseIterator();
+
+       if (idleIterator.HasNext()) {
+               kprintf("package cores\n");
+
+               while (idleIterator.HasNext()) {
+                       PackageEntry* entry = idleIterator.Next();
+                       kprintf("%-7" B_PRId32 " ", entry->fPackageID);
+
+                       DoublyLinkedList<CoreEntry>::ReverseIterator iterator
+                               = entry->fIdleCores.GetReverseIterator();
+                       if (iterator.HasNext()) {
+                               while (iterator.HasNext()) {
+                                       CoreEntry* coreEntry = iterator.Next();
+                                       kprintf("%" B_PRId32 "%s", 
coreEntry->fCoreID,
+                                               iterator.HasNext() ? ", " : "");
+                               }
+                       } else
+                               kprintf("-");
+                       kprintf("\n");
+               }
+       } else
+               kprintf("No idle packages.\n");
+
+       PackageHeap temp(smp_get_num_cpus());
+       kprintf("\nPackages with idle cores:\n");
+
+       PackageEntry* entry = sPackageUsageHeap->PeekMinimum();
+       if (entry == NULL)
+               kprintf("No packages.\n");
+       else
+               kprintf("package count cores\n");
+
+       while (entry != NULL) {
+               kprintf("%-7" B_PRId32 " %-5" B_PRId32 " ", entry->fPackageID,
+                       entry->fIdleCoreCount);
+
+               DoublyLinkedList<CoreEntry>::ReverseIterator iterator
+                       = entry->fIdleCores.GetReverseIterator();
+               if (iterator.HasNext()) {
+                       while (iterator.HasNext()) {
+                               CoreEntry* coreEntry = iterator.Next();
+                               kprintf("%" B_PRId32 "%s", coreEntry->fCoreID,
+                                       iterator.HasNext() ? ", " : "");
+                       }
+               } else
+                       kprintf("-");
+               kprintf("\n");
+
+               sPackageUsageHeap->RemoveMinimum();
+               temp.Insert(entry, entry->fIdleCoreCount);
+
+               entry = sPackageUsageHeap->PeekMinimum();
+       }
+
+       entry = temp.PeekMinimum();
+       while (entry != NULL) {
+               int32 key = PackageHeap::GetKey(entry);
+               temp.RemoveMinimum();
+               sPackageUsageHeap->Insert(entry, key);
+               entry = temp.PeekMinimum();
+       }
+
+       return 0;
+}
+
+
+static inline bool
+has_cache_expired(Thread* thread)
+{
+       ASSERT(!sSingleCore);
+
+       if (thread_is_idle_thread(thread))
+               return false;
+
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+       ASSERT(schedulerThreadData->previous_core >= 0);
+
+       CoreEntry* coreEntry = 
&sCoreEntries[schedulerThreadData->previous_core];
+       switch (sSchedulerMode) {
+               case SCHEDULER_MODE_PERFORMANCE:
+                       return coreEntry->fActiveTime
+                                       - 
schedulerThreadData->went_sleep_active > kCacheExpire;
+
+               case SCHEDULER_MODE_POWER_SAVING:
+                       return system_time() - schedulerThreadData->went_sleep
+                               > kCacheExpire;
+
+               default:
+                       return true;
+       }
+}
+
+
+void
+scheduler_dump_thread_data(Thread* thread)
+{
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+
+       kprintf("\tpriority_penalty:\t%" B_PRId32 "\n",
+               schedulerThreadData->priority_penalty);
+
+       int32 additionalPenalty = 0;
+       const int kMinimalPriority = get_minimal_priority(thread);
+       if (kMinimalPriority > 0) {
+               additionalPenalty
+                       = schedulerThreadData->additional_penalty % 
kMinimalPriority;
+       }
+       kprintf("\tadditional_penalty:\t%" B_PRId32 " (%" B_PRId32 ")\n",
+               additionalPenalty, schedulerThreadData->additional_penalty);
+       kprintf("\tstolen_time:\t\t%" B_PRId64 "\n",
+               schedulerThreadData->stolen_time);
+       kprintf("\twent_sleep:\t\t%" B_PRId64 "\n",
+               schedulerThreadData->went_sleep);
+       kprintf("\twent_sleep_active:\t%" B_PRId64 "\n",
+               schedulerThreadData->went_sleep_active);
+       kprintf("\tprevious_core:\t\t%" B_PRId32 "\n",
+               schedulerThreadData->previous_core);
+       if (schedulerThreadData->previous_core > 0
+               && has_cache_expired(thread)) {
+               kprintf("\tcache affinity has expired\n");
+       }
+}
+
+
+static void
+update_thread_heaps(int32 core)
+{
+       ASSERT(!sSingleCore);
+
+       CoreEntry* entry = &sCoreEntries[core];
+
+       ASSERT(entry->fCPUBoundThreads >= 0
+               && entry->fCPUBoundThreads <= entry->fThreads);
+       ASSERT(entry->fThreads >= 0
+               && entry->fThreads <= thread_max_threads());
+
+       int32 newKey = entry->fCPUBoundThreads * thread_max_threads();
+       newKey += entry->fThreads;
+
+       int32 oldKey = CoreThreadHeap::GetKey(entry);
+
+       if (oldKey == newKey)
+               return;
+
+       if (newKey > thread_max_threads()) {
+               if (oldKey <= thread_max_threads()) {
+                       sCoreThreadHeap->ModifyKey(entry, -1);
+                       ASSERT(sCoreThreadHeap->PeekMinimum() == entry);
+                       sCoreThreadHeap->RemoveMinimum();
+
+                       sCoreCPUBoundThreadHeap->Insert(entry, newKey);
+               } else
+                       sCoreCPUBoundThreadHeap->ModifyKey(entry, newKey);
+       } else {
+               if (oldKey > thread_max_threads()) {
+                       sCoreCPUBoundThreadHeap->ModifyKey(entry, -1);
+                       ASSERT(sCoreCPUBoundThreadHeap->PeekMinimum() == entry);
+                       sCoreCPUBoundThreadHeap->RemoveMinimum();
+
+                       sCoreThreadHeap->Insert(entry, newKey);
+               } else
+                       sCoreThreadHeap->ModifyKey(entry, newKey);
+       }
+}
+
+
+static inline void
+disable_small_task_packing(void)
+{
+       ASSERT(!sSingleCore);
+
+       ASSERT(sDisableSmallTaskPacking == 0);
+       ASSERT(sSmallTaskCore == sCPUToCore[smp_get_current_cpu()]);
+
+       ASSERT(sAssignedThreads > 0);
+       sDisableSmallTaskPacking = sAssignedThreads * 64;
+       sSmallTaskCore = -1;
+}
+
+
+static inline void
+increase_penalty(Thread* thread)
+{
+       if (thread->priority <= B_LOWEST_ACTIVE_PRIORITY)
+               return;
+       if (thread->priority >= B_FIRST_REAL_TIME_PRIORITY)
+               return;
+
+       TRACE("increasing thread %ld penalty\n", thread->id);
+
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+       int32 oldPenalty = schedulerThreadData->priority_penalty++;
+
+       ASSERT(thread->priority - oldPenalty >= B_LOWEST_ACTIVE_PRIORITY);
+
+       const int kMinimalPriority = get_minimal_priority(thread);
+       if (thread->priority - oldPenalty <= kMinimalPriority) {
+               int32 core = schedulerThreadData->previous_core;
+               ASSERT(core >= 0);
+
+               int32 additionalPenalty = 
schedulerThreadData->additional_penalty;
+               if (additionalPenalty == 0 && !sSingleCore) {
+                       sCPUBoundThreads++;
+                       sCoreEntries[core].fCPUBoundThreads++;
+
+                       update_thread_heaps(core);
+               }
+
+               const int kSmallTaskThreshold = 50;
+               if (additionalPenalty > kSmallTaskThreshold && !sSingleCore) {
+                       if (sSmallTaskCore == core)
+                               disable_small_task_packing();
+               }
+
+               schedulerThreadData->priority_penalty = oldPenalty;
+               schedulerThreadData->additional_penalty++;
+       }
+}
+
+
+static inline void
+cancel_penalty(Thread* thread)
+{
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+
+       if (schedulerThreadData->priority_penalty != 0)
+               TRACE("cancelling thread %ld penalty\n", thread->id);
+
+       switch (sSchedulerMode) {
+               case SCHEDULER_MODE_PERFORMANCE:
+                       schedulerThreadData->additional_penalty = 0;
+                       schedulerThreadData->priority_penalty = 0;
+                       break;
+
+               case SCHEDULER_MODE_POWER_SAVING:
+                       if (schedulerThreadData->additional_penalty != 0)
+                               schedulerThreadData->additional_penalty /= 2;
+                       else if (schedulerThreadData->priority_penalty != 0)
+                               schedulerThreadData->priority_penalty--;
+                       break;
+
+               default:
+                       break;
+       }
+}
+
+
+static inline void
+update_priority_heaps(int32 cpu, int32 priority)
+{
+       int32 core = sCPUToCore[cpu];
+
+       sCPUPriorityHeaps[core].ModifyKey(&sCPUEntries[cpu], priority);
+
+       if (sSingleCore)
+               return;
+
+       int32 maxPriority
+               = CPUHeap::GetKey(sCPUPriorityHeaps[core].PeekMaximum());
+       int32 corePriority = CorePriorityHeap::GetKey(&sCoreEntries[core]);
+
+       if (corePriority != maxPriority) {
+               if (maxPriority == B_IDLE_PRIORITY) {
+                       sCorePriorityHeap->ModifyKey(&sCoreEntries[core], 
B_IDLE_PRIORITY);
+                       ASSERT(sCorePriorityHeap->PeekRoot() == 
&sCoreEntries[core]);
+                       sCorePriorityHeap->RemoveRoot();
+               } else if (corePriority == B_IDLE_PRIORITY)
+                       sCorePriorityHeap->Insert(&sCoreEntries[core], 
maxPriority);
+               else
+                       sCorePriorityHeap->ModifyKey(&sCoreEntries[core], 
maxPriority);
+
+               int32 package = sCPUToPackage[cpu];
+               PackageEntry* packageEntry = &sPackageEntries[package];
+               if (maxPriority == B_IDLE_PRIORITY) {
+                       // core goes idle
+                       ASSERT(packageEntry->fIdleCoreCount >= 0);
+                       ASSERT(packageEntry->fIdleCoreCount < 
packageEntry->fCoreCount);
+
+                       packageEntry->fIdleCoreCount++;
+                       packageEntry->fIdleCores.Add(&sCoreEntries[core]);
+
+                       if (packageEntry->fIdleCoreCount == 1) {
+                               // first core on that package to go idle
+
+                               if (packageEntry->fCoreCount > 1)
+                                       sPackageUsageHeap->Insert(packageEntry, 
1);
+                               else
+                                       sIdlePackageList->Add(packageEntry);
+                       } else if (packageEntry->fIdleCoreCount
+                               == packageEntry->fCoreCount) {
+                               // package goes idle
+                               sPackageUsageHeap->ModifyKey(packageEntry, 0);
+                               ASSERT(sPackageUsageHeap->PeekMinimum() == 
packageEntry);
+                               sPackageUsageHeap->RemoveMinimum();
+
+                               sIdlePackageList->Add(packageEntry);
+                       } else {
+                               sPackageUsageHeap->ModifyKey(packageEntry,
+                                       packageEntry->fIdleCoreCount);
+                       }
+               } else if (corePriority == B_IDLE_PRIORITY) {
+                       // core wakes up
+                       ASSERT(packageEntry->fIdleCoreCount > 0);
+                       ASSERT(packageEntry->fIdleCoreCount <= 
packageEntry->fCoreCount);
+
+                       packageEntry->fIdleCoreCount--;
+                       packageEntry->fIdleCores.Remove(&sCoreEntries[core]);
+
+                       if (packageEntry->fIdleCoreCount + 1 == 
packageEntry->fCoreCount) {
+                               // package wakes up
+                               sIdlePackageList->Remove(packageEntry);
+
+                               if (packageEntry->fIdleCoreCount > 0) {
+                                       sPackageUsageHeap->Insert(packageEntry,
+                                               packageEntry->fIdleCoreCount);
+                               }
+                       } else if (packageEntry->fIdleCoreCount == 0) {
+                               // no more idle cores in the package
+                               sPackageUsageHeap->ModifyKey(packageEntry, 0);
+                               ASSERT(sPackageUsageHeap->PeekMinimum() == 
packageEntry);
+                               sPackageUsageHeap->RemoveMinimum();
+                       } else {
+                               sPackageUsageHeap->ModifyKey(packageEntry,
+                                       packageEntry->fIdleCoreCount);
+                       }
+               }
+       }
+}
+
+
+static int32
+choose_core_performance(Thread* thread)
+{
+       CoreEntry* entry;
+
+       if (sIdlePackageList->Last() != NULL) {
+               // wake new package
+               PackageEntry* package = sIdlePackageList->Last();
+               entry = package->fIdleCores.Last();
+       } else if (sPackageUsageHeap->PeekMaximum() != NULL) {
+               // wake new core
+               PackageEntry* package = sPackageUsageHeap->PeekMaximum();
+               entry = package->fIdleCores.Last();
+       } else {
+               // no idle cores, use least occupied core
+               entry = sCorePriorityHeap->PeekRoot();
+
+               int32 priority = get_effective_priority(thread);
+               if (CorePriorityHeap::GetKey(entry) >= priority) {
+                       entry = sCoreThreadHeap->PeekMinimum();
+                       if (entry == NULL)
+                               entry = sCoreCPUBoundThreadHeap->PeekMinimum();
+               }
+       }
+
+       ASSERT(entry != NULL);
+       return entry->fCoreID;
+}
+
+
+static inline bool
+is_task_small(Thread* thread)
+{
+       int32 priority = get_effective_priority(thread);
+       int32 penalty = thread->scheduler_data->priority_penalty;
+       return penalty == 0 || priority >= B_DISPLAY_PRIORITY;
+}
+
+
+static int32
+choose_core_power_saving(Thread* thread)
+{
+       CoreEntry* entry;
+
+       int32 priority = get_effective_priority(thread);
+
+       if (sDisableSmallTaskPacking > 0)
+               sDisableSmallTaskPacking--;
+
+       if (!sDisableSmallTaskPacking && is_task_small(thread)
+               && sCoreThreadHeap->PeekMaximum() != NULL) {
+               // try to pack all threads on one core
+               if (sSmallTaskCore < 0)
+                       sSmallTaskCore = 
sCoreThreadHeap->PeekMaximum()->fCoreID;
+               entry = &sCoreEntries[sSmallTaskCore];
+       } else if (sCorePriorityHeap->PeekRoot() != NULL
+               && CorePriorityHeap::GetKey(sCorePriorityHeap->PeekRoot())
+                       < priority) {
+               // run immediately on already woken core
+               entry = sCorePriorityHeap->PeekRoot();
+       } else if (sPackageUsageHeap->PeekMinimum() != NULL) {
+               // wake new core
+               PackageEntry* package = sPackageUsageHeap->PeekMinimum();
+               entry = package->fIdleCores.Last();
+       } else if (sIdlePackageList->Last() != NULL) {
+               // wake new package
+               PackageEntry* package = sIdlePackageList->Last();
+               entry = package->fIdleCores.Last();
+       } else {
+               // no idle cores, use least occupied core
+               entry = sCoreThreadHeap->PeekMinimum();
+               if (entry == NULL)
+                       entry = sCoreCPUBoundThreadHeap->PeekMinimum();
+       }
+
+       ASSERT(entry != NULL);
+       return entry->fCoreID;
+}
+
+
+static inline int32
+choose_core(Thread* thread)
+{
+       ASSERT(!sSingleCore);
+       return sChooseCore(thread);
+}
+
+
+static inline int32
+choose_cpu(int32 core)
+{
+       CPUEntry* entry = sCPUPriorityHeaps[core].PeekMinimum();
+       ASSERT(entry != NULL);
+       return entry->fCPUNumber;
+}
+
+
+static bool
+should_rebalance(Thread* thread)
+{
+       ASSERT(!sSingleCore);
+
+       if (thread_is_idle_thread(thread))
+               return false;
+
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+       ASSERT(schedulerThreadData->previous_core >= 0);
+
+       CoreEntry* coreEntry = 
&sCoreEntries[schedulerThreadData->previous_core];
+
+       // If this is a cpu bound thread and we have significantly more such 
threads
+       //  than the average get rid of this one.
+       if (schedulerThreadData->additional_penalty != 0) {
+               int32 averageCPUBound = sCPUBoundThreads / sRunQueueCount;
+               return coreEntry->fCPUBoundThreads - averageCPUBound > 1;
+       }
+
+       // If this thread is not cpu bound but we have at least one consider 
giving
+       // this one to someone less busy.
+       int32 averageThread = sAssignedThreads / sRunQueueCount;
+       if (coreEntry->fCPUBoundThreads > 0) {
+               CoreEntry* other = sCoreThreadHeap->PeekMinimum();
+               if (other != NULL
+                       && CoreThreadHeap::GetKey(other) <= averageThread) {
+                       return true;
+               }
+       }
+
+       int32 threadsAboveAverage = coreEntry->fThreads - averageThread;
+
+       // All cores try to give us small tasks, check whether we have enough.
+       const int kSmallTaskCountThreshold = 5;
+       if (sDisableSmallTaskPacking == 0 && sSmallTaskCore == 
coreEntry->fCoreID) {
+               if (threadsAboveAverage > kSmallTaskCountThreshold) {
+                       if (!is_task_small(thread))
+                               return true;
+               } else if (threadsAboveAverage > 2 * kSmallTaskCountThreshold) {
+                       disable_small_task_packing();
+               }
+       }
+
+       // Try our luck at small task packing.
+       if (sDisableSmallTaskPacking == 0 && is_task_small(thread))
+               return sSmallTaskCore != coreEntry->fCoreID;
+
+       // No cpu bound threads - the situation is quite good. Make sure it
+       // won't get much worse...
+       const int32 kBalanceThreshold = 3;
+       return threadsAboveAverage > kBalanceThreshold;
+}
+
+
+static void
+assign_active_thread_to_core(Thread* thread)
+{
+       if (thread_is_idle_thread(thread) || sSingleCore)
+               return;
+
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+
+       ASSERT(schedulerThreadData->previous_core >= 0);
+       int32 core = schedulerThreadData->previous_core;
+
+       sCoreEntries[core].fThreads++;
+       sAssignedThreads++;
+
+       if (schedulerThreadData->additional_penalty != 0) {
+               sCoreEntries[core].fCPUBoundThreads++;
+               sCPUBoundThreads++;
+       }
+
+       update_thread_heaps(core);
+}
+
+
+static inline void
+thread_goes_away(Thread* thread)
+{
+       if (thread_is_idle_thread(thread))
+               return;
+
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+
+       ASSERT(schedulerThreadData->previous_core >= 0);
+       int32 core = schedulerThreadData->previous_core;
+
+       schedulerThreadData->went_sleep = system_time();
+       schedulerThreadData->went_sleep_active = sCoreEntries[core].fActiveTime;
+
+       if (sSingleCore)
+               return;
+
+       ASSERT(sCoreEntries[core].fThreads > 0);
+       ASSERT(sCoreEntries[core].fThreads > sCoreEntries[core].fCPUBoundThreads
+               || (sCoreEntries[core].fThreads == 
sCoreEntries[core].fCPUBoundThreads
+                       && schedulerThreadData->additional_penalty != 0));
+       sCoreEntries[core].fThreads--;
+       sAssignedThreads--;
+
+       if (schedulerThreadData->additional_penalty != 0) {
+               ASSERT(sCoreEntries[core].fCPUBoundThreads > 0);
+               sCoreEntries[core].fCPUBoundThreads--;
+               sCPUBoundThreads--;
+       }
+
+       update_thread_heaps(core);
+}
+
+
+static void
+enqueue(Thread* thread, bool newOne)
+{
+       ASSERT(thread != NULL);
 
-#include "scheduler_affine.h"
-#include "scheduler_simple.h"
-#include "scheduler_tracing.h"
+       thread->state = thread->next_state = B_THREAD_READY;
 
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
 
-struct scheduler_ops* gScheduler;
-spinlock gSchedulerLock = B_SPINLOCK_INITIALIZER;
-SchedulerListenerList gSchedulerListeners;
+       bigtime_t hasSlept = system_time() - schedulerThreadData->went_sleep;
+       if (newOne && hasSlept > kThreadQuantum)
+               cancel_penalty(thread);
 
-static void (*sRescheduleFunction)(void);
+       int32 threadPriority = get_effective_priority(thread);
 
+       T(EnqueueThread(thread, threadPriority));
 
-static void
-scheduler_reschedule_no_op(void)
+       bool pinned = thread->pinned_to_cpu > 0;
+       int32 targetCPU = -1;
+       int32 targetCore;
+       if (pinned) {
+               targetCPU = thread->previous_cpu->cpu_num;
+               targetCore = sCPUToCore[targetCPU];
+               ASSERT(targetCore == schedulerThreadData->previous_core);
+
+               if (newOne)
+                       assign_active_thread_to_core(thread);
+       } else if (sSingleCore) {
+               targetCore = 0;
+               targetCPU = choose_cpu(targetCore);
+
+               schedulerThreadData->previous_core = targetCore;
+       } else if (schedulerThreadData->previous_core < 0
+               || (newOne && has_cache_expired(thread))
+               || should_rebalance(thread)) {
+
+               if (thread_is_idle_thread(thread)) {
+                       targetCPU = thread->previous_cpu->cpu_num;
+                       targetCore = sCPUToCore[targetCPU];
+               } else {
+                       if (!newOne)
+                               thread_goes_away(thread);
+
+                       targetCore = choose_core(thread);
+                       targetCPU = choose_cpu(targetCore);
+               }
+
+               schedulerThreadData->previous_core = targetCore;
+               assign_active_thread_to_core(thread);
+       } else {
+               targetCore = schedulerThreadData->previous_core;
+               targetCPU = choose_cpu(targetCore);
+               if (newOne)
+                       assign_active_thread_to_core(thread);
+       }
+
+       ASSERT(targetCore >= 0 && targetCore < sRunQueueCount);
+       ASSERT(targetCPU >= 0 && targetCPU < smp_get_num_cpus());
+
+       TRACE("enqueueing thread %ld with priority %ld %ld\n", thread->id,
+               threadPriority, targetCore);
+       if (pinned)
+               sPinnedRunQueues[targetCPU].PushBack(thread, threadPriority);
+       else
+               sRunQueues[targetCore].PushBack(thread, threadPriority);
+
+       schedulerThreadData->cpu_bound = true;
+       schedulerThreadData->time_left = 0;
+       schedulerThreadData->stolen_time = 0;
+
+       // notify listeners
+       NotifySchedulerListeners(&SchedulerListener::ThreadEnqueuedInRunQueue,
+               thread);
+
+       Thread* targetThread = gCPU[targetCPU].running_thread;
+       int32 targetPriority = get_effective_priority(targetThread);
+
+       TRACE("choosing CPU %ld with current priority %ld\n", targetCPU,
+               targetPriority);
+
+       if (threadPriority > targetPriority) {
+               targetThread->scheduler_data->lost_cpu = true;
+
+               // It is possible that another CPU schedules the thread before 
the
+               // target CPU. However, since the target CPU is sent an ICI it 
will
+               // reschedule anyway and update its heap key to the correct 
value.
+               update_priority_heaps(targetCPU, threadPriority);
+
+               if (targetCPU == smp_get_current_cpu())
+                       gCPU[targetCPU].invoke_scheduler = true;
+               else {
+                       smp_send_ici(targetCPU, SMP_MSG_RESCHEDULE, 0, 0, 0, 
NULL,
+                               SMP_MSG_FLAG_ASYNC);
+               }
+       }
+}
+
+
+/*!    Enqueues the thread into the run queue.
+       Note: thread lock must be held when entering this function
+*/
+void
+scheduler_enqueue_in_run_queue(Thread *thread)
 {
-       Thread* thread = thread_get_current_thread();
-       if (thread != NULL && thread->next_state != B_THREAD_READY)
-               panic("scheduler_reschedule_no_op() called in non-ready 
thread");
+       TRACE("enqueueing new thread %ld with static priority %ld\n", 
thread->id,
+               thread->priority);
+       enqueue(thread, true);
 }
 
 
-// #pragma mark - SchedulerListener
+static inline void
+put_back(Thread* thread)
+{
+       bool pinned = sPinnedRunQueues != NULL && thread->pinned_to_cpu > 0;
 
+       if (pinned) {
+               int32 pinnedCPU = thread->previous_cpu->cpu_num;
+               sPinnedRunQueues[pinnedCPU].PushFront(thread,
+                       get_effective_priority(thread));
+       } else {
+               int32 previousCore = thread->scheduler_data->previous_core;
+               ASSERT(previousCore >= 0);
+               sRunQueues[previousCore].PushFront(thread,
+                       get_effective_priority(thread));
+       }
+}
 
-SchedulerListener::~SchedulerListener()
+
+/*!    Sets the priority of a thread.
+       Note: thread lock must be held when entering this function
+*/
+void
+scheduler_set_thread_priority(Thread *thread, int32 priority)
+{
+       if (priority == thread->priority)
+               return;
+
+       TRACE("changing thread %ld priority to %ld (old: %ld, effective: 
%ld)\n",
+               thread->id, priority, thread->priority,
+               get_effective_priority(thread));
+
+       if (thread->state == B_THREAD_RUNNING)
+               thread_goes_away(thread);
+
+       if (thread->state != B_THREAD_READY) {
+               cancel_penalty(thread);
+               thread->priority = priority;
+
+               if (thread->state == B_THREAD_RUNNING) {
+                       assign_active_thread_to_core(thread);
+                       update_priority_heaps(thread->cpu->cpu_num, priority);
+               }
+               return;
+       }
+
+       // The thread is in the run queue. We need to remove it and re-insert 
it at
+       // a new position.
+
+       T(RemoveThread(thread));
+
+       // notify listeners
+       NotifySchedulerListeners(&SchedulerListener::ThreadRemovedFromRunQueue,
+               thread);
+
+       // remove thread from run queue
+       int32 previousCore = thread->scheduler_data->previous_core;
+       ASSERT(previousCore >= 0);
+       sRunQueues[previousCore].Remove(thread);
+       thread_goes_away(thread);
+
+       // set priority and re-insert
+       cancel_penalty(thread);
+       thread->priority = priority;
+
+       scheduler_enqueue_in_run_queue(thread);
+}
+
+
+static int32
+reschedule_event(timer *unused)
 {
+       // This function is called as a result of the timer event set by the
+       // scheduler. Make sure the reschedule() is invoked.
+       Thread* thread= thread_get_current_thread();
+
+       thread->scheduler_data->lost_cpu = true;
+       thread->cpu->invoke_scheduler = true;
+       thread->cpu->preempted = 1;
+       return B_HANDLED_INTERRUPT;
 }
 
 
-// #pragma mark - kernel private
+static inline bool 
+quantum_ended(Thread* thread, bool wasPreempted, bool hasYielded)
+{
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
 
+       if (hasYielded) {
+               schedulerThreadData->time_left = 0;
+               return true;
+       }
 
-/*!    Add the given scheduler listener. Thread lock must be held.
+       bigtime_t time_used = system_time() - schedulerThreadData->quantum_start;
+       schedulerThreadData->time_left -= time_used;
+       schedulerThreadData->time_left = max_c(0, schedulerThreadData->time_left);
+
+       // too little time left; better to make the next quantum a bit longer
+       if (wasPreempted || schedulerThreadData->time_left <= kThreadQuantum / 50) {
+               schedulerThreadData->stolen_time += schedulerThreadData->time_left;
+               schedulerThreadData->time_left = 0;
+       }
+
+       return schedulerThreadData->time_left == 0;
+}
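
quantum_ended() charges the time used since quantum_start against the thread's
remaining quantum; if the thread was preempted, or if less than
kThreadQuantum / 50 of a quantum is left, the remainder is banked in
stolen_time and the quantum counts as exhausted. As a rough illustration only,
assuming kThreadQuantum keeps the 1000 µs value used by the removed affine
scheduler below: a thread that runs its slice down to 15 µs (under the 20 µs
threshold) banks those 15 µs instead of being handed another uselessly short
turn.
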
+
+
+static inline bigtime_t
+quantum_linear_interpolation(bigtime_t maxQuantum, bigtime_t minQuantum,
+       int32 maxPriority, int32 minPriority, int32 priority)
+{
+       ASSERT(priority <= maxPriority);
+       ASSERT(priority >= minPriority);
+
+       bigtime_t result = (maxQuantum - minQuantum) * (priority - minPriority);
+       result /= maxPriority - minPriority;
+       return maxQuantum - result;
+}
+
+
+static inline bigtime_t
+get_base_quantum(Thread* thread)
+{
+       int32 priority = get_effective_priority(thread);
+
+       if (priority >= B_URGENT_DISPLAY_PRIORITY)
+               return kThreadQuantum;
+       if (priority > B_NORMAL_PRIORITY) {
+               return quantum_linear_interpolation(kThreadQuantum * 4,
+                       kThreadQuantum, B_URGENT_DISPLAY_PRIORITY, B_NORMAL_PRIORITY,
+                       priority);
+       }
+       return quantum_linear_interpolation(kThreadQuantum * 64,
+               kThreadQuantum * 4, B_NORMAL_PRIORITY, B_IDLE_PRIORITY, priority);
+}
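
Together, quantum_linear_interpolation() and get_base_quantum() hand lower
effective priorities longer base quanta. Using the standard Haiku priority
values (B_IDLE_PRIORITY 0, B_NORMAL_PRIORITY 10, B_URGENT_DISPLAY_PRIORITY 20)
and writing q for kThreadQuantum, the interpolation works out as follows
(illustrative arithmetic, not part of the commit):

    priority 20 and above: q
    priority 15:           4q - (4q - q) * (15 - 10) / (20 - 10) = 2.5q
    priority 10:           4q
    priority  5:           64q - (64q - 4q) * (5 - 0) / (10 - 0) = 34q
    priority  0:           64q
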
+
+
+static inline bigtime_t
+compute_quantum(Thread* thread)
+{
+       scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
+
+       bigtime_t quantum;
+       if (schedulerThreadData->time_left != 0)
+               quantum = schedulerThreadData->time_left;
+       else
+               quantum = get_base_quantum(thread);
+
+       quantum += schedulerThreadData->stolen_time;
+       schedulerThreadData->stolen_time = 0;
+
+       schedulerThreadData->time_left = quantum;
+       schedulerThreadData->quantum_start = system_time();
+
+       return quantum;
+}
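
compute_quantum() then gives a thread that still has time_left exactly that
remainder, otherwise a fresh base quantum, and in both cases adds back any
stolen_time banked by quantum_ended(). For example (illustrative numbers only):
a thread preempted with 300 µs of its slice remaining ends up with
time_left == 0 and stolen_time == 300, so its next slice becomes
get_base_quantum(thread) + 300 µs.
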
+
+
+static inline Thread*
+dequeue_thread(int32 thisCPU)
+{
+       int32 thisCore = sCPUToCore[thisCPU];
+       Thread* sharedThread = sRunQueues[thisCore].PeekMaximum();
+
+       Thread* pinnedThread = NULL;
+       if (sPinnedRunQueues != NULL)
+               pinnedThread = sPinnedRunQueues[thisCPU].PeekMaximum();
+
+       if (sharedThread == NULL && pinnedThread == NULL)
+               return NULL;
+
+       int32 pinnedPriority = -1;
+       if (pinnedThread != NULL)
+               pinnedPriority = get_effective_priority(pinnedThread);
+
+       int32 sharedPriority = -1;
+       if (sharedThread != NULL)
+               sharedPriority = get_effective_priority(sharedThread);
+
+       if (sharedPriority > pinnedPriority) {
+               sRunQueues[thisCore].Remove(sharedThread);
+               return sharedThread;
+       }
+
+       sPinnedRunQueues[thisCPU].Remove(pinnedThread);
+       return pinnedThread;
+}
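
dequeue_thread() compares the best thread of this core's shared run queue with
the best thread of this CPU's pinned queue and removes whichever wins. The
comparison is strict: if, say, both candidates have an effective priority of
10, the pinned thread is chosen, since the shared thread only wins on a
strictly higher priority.
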
+
+
+static inline void
+track_cpu_activity(Thread* oldThread, Thread* nextThread, int32 thisCore)
+{
+       if (!thread_is_idle_thread(oldThread)) {
+               bigtime_t active
+                       = (oldThread->kernel_time - oldThread->cpu->last_kernel_time)
+                               + (oldThread->user_time - oldThread->cpu->last_user_time);
+
+               atomic_add64(&oldThread->cpu->active_time, active);
+               sCoreEntries[thisCore].fActiveTime += active;
+       }
+
+       if (!thread_is_idle_thread(nextThread)) {
+               oldThread->cpu->last_kernel_time = nextThread->kernel_time;
+               oldThread->cpu->last_user_time = nextThread->user_time;
+       }
+}
+
+
+static void
+_scheduler_reschedule(void)
+{
+       Thread* oldThread = thread_get_current_thread();
+
+       int32 thisCPU = smp_get_current_cpu();
+       int32 thisCore = sCPUToCore[thisCPU];
+
+       TRACE("reschedule(): cpu %ld, current thread = %ld\n", thisCPU,
+               oldThread->id);
+
+       oldThread->state = oldThread->next_state;
+       scheduler_thread_data* schedulerOldThreadData = oldThread->scheduler_data;
+
+       // update the CPU heap so that a CPU will be properly chosen for the old thread
+       Thread* nextThread = sRunQueues[thisCore].PeekMaximum();
+       if (nextThread != NULL) {
+               update_priority_heaps(thisCPU,
+                       get_effective_priority(nextThread));
+       }
+
+       switch (oldThread->next_state) {
+               case B_THREAD_RUNNING:
+               case B_THREAD_READY:
+                       if (!schedulerOldThreadData->lost_cpu)
+                               schedulerOldThreadData->cpu_bound = false;
+
+                       if (quantum_ended(oldThread, oldThread->cpu->preempted,
+                                       oldThread->has_yielded)) {
+                               if (schedulerOldThreadData->cpu_bound)
+                                       increase_penalty(oldThread);
+
+                               TRACE("enqueueing thread %ld into run queue priority = %ld\n",
+                                       oldThread->id, get_effective_priority(oldThread));
+                               enqueue(oldThread, false);
+                       } else {
+                               TRACE("putting thread %ld back in run queue priority = %ld\n",
+                                       oldThread->id, get_effective_priority(oldThread));
+                               put_back(oldThread);
+                       }
+
+                       break;
+               case B_THREAD_SUSPENDED:
+                       thread_goes_away(oldThread);
+                       TRACE("reschedule(): suspending thread %ld\n", oldThread->id);
+                       break;
+               case THREAD_STATE_FREE_ON_RESCHED:
+                       thread_goes_away(oldThread);
+                       break;
+               default:
+                       thread_goes_away(oldThread);
+                       TRACE("not enqueueing thread %ld into run queue next_state = %ld\n",
+                               oldThread->id, oldThread->next_state);
+                       break;
+       }
+
+       oldThread->has_yielded = false;
+       schedulerOldThreadData->lost_cpu = false;
+
+       // select the thread with the highest priority
+       if (oldThread->cpu->disabled) {
+               ASSERT(sPinnedRunQueues != NULL);
+               nextThread = sPinnedRunQueues[thisCPU].PeekMaximum();
+               if (nextThread != NULL)
+                       sPinnedRunQueues[thisCPU].Remove(nextThread);
+               else {
+                       nextThread = sRunQueues[thisCore].GetHead(B_IDLE_PRIORITY);
+                       if (nextThread != NULL)
+                               sRunQueues[thisCore].Remove(nextThread);
+               }
+       } else
+               nextThread = dequeue_thread(thisCPU);
+       if (!nextThread)
+               panic("reschedule(): run queues are empty!\n");
+
+       TRACE("reschedule(): cpu %ld, next thread = %ld\n", thisCPU,
+               nextThread->id);
+
+       T(ScheduleThread(nextThread, oldThread));
+
+       // notify listeners
+       NotifySchedulerListeners(&SchedulerListener::ThreadScheduled,
+               oldThread, nextThread);
+
+       // update CPU heap
+       update_priority_heaps(thisCPU,
+               get_effective_priority(nextThread));
+
+       nextThread->state = B_THREAD_RUNNING;
+       nextThread->next_state = B_THREAD_READY;
+       ASSERT(nextThread->scheduler_data->previous_core == thisCore);
+       //nextThread->scheduler_data->previous_core = thisCore;
+
+       // track kernel time (user time is tracked in thread_at_kernel_entry())
+       scheduler_update_thread_times(oldThread, nextThread);
+
+       // track CPU activity
+       track_cpu_activity(oldThread, nextThread, thisCore);
+
+       if (nextThread != oldThread || oldThread->cpu->preempted) {
+               timer* quantumTimer = &oldThread->cpu->quantum_timer;
+               if (!oldThread->cpu->preempted)
+                       cancel_timer(quantumTimer);
+
+               oldThread->cpu->preempted = 0;
+               if (!thread_is_idle_thread(nextThread)) {
+                       bigtime_t quantum = compute_quantum(oldThread);
+                       add_timer(quantumTimer, &reschedule_event, quantum,
+                               B_ONE_SHOT_RELATIVE_TIMER | B_TIMER_ACQUIRE_SCHEDULER_LOCK);
+               }
+
+               if (nextThread != oldThread)
+                       scheduler_switch_thread(oldThread, nextThread);
+       }
+}
+
+
+/*!    Runs the scheduler.
+       Note: expects thread spinlock to be held
 */
 void
-scheduler_add_listener(struct SchedulerListener* listener)
+scheduler_reschedule(void)
 {
-       gSchedulerListeners.Add(listener);
+       if (!sSchedulerEnabled) {
+               Thread* thread = thread_get_current_thread();
+               if (thread != NULL && thread->next_state != B_THREAD_READY)
+                       panic("scheduler_reschedule_no_op() called in non-ready thread");
+               return;
+       }
+
+       _scheduler_reschedule();
 }
 
 
-/*!    Remove the given scheduler listener. Thread lock must be held.
+status_t
+scheduler_on_thread_create(Thread* thread, bool idleThread)
+{
+       thread->scheduler_data = new(std::nothrow) scheduler_thread_data;
+       if (thread->scheduler_data == NULL)
+               return B_NO_MEMORY;
+       return B_OK;
+}
+
+
+void
+scheduler_on_thread_init(Thread* thread)
+{
+       thread->scheduler_data->Init();
+}
+
+
+void
+scheduler_on_thread_destroy(Thread* thread)
+{
+       delete thread->scheduler_data;
+}
+
+
+/*!    This starts the scheduler. Must be run in the context of the initial idle
+       thread. Interrupts must be disabled and will be disabled when returning.
 */
 void
-scheduler_remove_listener(struct SchedulerListener* listener)
+scheduler_start(void)
 {
-       gSchedulerListeners.Remove(listener);
+       SpinLocker schedulerLocker(gSchedulerLock);
+
+       _scheduler_reschedule();
 }
 
 
-static bool
-should_use_affine_scheduler(int32 cpuCount)
+static status_t
+set_operation_mode(scheduler_mode mode)
 {
-       if (cpuCount < 2)
-               return false;
+       if (mode != SCHEDULER_MODE_PERFORMANCE
+               && mode != SCHEDULER_MODE_POWER_SAVING) {
+               return B_BAD_VALUE;
+       }
 
-       for (int32 i = 1; i < cpuCount; i++) {
-               for (int32 j = 0; j < gCPUCacheLevelCount; j++) {
-                       if (gCPU[i].cache_id[j] != gCPU[i - 1].cache_id[j])
-                               return true;
+#ifdef TRACE_SCHEDULER
+       const char* modeNames[] = { "performance", "power saving" };
+#endif
+       TRACE("switching scheduler to %s mode\n", modeNames[mode]);
+
+       sSchedulerMode = mode;
+       switch (mode) {
+               case SCHEDULER_MODE_PERFORMANCE:
+                       sDisableSmallTaskPacking = -1;
+                       sSmallTaskCore = -1;
+                       sChooseCore = choose_core_performance;
+                       break;
+
+               case SCHEDULER_MODE_POWER_SAVING:
+                       sDisableSmallTaskPacking = 0;
+                       sSmallTaskCore = -1;
+                       sChooseCore = choose_core_power_saving;
+                       break;
+
+               default:
+                       break;
+       }
+
+       return B_OK;
+}
+
+
+static void
+traverse_topology_tree(cpu_topology_node* node, int packageID, int coreID)
+{
+       switch (node->level) {
+               case CPU_TOPOLOGY_SMT:
+                       sCPUToCore[node->id] = coreID;
+                       sCPUToPackage[node->id] = packageID;
+                       return;
+
+               case CPU_TOPOLOGY_CORE:
+                       coreID = node->id;
+                       break;
+
+               case CPU_TOPOLOGY_PACKAGE:
+                       packageID = node->id;
+                       break;
+
+               default:
+                       break;
+       }
+
+       for (int32 i = 0; i < node->children_count; i++)
+               traverse_topology_tree(node->children[i], packageID, coreID);
+}
+
+
+static status_t
+build_topology_mappings(int32& cpuCount, int32& coreCount, int32& packageCount)
+{
+       cpuCount = smp_get_num_cpus();
+
+       sCPUToCore = new(std::nothrow) int32[cpuCount];
+       if (sCPUToCore == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<int32> cpuToCoreDeleter(sCPUToCore);
+
+       sCPUToPackage = new(std::nothrow) int32[cpuCount];
+       if (sCPUToPackage == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<int32> cpuToPackageDeleter(sCPUToPackage);
+
+       coreCount = 0;
+       for (int32 i = 0; i < cpuCount; i++) {
+               if (gCPU[i].topology_id[CPU_TOPOLOGY_SMT] == 0)
+                       coreCount++;
+       }
+
+       packageCount = 0;
+       for (int32 i = 0; i < cpuCount; i++) {
+               if (gCPU[i].topology_id[CPU_TOPOLOGY_SMT] == 0
+                       && gCPU[i].topology_id[CPU_TOPOLOGY_CORE] == 0) {
+                       packageCount++;
                }
        }
 
-       return false;
+       cpu_topology_node* root = get_cpu_topology();
+       traverse_topology_tree(root, 0, 0);
+
+       cpuToCoreDeleter.Detach();
+       cpuToPackageDeleter.Detach();
+       return B_OK;
+}
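
As a purely hypothetical illustration of what build_topology_mappings()
produces: on a single package containing two cores with two SMT threads each,
enumerated as CPUs 0-3, the SMT leaves of the topology tree would yield
sCPUToCore = { 0, 0, 1, 1 } and sCPUToPackage = { 0, 0, 0, 0 }, the counting
loops would report coreCount == 2 (CPUs 0 and 2 have
topology_id[CPU_TOPOLOGY_SMT] == 0) and packageCount == 1 (only CPU 0 also has
topology_id[CPU_TOPOLOGY_CORE] == 0).
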
+
+
+static status_t
+_scheduler_init()
+{
+       // create logical processor to core and package mappings
+       int32 cpuCount, coreCount, packageCount;
+       status_t result = build_topology_mappings(cpuCount, coreCount,
+               packageCount);
+       if (result != B_OK)
+               return result;
+       sRunQueueCount = coreCount;
+       sSingleCore = coreCount == 1;
+
+       // create package heap and idle package stack
+       sPackageEntries = new(std::nothrow) PackageEntry[packageCount];
+       if (sPackageEntries == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<PackageEntry> packageEntriesDeleter(sPackageEntries);
+
+       sPackageUsageHeap = new(std::nothrow) PackageHeap(packageCount);
+       if (sPackageUsageHeap == NULL)
+               return B_NO_MEMORY;
+       ObjectDeleter<PackageHeap> packageHeapDeleter(sPackageUsageHeap);
+
+       sIdlePackageList = new(std::nothrow) IdlePackageList;
+       if (sIdlePackageList == NULL)
+               return B_NO_MEMORY;
+       ObjectDeleter<IdlePackageList> packageListDeleter(sIdlePackageList);
+
+       for (int32 i = 0; i < packageCount; i++) {
+               sPackageEntries[i].fPackageID = i;
+               sPackageEntries[i].fIdleCoreCount = coreCount / packageCount;
+               sPackageEntries[i].fCoreCount = coreCount / packageCount;
+               sIdlePackageList->Insert(&sPackageEntries[i]);
+       }
+
+       // create logical processor and core heaps
+       sCPUEntries = new CPUEntry[cpuCount];
+       if (sCPUEntries == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<CPUEntry> cpuEntriesDeleter(sCPUEntries);
+
+       sCoreEntries = new CoreEntry[coreCount];
+       if (sCoreEntries == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<CoreEntry> coreEntriesDeleter(sCoreEntries);
+
+       sCorePriorityHeap = new CorePriorityHeap(coreCount);
+       if (sCorePriorityHeap == NULL)
+               return B_NO_MEMORY;
+       ObjectDeleter<CorePriorityHeap> corePriorityHeapDeleter(sCorePriorityHeap);
+
+       sCoreThreadHeap = new CoreThreadHeap;
+       if (sCoreThreadHeap == NULL)
+               return B_NO_MEMORY;
+       ObjectDeleter<CoreThreadHeap> coreThreadHeapDeleter(sCoreThreadHeap);
+
+       sCoreCPUBoundThreadHeap = new CoreThreadHeap(coreCount);
+       if (sCoreCPUBoundThreadHeap == NULL)
+               return B_NO_MEMORY;
+       ObjectDeleter<CoreThreadHeap> coreCPUThreadHeapDeleter(
+               sCoreCPUBoundThreadHeap);
+
+       for (int32 i = 0; i < coreCount; i++) {
+               sCoreEntries[i].fCoreID = i;
+               sCoreEntries[i].fActiveTime = 0;
+               sCoreEntries[i].fThreads = 0;
+               sCoreEntries[i].fCPUBoundThreads = 0;
+
+               status_t result = sCoreThreadHeap->Insert(&sCoreEntries[i], 0);
+               if (result != B_OK)
+                       return result;
+
+               result = sCorePriorityHeap->Insert(&sCoreEntries[i], B_IDLE_PRIORITY);
+               if (result != B_OK)
+                       return result;
+               sCorePriorityHeap->RemoveRoot();
+       }
+
+       sCPUPriorityHeaps = new CPUHeap[coreCount];
+       if (sCPUPriorityHeaps == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<CPUHeap> cpuPriorityHeapDeleter(sCPUPriorityHeaps);
+
+       for (int32 i = 0; i < cpuCount; i++) {
+               sCPUEntries[i].fCPUNumber = i;
+               int32 core = sCPUToCore[i];
+
+               int32 package = sCPUToPackage[i];
+               if (sCPUPriorityHeaps[core].PeekMaximum() == NULL)
+                       sPackageEntries[package].fIdleCores.Insert(&sCoreEntries[core]);
+
+               status_t result
+                       = sCPUPriorityHeaps[core].Insert(&sCPUEntries[i], B_IDLE_PRIORITY);
+               if (result != B_OK)
+                       return result;
+       }
+
+       // create per-logical processor run queues for pinned threads
+       TRACE("scheduler_init(): creating %" B_PRId32 " per-cpu queue%s\n",
+               cpuCount, cpuCount != 1 ? "s" : "");
+
+       sPinnedRunQueues = new(std::nothrow) ThreadRunQueue[cpuCount];
+       if (sPinnedRunQueues == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<ThreadRunQueue> pinnedRunQueuesDeleter(sPinnedRunQueues);
+       for (int i = 0; i < cpuCount; i++) {
+               status_t result = sPinnedRunQueues[i].GetInitStatus();
+               if (result != B_OK)
+                       return result;
+       }
+
+       // create per-core run queues
+       TRACE("scheduler_init(): creating %" B_PRId32 " per-core queue%s\n",
+               coreCount, coreCount != 1 ? "s" : "");
+
+       sRunQueues = new(std::nothrow) ThreadRunQueue[coreCount];
+       if (sRunQueues == NULL)
+               return B_NO_MEMORY;
+       ArrayDeleter<ThreadRunQueue> runQueuesDeleter(sRunQueues);
+       for (int i = 0; i < coreCount; i++) {
+               status_t result = sRunQueues[i].GetInitStatus();
+               if (result != B_OK)
+                       return result;
+       }
+
+#if 1
+       set_operation_mode(SCHEDULER_MODE_POWER_SAVING);
+#else
+       set_operation_mode(SCHEDULER_MODE_PERFORMANCE);
+#endif
+
+       add_debugger_command_etc("run_queue", &dump_run_queue,
+               "List threads in run queue", "\nLists threads in run queue", 0);
+       add_debugger_command_etc("cpu_heap", &dump_cpu_heap,
+               "List CPUs in CPU priority heap", "\nList CPUs in CPU priority heap",
+               0);
+       if (!sSingleCore) {
+               add_debugger_command_etc("idle_cores", &dump_idle_cores,
+                       "List idle cores", "\nList idle cores", 0);
+       }
+
+       runQueuesDeleter.Detach();
+       pinnedRunQueuesDeleter.Detach();
+       coreCPUThreadHeapDeleter.Detach();
+       coreThreadHeapDeleter.Detach();
+       corePriorityHeapDeleter.Detach();
+       cpuPriorityHeapDeleter.Detach();
+       coreEntriesDeleter.Detach();
+       cpuEntriesDeleter.Detach();
+       packageEntriesDeleter.Detach();
+       packageHeapDeleter.Detach();
+       packageListDeleter.Detach();
+       return B_OK;
 }
 
 
@@ -84,23 +1665,10 @@ scheduler_init(void)
                " cache level%s\n", cpuCount, cpuCount != 1 ? "s" : "",
                gCPUCacheLevelCount, gCPUCacheLevelCount != 1 ? "s" : "");
 
-       status_t result;
-       if (should_use_affine_scheduler(cpuCount)) {
-               dprintf("scheduler_init: using affine scheduler\n");
-               result = scheduler_affine_init();
-       } else {
-               dprintf("scheduler_init: using simple scheduler\n");
-               result = scheduler_simple_init();
-       }
-
+       status_t result = _scheduler_init();
        if (result != B_OK)
                panic("scheduler_init: failed to initialize scheduler\n");
 
-       // Disable rescheduling until the basic kernel initialization is done and
-       // CPUs are ready to enable interrupts.
-       sRescheduleFunction = gScheduler->reschedule;
-       gScheduler->reschedule = scheduler_reschedule_no_op;
-
 #if SCHEDULER_TRACING
        add_debugger_command_etc("scheduler", &cmd_scheduler,
                "Analyze scheduler tracing information",
@@ -114,7 +1682,36 @@ scheduler_init(void)
 void
 scheduler_enable_scheduling(void)
 {
-       gScheduler->reschedule = sRescheduleFunction;
+       sSchedulerEnabled = true;
+}
+
+
+// #pragma mark - SchedulerListener
+
+
+SchedulerListener::~SchedulerListener()
+{
+}
+
+
+// #pragma mark - kernel private
+
+
+/*!    Add the given scheduler listener. Thread lock must be held.
+*/
+void
+scheduler_add_listener(struct SchedulerListener* listener)
+{
+       gSchedulerListeners.Add(listener);
+}
+
+
+/*!    Remove the given scheduler listener. Thread lock must be held.
+*/
+void
+scheduler_remove_listener(struct SchedulerListener* listener)
+{
+       gSchedulerListeners.Remove(listener);
 }
 
 
@@ -138,7 +1735,16 @@ _user_estimate_max_scheduling_latency(thread_id id)
        }
        BReference<Thread> threadReference(thread, true);
 
-       // ask the scheduler for the thread's latency
-       InterruptsSpinLocker locker(gSchedulerLock);
-       return gScheduler->estimate_max_scheduling_latency(thread);
+       // TODO: This is probably meant to be called periodically to return the
+       // current estimate depending on the system usage; we return fixed estimates
+       // per thread priority, though.
+
+       if (thread->priority >= B_REAL_TIME_DISPLAY_PRIORITY)
+               return kMinThreadQuantum / 4;
+       if (thread->priority >= B_DISPLAY_PRIORITY)
+               return kMinThreadQuantum;
+       if (thread->priority < B_NORMAL_PRIORITY)
+               return 2 * kMaxThreadQuantum;
+
+       return 2 * kMinThreadQuantum;
 }
diff --git a/src/system/kernel/scheduler/scheduler_affine.cpp b/src/system/kernel/scheduler/scheduler_affine.cpp
deleted file mode 100644
index a0fb24e..0000000
--- a/src/system/kernel/scheduler/scheduler_affine.cpp
+++ /dev/null
@@ -1,1676 +0,0 @@
-/*
- * Copyright 2013, Paweł Dziepak, pdziepak@xxxxxxxxxxx.
- * Copyright 2009, Rene Gollent, rene@xxxxxxxxxxx.
- * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@xxxxxx.
- * Copyright 2002-2010, Axel Dörfler, axeld@xxxxxxxxxxxxxxxx.
- * Copyright 2002, Angelo Mottola, a.mottola@xxxxxxxxx.
- * Distributed under the terms of the MIT License.
- *
- * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
- * Distributed under the terms of the NewOS License.
- */
-
-
-/*! The thread scheduler */
-
-
-#include <OS.h>
-
-#include <AutoDeleter.h>
-#include <cpu.h>
-#include <debug.h>
-#include <int.h>
-#include <kernel.h>
-#include <kscheduler.h>
-#include <listeners.h>
-#include <scheduler_defs.h>
-#include <smp.h>
-#include <thread.h>
-#include <timer.h>
-#include <util/Heap.h>
-#include <util/MinMaxHeap.h>
-#include <util/Random.h>
-
-#include "RunQueue.h"
-#include "scheduler_common.h"
-#include "scheduler_tracing.h"
-
-
-//#define TRACE_SCHEDULER
-#ifdef TRACE_SCHEDULER
-#      define TRACE(...) dprintf_no_syslog(__VA_ARGS__)
-#else
-#      define TRACE(...) do { } while (false)
-#endif
-
-
-const bigtime_t kThreadQuantum = 1000;
-const bigtime_t kMinThreadQuantum = 3000;
-const bigtime_t kMaxThreadQuantum = 10000;
-
-const bigtime_t kCacheExpire = 100000;
-
-static int sDisableSmallTaskPacking;
-static int32 sSmallTaskCore;
-
-static bool sSingleCore;
-
-static scheduler_mode sSchedulerMode;
-
-static int32 (*sChooseCore)(Thread* thread);
-
-
-// Heaps in sCPUPriorityHeaps are used for load balancing on a core the logical
-// processors in the heap belong to. Since there are no cache affinity issues
-// at this level and the run queue is shared among all logical processors on
-// the core the only real concern is to make lower priority threads give way to
-// the higher priority threads.
-struct CPUEntry : public MinMaxHeapLinkImpl<CPUEntry, int32> {
-       int32           fCPUNumber;
-};
-typedef MinMaxHeap<CPUEntry, int32> AffineCPUHeap;
-static CPUEntry* sCPUEntries;
-static AffineCPUHeap* sCPUPriorityHeaps;
-
-struct CoreEntry : public DoublyLinkedListLinkImpl<CoreEntry> {
-       HeapLink<CoreEntry, int32>      fPriorityHeapLink;
-       MinMaxHeapLink<CoreEntry, int32>        fThreadHeapLink;
-
-       int32           fCoreID;
-
-       bigtime_t       fActiveTime;
-
-       int32           fCPUBoundThreads;
-       int32           fThreads;
-};
-
-static CoreEntry* sCoreEntries;
-typedef Heap<CoreEntry, int32, HeapLesserCompare<int32>,
-               HeapMemberGetLink<CoreEntry, int32, 
&CoreEntry::fPriorityHeapLink> >
-       AffineCorePriorityHeap;
-static AffineCorePriorityHeap* sCorePriorityHeap;
-
-typedef MinMaxHeap<CoreEntry, int32, MinMaxHeapCompare<int32>,
-               MinMaxHeapMemberGetLink<CoreEntry, int32, 
&CoreEntry::fThreadHeapLink> >
-       AffineCoreThreadHeap;
-static AffineCoreThreadHeap* sCoreThreadHeap;
-static AffineCoreThreadHeap* sCoreCPUBoundThreadHeap;
-
-static int32 sCPUBoundThreads;
-static int32 sAssignedThreads;
-
-// sPackageUsageHeap is used to decide which core should be woken up from the
-// idle state. When aiming for performance we should use as many packages as
-// possible with as little cores active in each package as possible (so that the
-// package can enter any boost mode if it has one and the active core have more
-// of the shared cache for themselves. If power saving is the main priority we
-// should keep active cores on as little packages as possible (so that other
-// packages can go to the deep state of sleep). The heap stores only packages
-// with at least one core active and one core idle. The packages with all cores
-// idle are stored in sPackageIdleList (in LIFO manner).
-struct PackageEntry : public MinMaxHeapLinkImpl<PackageEntry, int32>,
-       DoublyLinkedListLinkImpl<PackageEntry> {
-       int32                                           fPackageID;
-
-       DoublyLinkedList<CoreEntry>     fIdleCores;
-       int32                                           fIdleCoreCount;
-
-       int32                                           fCoreCount;
-};
-typedef MinMaxHeap<PackageEntry, int32> AffinePackageHeap;
-typedef DoublyLinkedList<PackageEntry> AffineIdlePackageList;
-
-static PackageEntry* sPackageEntries;
-static AffinePackageHeap* sPackageUsageHeap;
-static AffineIdlePackageList* sIdlePackageList;
-
-// The run queues. Holds the threads ready to run ordered by priority.
-// One queue per schedulable target per core. Additionally, each
-// logical processor has its sPinnedRunQueues used for scheduling
-// pinned threads.
-typedef RunQueue<Thread, THREAD_MAX_SET_PRIORITY> AffineRunQueue;
-static AffineRunQueue* sRunQueues;
-static AffineRunQueue* sPinnedRunQueues;
-static int32 sRunQueueCount;
-
-// Since CPU IDs used internally by the kernel bear no relation to the actual
-// CPU topology the following arrays are used to efficiently get the core
-// and the package that CPU in question belongs to.
-static int32* sCPUToCore;
-static int32* sCPUToPackage;
-
-
-struct scheduler_thread_data {
-                                               scheduler_thread_data() { Init(); }
-       inline  void            Init();
-
-                       int32           priority_penalty;
-                       int32           additional_penalty;
-
-                       bool            lost_cpu;
-                       bool            cpu_bound;
-
-                       bigtime_t       time_left;
-                       bigtime_t       stolen_time;
-                       bigtime_t       quantum_start;
-
-                       bigtime_t       went_sleep;
-                       bigtime_t       went_sleep_active;
-
-                       int32           previous_core;
-};
-
-
-void
-scheduler_thread_data::Init()
-{
-       priority_penalty = 0;
-       additional_penalty = 0;
-
-       time_left = 0;
-       stolen_time = 0;
-
-       went_sleep = 0;
-       went_sleep_active = 0;
-
-       lost_cpu = false;
-       cpu_bound = true;
-
-       previous_core = -1;
-}
-
-
-static inline int
-affine_get_minimal_priority(Thread* thread)
-{
-       return min_c(thread->priority, 25) / 5;
-}
-
-
-static inline int32
-affine_get_thread_penalty(Thread* thread)
-{
-       int32 penalty = thread->scheduler_data->priority_penalty;
-
-       const int kMinimalPriority = affine_get_minimal_priority(thread);
-       if (kMinimalPriority > 0) {
-               penalty
-                       += thread->scheduler_data->additional_penalty % kMinimalPriority;
-       }
-
-       return penalty;
-}
-
-
-static inline int32
-affine_get_effective_priority(Thread* thread)
-{
-       if (thread->priority == B_IDLE_PRIORITY)
-               return thread->priority;
-       if (thread->priority >= B_FIRST_REAL_TIME_PRIORITY)
-               return thread->priority;
-
-       int32 effectivePriority = thread->priority;
-       effectivePriority -= affine_get_thread_penalty(thread);
-
-       ASSERT(effectivePriority < B_FIRST_REAL_TIME_PRIORITY);
-       ASSERT(effectivePriority >= B_LOWEST_ACTIVE_PRIORITY);
-
-       return effectivePriority;
-}
-
-
-static void
-dump_queue(AffineRunQueue::ConstIterator& iterator)
-{
-       if (!iterator.HasNext())
-               kprintf("Run queue is empty.\n");
-       else {
-               kprintf("thread      id      priority penalty  name\n");
-               while (iterator.HasNext()) {
-                       Thread* thread = iterator.Next();
-                       kprintf("%p  %-7" B_PRId32 " %-8" B_PRId32 " %-8" B_PRId32 " %s\n",
-                               thread, thread->id, thread->priority,
-                               affine_get_thread_penalty(thread), thread->name);
-               }
-       }
-}
-
-
-static int
-dump_run_queue(int argc, char **argv)
-{
-       int32 cpuCount = smp_get_num_cpus();
-       int32 coreCount = 0;
-       for (int32 i = 0; i < cpuCount; i++) {
-               if (gCPU[i].topology_id[CPU_TOPOLOGY_SMT] == 0)
-                       sCPUToCore[i] = coreCount++;
-       }
-
-       AffineRunQueue::ConstIterator iterator;
-       for (int32 i = 0; i < coreCount; i++) {
-               kprintf("\nCore %" B_PRId32 " run queue:\n", i);
-               iterator = sRunQueues[i].GetConstIterator();
-               dump_queue(iterator);
-       }
-
-       for (int32 i = 0; i < cpuCount; i++) {
-               iterator = sPinnedRunQueues[i].GetConstIterator();
-
-               if (iterator.HasNext()) {
-                       kprintf("\nCPU %" B_PRId32 " run queue:\n", i);
-                       dump_queue(iterator);
-               }
-       }
-
-       return 0;
-}
-
-
-static void
-dump_heap(AffineCPUHeap* heap)
-{
-       AffineCPUHeap temp(smp_get_num_cpus());
-
-       kprintf("cpu priority actual priority\n");
-       CPUEntry* entry = heap->PeekMinimum();
-       while (entry) {
-               int32 cpu = entry->fCPUNumber;
-               int32 key = AffineCPUHeap::GetKey(entry);
-               kprintf("%3" B_PRId32 " %8" B_PRId32 " %15" B_PRId32 "\n", cpu, key,
-                       affine_get_effective_priority(gCPU[cpu].running_thread));
-
-               heap->RemoveMinimum();
-               temp.Insert(entry, key);
-
-               entry = heap->PeekMinimum();
-       }
-
-       entry = temp.PeekMinimum();
-       while (entry) {
-               int32 key = AffineCPUHeap::GetKey(entry);
-               temp.RemoveMinimum();
-               heap->Insert(entry, key);
-               entry = temp.PeekMinimum();
-       }
-}
-
-
-static void
-dump_core_thread_heap(AffineCoreThreadHeap* heap)
-{
-       AffineCoreThreadHeap temp(sRunQueueCount);
-
-       CoreEntry* entry = heap->PeekMinimum();
-       while (entry) {
-               int32 key = AffineCoreThreadHeap::GetKey(entry);
-               kprintf("%4" B_PRId32 " %6" B_PRId32 " %7" B_PRId32 " %9" B_PRId32 "\n",
-                       entry->fCoreID, key, entry->fThreads, entry->fCPUBoundThreads);
-
-               heap->RemoveMinimum();
-               temp.Insert(entry, key);
-
-               entry = heap->PeekMinimum();
-       }
-
-       entry = temp.PeekMinimum();
-       while (entry) {
-               int32 key = AffineCoreThreadHeap::GetKey(entry);
-               temp.RemoveMinimum();
-               heap->Insert(entry, key);
-               entry = temp.PeekMinimum();
-       }
-}
-
-
-static int
-dump_cpu_heap(int argc, char** argv)
-{
-       AffineCorePriorityHeap temp(sRunQueueCount);
-
-       CoreEntry* entry = sCorePriorityHeap->PeekRoot();
-       if (entry != NULL)
-               kprintf("core priority\n");
-       else
-               kprintf("No active cores.\n");
-
-       while (entry) {
-               int32 core = entry->fCoreID;
-               int32 key = AffineCorePriorityHeap::GetKey(entry);
-               kprintf("%4" B_PRId32 " %8" B_PRId32 "\n", core, key);
-
-               sCorePriorityHeap->RemoveRoot();
-               temp.Insert(entry, key);
-
-               entry = sCorePriorityHeap->PeekRoot();
-       }
-
-       entry = temp.PeekRoot();
-       while (entry) {
-               int32 key = AffineCorePriorityHeap::GetKey(entry);
-               temp.RemoveRoot();
-               sCorePriorityHeap->Insert(entry, key);
-               entry = temp.PeekRoot();
-       }
-
-       kprintf("\ncore    key threads cpu-bound\n");
-       dump_core_thread_heap(sCoreThreadHeap);
-       dump_core_thread_heap(sCoreCPUBoundThreadHeap);
-
-       for (int32 i = 0; i < sRunQueueCount; i++) {
-               kprintf("\nCore %" B_PRId32 " heap:\n", i);
-               dump_heap(&sCPUPriorityHeaps[i]);
-       }
-
-       return 0;

[ *** diff truncated: 2147 lines dropped *** ]


