Here are two very important functions that pertain to scheduling. The goodness() function is fairly self-explanatory, but I have included some comments in the schedule() function for clarity. Both are taken from lxr.linux.no (kernel 2.4.20).
144 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
145 {
146 int weight;
147
148 /*
149 * select the current process after every other
150 * runnable process, but before the idle thread.
151 * Also, dont trigger a counter recalculation.
152 */
153 weight = -1;
154 if (p->policy & SCHED_YIELD)
155 goto out;
156
157 /*
158 * Non-RT process - normal case first.
159 */
160 if (p->policy == SCHED_OTHER) {
161 /*
162 * Give the process a first-approximation goodness value
163 * according to the number of clock-ticks it has left.
164 *
165 * Don't do any other calculations if the time slice is
166 * over..
167 */
168 weight = p->counter;
169 if (!weight)
170 goto out;
171
172 #ifdef CONFIG_SMP
173 /* Give a largish advantage to the same processor... */
174 /* (this is equivalent to penalizing other processors) */
175 if (p->processor == this_cpu)
176 weight += PROC_CHANGE_PENALTY;
177 #endif
178
179 /* .. and a slight advantage to the current MM */
180 if (p->mm == this_mm || !p->mm)
181 weight += 1;
182 weight += 20 - p->nice;
183 goto out;
184 }
185
186 /*
187 * Realtime process, select the first one on the
188 * runqueue (taking priorities within processes
189 * into account).
190 */
191 weight = 1000 + p->rt_priority;
192 out:
193 return weight;
194 }
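Before moving on to schedule() itself, here is a small user-space sketch (not kernel code; the struct, names and values are made up for illustration) of the weight ranges goodness() produces: a yielding task gets -1, a SCHED_OTHER task whose slice is used up gets 0, a runnable SCHED_OTHER task gets roughly counter + (20 - nice) plus the small CPU/mm bonuses, and a real-time task gets 1000 + rt_priority, which beats any normal task.

#include <stdio.h>

struct fake_task {
	const char *name;
	int realtime;     /* 1 for SCHED_FIFO/SCHED_RR                 */
	int rt_priority;  /* only meaningful when realtime == 1        */
	int counter;      /* remaining clock ticks (SCHED_OTHER)       */
	int nice;         /* -20 (high priority) .. +19 (low priority) */
};

/* Simplified goodness: ignores the SMP and mm bonuses. */
static int fake_goodness(const struct fake_task *p)
{
	if (p->realtime)
		return 1000 + p->rt_priority;
	if (!p->counter)
		return 0;                        /* time slice exhausted */
	return p->counter + (20 - p->nice);
}

int main(void)
{
	struct fake_task tasks[] = {
		{ "expired editor",  0,  0, 0,   0 },
		{ "nice 0 compiler", 0,  0, 6,   0 },
		{ "nice -10 daemon", 0,  0, 8, -10 },
		{ "rt audio thread", 1, 50, 0,   0 },
	};

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		printf("%-16s -> weight %d\n",
		       tasks[i].name, fake_goodness(&tasks[i]));
	return 0;
}

Running this prints the real-time thread with weight 1050, far above the others, which is exactly why a runnable real-time task always wins the selection loop inside schedule().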
547 asmlinkage void schedule(void)
548 {
The point of this function is to make next point to the task_struct of the next task that will run on the CPU.
549 struct schedule_data * sched_data;
550 struct task_struct *prev, *next, *p;
551 struct list_head *tmp;
552 int this_cpu, c;
553
554
555 spin_lock_prefetch(&runqueue_lock);
556
557 BUG_ON(!current->active_mm);
558 need_resched_back:
559 prev = current;
560 this_cpu = prev->processor;
561
For those of you who are interested, unlikely() is a macro that tells the compiler that the given condition is unlikely to be true; this helps the compiler generate better-optimized code (a short sketch of it follows the check below).
562 if (unlikely(in_interrupt())) {
563 printk("Scheduling in interrupt\n");
564 BUG();
565 }
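As an aside on the unlikely() macro mentioned above: it is essentially a thin wrapper around GCC's __builtin_expect(). The sketch below only approximates the kernel's definition (the real 2.4 version lives in include/linux/compiler.h) and shows how it is used:

/* Approximation of the kernel macros; unlikely(x) tells GCC the branch
 * is rarely taken, so the common path can be laid out fall-through. */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int check_length(int len)
{
	if (unlikely(len < 0))	/* error path, expected to be rare */
		return -1;
	return len;		/* hot path */
}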
566
If prev is holding the global kernel lock, release it here; it will be reacquired before schedule() returns.
567 release_kernel_lock(prev, this_cpu);
568
569 /*
570 * 'sched_data' is protected by the fact that we can run
571 * only one process per CPU.
572 */
573 sched_data = & aligned_data[this_cpu].schedule_data;
574
Take the runqueue lock to make sure we are the only one modifying the global run queue.
575 spin_lock_irq(&runqueue_lock);
576
Remember that the SCHED_RR scheduling policy can only be set on real-time processes.
577 /* move an exhausted RR process to be last.. */
578 if (unlikely(prev->policy == SCHED_RR))
Has prev exhausted its time quantum?
579 if (!prev->counter) {
If so, compute a new quantum from its nice value and move the process to the back of the runqueue.
580 prev->counter = NICE_TO_TICKS(prev->nice);
581 move_last_runqueue(prev);
582 }
583
If the process has pending (non-blocked) signals, set its state back to RUNNING. Note: this does not mean that the process will run next, only that it will be given a *chance* to run by remaining on the runqueue.
584 switch (prev->state) {
585 case TASK_INTERRUPTIBLE:
586 if (signal_pending(prev)) {
587 prev->state = TASK_RUNNING;
588 break;
589 }
590 default:
If prev is in any other state, it called schedule() voluntarily (it wants to sleep), so remove it from the runqueue.
591 del_from_runqueue(prev);
592 case TASK_RUNNING:;
593 }
594 prev->need_resched = 0;
595
596 /*
597 * this is the scheduler proper:
598 */
This is the interesting part of the schedule() function. We will now select the next process to run.
599
600 repeat_schedule:
601 /*
602 * Default process to select..
603 */
If no process wants to run, fall back to this CPU's idle task (process 0).
604 next = idle_task(this_cpu);
Go through every process on the runqueue and compute its weight with the goodness() function; the process with the highest weight is selected to run next.
605 c = -1000;
606 list_for_each(tmp, &runqueue_head) {
607 p = list_entry(tmp, struct task_struct, run_list);
608 if (can_schedule(p, this_cpu)) {
609 int weight = goodness(p, this_cpu, prev->active_mm);
610 if (weight > c)
611 c = weight, next = p;
612 }
613 }
614
615 /* Do we need to re-calculate counters? */
If no runnable process has any time left in this epoch, the counters for a new epoch must be recalculated for every task. Incidentally, this recalculation over all tasks (together with the goodness loop above, which scans the whole runqueue) is why the 2.4.20 scheduler is O(n); the O(1) scheduler in 2.6 addresses this much better.
616 if (unlikely(!c)) {
617 struct task_struct *p;
618
619 spin_unlock_irq(&runqueue_lock);
620 read_lock(&tasklist_lock);
At first glance the p->counter >> 1 looks pointless: it seems to be setting every counter to 0/2 + NICE_TO_TICKS(nice) (*all* process counters must be 0, right?). Actually, though, we are in this block because all the processes in the *TASK_RUNNING state* have no time left during this epoch. There could be (and probably will be) other processes that are not RUNNING but still have time left in their counters. This means that the Linux scheduler favors processes that voluntarily put themselves to sleep when they don't need the CPU, as they will get more time to run during the next epoch (see the short sketch after the function).
621 for_each_task(p)
622 p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
623 read_unlock(&tasklist_lock);
624 spin_lock_irq(&runqueue_lock);
After the new counters are computed, go back and select a process to run.
625 goto repeat_schedule;
626 }
627
At this point next will be pointing to the next process to run.
628 /*
629 * from this point on nothing can prevent us from
630 * switching to the next task, save this fact in
631 * sched_data.
632 */
633 sched_data->curr = next;
634 task_set_cpu(next, this_cpu);
635 spin_unlock_irq(&runqueue_lock);
636
637 if (unlikely(prev == next)) {
638 /* We won't go through the normal tail, so do this by hand */
639 prev->policy &= ~SCHED_YIELD;
640 goto same_process;
641 }
642
643 #ifdef CONFIG_SMP
644 /*
645 * maintain the per-process 'last schedule' value.
646 * (this has to be recalculated even if we reschedule to
647 * the same process) Currently this is only used on SMP,
648 * and it's approximate, so we do not have to maintain
649 * it while holding the runqueue spinlock.
650 */
651 sched_data->last_schedule = get_cycles();
652
653 /*
654 * We drop the scheduler lock early (it's a global spinlock),
655 * thus we have to lock the previous process from getting
656 * rescheduled during switch_to().
657 */
658
659 #endif /* CONFIG_SMP */
660
Update some statistics...
661 kstat.context_swtch++;
662 /*
663 * there are 3 processes which are affected by a context switch:
664 *
665 * prev == .... ==> (last => next)
666 *
667 * It's the 'much more previous' 'prev' that is on next's stack,
668 * but prev is set to (the just run) 'last' process by switch_to().
669 * This might sound slightly confusing but makes tons of sense.
670 */
Finally, set up the next process's memory...
671 prepare_to_switch();
672 {
673 struct mm_struct *mm = next->mm;
674 struct mm_struct *oldmm = prev->active_mm;
675 if (!mm) {
676 BUG_ON(next->active_mm);
677 next->active_mm = oldmm;
678 atomic_inc(&oldmm->mm_count);
679 enter_lazy_tlb(oldmm, next, this_cpu);
680 } else {
681 BUG_ON(next->active_mm != mm);
682 switch_mm(oldmm, mm, next, this_cpu);
683 }
684
685 if (!prev->mm) {
686 prev->active_mm = NULL;
687 mmdrop(oldmm);
688 }
689 }
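The block above distinguishes user processes, which own an mm, from kernel threads, which have mm == NULL and simply borrow the address space (active_mm) of whatever ran before them, so no page-table switch or TLB flush is needed. A toy user-space model of that hand-over (names and types here are made up; plain reference counting stands in for mm_count):

#include <stddef.h>

struct toy_mm   { int mm_count; };
struct toy_task { struct toy_mm *mm, *active_mm; };

/* Toy model of the mm hand-over above (not kernel code). */
static void toy_mm_handover(struct toy_task *prev, struct toy_task *next)
{
	struct toy_mm *oldmm = prev->active_mm;

	if (!next->mm) {                /* kernel thread: borrow oldmm        */
		next->active_mm = oldmm;
		oldmm->mm_count++;      /* pin it, as atomic_inc() does        */
		/* enter_lazy_tlb(): keep old page tables, skip the TLB flush */
	} else {
		next->active_mm = next->mm;
		/* switch_mm(): load next's page tables                       */
	}

	if (!prev->mm) {                /* prev was a kernel thread           */
		prev->active_mm = NULL;
		oldmm->mm_count--;      /* mmdrop(): release the borrowed pin  */
	}
}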
690
691 /*
692 * This just switches the register state and the
693 * stack.
694 */
695 switch_to(prev, next, prev);
696 __schedule_tail(prev);
697
698 same_process:
699 reacquire_kernel_lock(current);
700 if (current->need_resched)
701 goto need_resched_back;
702 return;
703 }
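One last note on the counter recalculation discussed above: each new epoch sets counter = counter/2 + NICE_TO_TICKS(nice), so a task that keeps sleeping carries half of its leftover ticks into the next epoch, while a task that always burns its slice starts over from NICE_TO_TICKS(nice). A tiny user-space sketch (T is just an assumed slice; the real value depends on HZ and the nice level) shows where that bonus tops out:

#include <stdio.h>

int main(void)
{
	int T = 6;        /* assumed NICE_TO_TICKS(0), in clock ticks */
	int busy = 0;     /* always exhausts its slice                */
	int sleeper = 0;  /* never runs, keeps all of its ticks       */

	for (int epoch = 1; epoch <= 6; epoch++) {
		busy    = busy / 2 + T;     /* busy was at 0, so it just gets T */
		sleeper = sleeper / 2 + T;
		printf("epoch %d: busy=%2d sleeper=%2d\n", epoch, busy, sleeper);
		busy = 0;                   /* ...and burns it all again        */
	}
	/* sleeper converges to 2*T - 1: a sleeper's slice is bounded at
	 * roughly twice that of a task that always uses up its quantum. */
	return 0;
}

So a task that sleeps a lot gets at most about double the slice of a pure CPU hog at the same nice level, which is how the 2.4 scheduler gives interactive tasks a modest boost.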