Here are two very important functions that pertain to scheduling. The goodness() function is fairly self-explanatory, but I have included some comments in the schedule() function for clarity. Both are taken from lxr.linux.no (kernel 2.4.20).
144 static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
145 {
146 int weight;
147
148 /*
149 * select the current process after every other
150 * runnable process, but before the idle thread.
151 * Also, dont trigger a counter recalculation.
152 */
153 weight = -1;
154 if (p->policy & SCHED_YIELD)
155 goto out;
156
157 /*
158 * Non-RT process - normal case first.
159 */
160 if (p->policy == SCHED_OTHER) {
161 /*
162 * Give the process a first-approximation goodness value
163 * according to the number of clock-ticks it has left.
164 *
165 * Don't do any other calculations if the time slice is
166 * over..
167 */
168 weight = p->counter;
169 if (!weight)
170 goto out;
171
172 #ifdef CONFIG_SMP
173 /* Give a largish advantage to the same processor... */
174 /* (this is equivalent to penalizing other processors) */
175 if (p->processor == this_cpu)
176 weight += PROC_CHANGE_PENALTY;
177 #endif
178
179 /* .. and a slight advantage to the current MM */
180 if (p->mm == this_mm || !p->mm)
181 weight += 1;
182 weight += 20 - p->nice;
183 goto out;
184 }
185
186 /*
187 * Realtime process, select the first one on the
188 * runqueue (taking priorities within processes
189 * into account).
190 */
191 weight = 1000 + p->rt_priority;
192 out:
193 return weight;
194 }
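Before moving on to schedule() itself, here is a small user-space sketch (not kernel code; the struct, names and values are made up for illustration) of the weight ranges goodness() produces: a yielding task gets -1, a SCHED_OTHER task whose slice is used up gets 0, a runnable SCHED_OTHER task gets roughly counter + (20 - nice) plus the small CPU/mm bonuses, and a real-time task gets 1000 + rt_priority, which beats any normal task.

#include <stdio.h>

struct fake_task {
	const char *name;
	int realtime;     /* 1 for SCHED_FIFO/SCHED_RR                 */
	int rt_priority;  /* only meaningful when realtime == 1        */
	int counter;      /* remaining clock ticks (SCHED_OTHER)       */
	int nice;         /* -20 (high priority) .. +19 (low priority) */
};

/* Simplified goodness: ignores the SMP and mm bonuses. */
static int fake_goodness(const struct fake_task *p)
{
	if (p->realtime)
		return 1000 + p->rt_priority;
	if (!p->counter)
		return 0;                        /* time slice exhausted */
	return p->counter + (20 - p->nice);
}

int main(void)
{
	struct fake_task tasks[] = {
		{ "expired editor",  0,  0, 0,   0 },
		{ "nice 0 compiler", 0,  0, 6,   0 },
		{ "nice -10 daemon", 0,  0, 8, -10 },
		{ "rt audio thread", 1, 50, 0,   0 },
	};

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		printf("%-16s -> weight %d\n",
		       tasks[i].name, fake_goodness(&tasks[i]));
	return 0;
}

Running this prints the real-time thread with weight 1050, far above the others, which is exactly why a runnable real-time task always wins the selection loop inside schedule().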
547 asmlinkage void schedule(void)
548 {
The point of this function is to make next point to the task_struct of the next task that will run on the CPU.
549 struct schedule_data * sched_data;
550 struct task_struct *prev, *next, *p;
551 struct list_head *tmp;
552 int this_cpu, c;
553
554
555 spin_lock_prefetch(&runqueue_lock);
556
557 BUG_ON(!current->active_mm);
558 need_resched_back:
559 prev = current;
560 this_cpu = prev->processor;
561
For those of you who are interested, unlikely() is a macro that tells the compiler that the given condition is unlikely to be true; this helps the compiler generate better-optimized code (a short sketch of it follows the check below).
562 if (unlikely(in_interrupt())) {
563 printk("Scheduling in interrupt\n");
564 BUG();
565 }
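As an aside on the unlikely() macro mentioned above: it is essentially a thin wrapper around GCC's __builtin_expect(). The sketch below only approximates the kernel's definition (the real 2.4 version lives in include/linux/compiler.h) and shows how it is used:

/* Approximation of the kernel macros; unlikely(x) tells GCC the branch
 * is rarely taken, so the common path can be laid out fall-through. */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

int check_length(int len)
{
	if (unlikely(len < 0))	/* error path, expected to be rare */
		return -1;
	return len;		/* hot path */
}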
566
If prev is holding the global kernel lock, release it here; it will be reacquired before schedule() returns.
567 release_kernel_lock(prev, this_cpu);
568
569 /*
570 * 'sched_data' is protected by the fact that we can run
571 * only one process per CPU.
572 */
573 sched_data = & aligned_data[this_cpu].schedule_data;
574
Take the runqueue lock to make sure we are the only one modifying the global run queue.
575 spin_lock_irq(&runqueue_lock);
576
Remember that the SCHED_RR scheduling policy can only be set on real-time processes.
577 /* move an exhausted RR process to be last.. */
578 if (unlikely(prev->policy == SCHED_RR))
Has prev exhausted its time quantum?
579 if (!prev->counter) {
If so, compute a new quantum from its nice value and move the process to the back of the runqueue.
580 prev->counter = NICE_TO_TICKS(prev->nice);
581 move_last_runqueue(prev);
582 }
583
If the process has pending (non-blocked) signals, set its state back to RUNNING. Note: this does not mean that the process will run next, only that it will be given a *chance* to run by remaining on the runqueue.
584 switch (prev->state) {
585 case TASK_INTERRUPTIBLE:
586 if (signal_pending(prev)) {
587 prev->state = TASK_RUNNING;
588 break;
589 }
590 default:
If prev is in any other state, it called schedule() voluntarily (it wants to sleep), so remove it from the runqueue.
591 del_from_runqueue(prev);
592 case TASK_RUNNING:;
593 }
594 prev->need_resched = 0;
595
596 /*
597 * this is the scheduler proper:
598 */
This is the interesting part of the schedule() function. We will now select the next process to run.
599
600 repeat_schedule:
601 /*
602 * Default process to select..
603 */
If no process wants to run, fall back to this CPU's idle task (process 0).
604 next = idle_task(this_cpu);
Go through every process on the runqueue and compute its weight with the goodness() function; the process with the highest weight is selected to run next.
605 c = -1000;
606 list_for_each(tmp, &runqueue_head) {
607 p = list_entry(tmp, struct task_struct, run_list);
608 if (can_schedule(p, this_cpu)) {
609 int weight = goodness(p, this_cpu, prev->active_mm);
610 if (weight > c)
611 c = weight, next = p;
612 }
613 }
614
615 /* Do we need to re-calculate counters? */
If no runnable process has any time left in this epoch, the counters for a new epoch must be recalculated for every task. Incidentally, this recalculation over all tasks (together with the goodness loop above, which scans the whole runqueue) is why the 2.4.20 scheduler is O(n); the O(1) scheduler in 2.6 addresses this much better.
616 if (unlikely(!c)) {
617 struct task_struct *p;
618
619 spin_unlock_irq(&runqueue_lock);
620 read_lock(&tasklist_lock);
At first glance the p->counter >> 1 looks pointless: it seems to be setting every counter to 0/2 + NICE_TO_TICKS(nice) (*all* process counters must be 0, right?). Actually, though, we are in this block because all the processes in the *TASK_RUNNING state* have no time left during this epoch. There could be (and probably will be) other processes that are not RUNNING but still have time left in their counters. This means that the Linux scheduler favors processes that voluntarily put themselves to sleep when they don't need the CPU, as they will get more time to run during the next epoch (see the short sketch after the function).
621 for_each_task(p)
622 p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
623 read_unlock(&tasklist_lock);
624 spin_lock_irq(&runqueue_lock);
After the new counters are computed, go back and select a process to run.
625 goto repeat_schedule;
626 }
627
At this point next will be pointing to the next process to run.
628 /*
629 * from this point on nothing can prevent us from
630 * switching to the next task, save this fact in
631 * sched_data.
632 */
633 sched_data->curr = next;
634 task_set_cpu(next, this_cpu);
635 spin_unlock_irq(&runqueue_lock);
636
637 if (unlikely(prev == next)) {
638 /* We won't go through the normal tail, so do this by hand */
639 prev->policy &= ~SCHED_YIELD;
640 goto same_process;
641 }
642
643 #ifdef CONFIG_SMP
644 /*
645 * maintain the per-process 'last schedule' value.
646 * (this has to be recalculated even if we reschedule to
647 * the same process) Currently this is only used on SMP,
648 * and it's approximate, so we do not have to maintain
649 * it while holding the runqueue spinlock.
650 */
651 sched_data->last_schedule = get_cycles();
652
653 /*
654 * We drop the scheduler lock early (it's a global spinlock),
655 * thus we have to lock the previous process from getting
656 * rescheduled during switch_to().
657 */
658
659 #endif /* CONFIG_SMP */
660
Update some statistics...
661 kstat.context_swtch++;
662 /*
663 * there are 3 processes which are affected by a context switch:
664 *
665 * prev == .... ==> (last => next)
666 *
667 * It's the 'much more previous' 'prev' that is on next's stack,
668 * but prev is set to (the just run) 'last' process by switch_to().
669 * This might sound slightly confusing but makes tons of sense.
670 */
Finally, set up the next process's memory...
671 prepare_to_switch();
672 {
673 struct mm_struct *mm = next->mm;
674 struct mm_struct *oldmm = prev->active_mm;
675 if (!mm) {
676 BUG_ON(next->active_mm);
677 next->active_mm = oldmm;
678 atomic_inc(&oldmm->mm_count);
679 enter_lazy_tlb(oldmm, next, this_cpu);
680 } else {
681 BUG_ON(next->active_mm != mm);
682 switch_mm(oldmm, mm, next, this_cpu);
683 }
684
685 if (!prev->mm) {
686 prev->active_mm = NULL;
687 mmdrop(oldmm);
688 }
689 }
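The block above distinguishes user processes, which own an mm, from kernel threads, which have mm == NULL and simply borrow the address space (active_mm) of whatever ran before them, so no page-table switch or TLB flush is needed. A toy user-space model of that hand-over (names and types here are made up; plain reference counting stands in for mm_count):

#include <stddef.h>

struct toy_mm   { int mm_count; };
struct toy_task { struct toy_mm *mm, *active_mm; };

/* Toy model of the mm hand-over above (not kernel code). */
static void toy_mm_handover(struct toy_task *prev, struct toy_task *next)
{
	struct toy_mm *oldmm = prev->active_mm;

	if (!next->mm) {                /* kernel thread: borrow oldmm        */
		next->active_mm = oldmm;
		oldmm->mm_count++;      /* pin it, as atomic_inc() does        */
		/* enter_lazy_tlb(): keep old page tables, skip the TLB flush */
	} else {
		next->active_mm = next->mm;
		/* switch_mm(): load next's page tables                       */
	}

	if (!prev->mm) {                /* prev was a kernel thread           */
		prev->active_mm = NULL;
		oldmm->mm_count--;      /* mmdrop(): release the borrowed pin  */
	}
}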
690
691 /*
692 * This just switches the register state and the
693 * stack.
694 */
695 switch_to(prev, next, prev);
696 __schedule_tail(prev);
697
698 same_process:
699 reacquire_kernel_lock(current);
700 if (current->need_resched)
701 goto need_resched_back;
702 return;
703 }
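One last note on the counter recalculation discussed above: each new epoch sets counter = counter/2 + NICE_TO_TICKS(nice), so a task that keeps sleeping carries half of its leftover ticks into the next epoch, while a task that always burns its slice starts over from NICE_TO_TICKS(nice). A tiny user-space sketch (T is just an assumed slice; the real value depends on HZ and the nice level) shows where that bonus tops out:

#include <stdio.h>

int main(void)
{
	int T = 6;        /* assumed NICE_TO_TICKS(0), in clock ticks */
	int busy = 0;     /* always exhausts its slice                */
	int sleeper = 0;  /* never runs, keeps all of its ticks       */

	for (int epoch = 1; epoch <= 6; epoch++) {
		busy    = busy / 2 + T;     /* busy was at 0, so it just gets T */
		sleeper = sleeper / 2 + T;
		printf("epoch %d: busy=%2d sleeper=%2d\n", epoch, busy, sleeper);
		busy = 0;                   /* ...and burns it all again        */
	}
	/* sleeper converges to 2*T - 1: a sleeper's slice is bounded at
	 * roughly twice that of a task that always uses up its quantum. */
	return 0;
}

So a task that sleeps a lot gets at most about double the slice of a pure CPU hog at the same nice level, which is how the 2.4 scheduler gives interactive tasks a modest boost.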