Linux系统内核抢占补丁的原理

2013-2-25中国IT实验室佚名

【大中小】

　　CPU在内核中运行时并不是处处不可抢占的，内核中存在一些空隙，在这时进行抢占是安全的，内核抢占补丁的基本原理就是将SMP可并行的代码段看成是可以进行内核抢占的区域。

　　2.4内核正好细化了多CPU下的内核线程同步机制，对不可并行的指令块用spinlock和rwlock作了细致的标示，该补丁的实现可谓水到渠成。具体的方法就是在进程的任务结构上增加一个preempt_count变量作为内核抢占锁，它随着spinlock和rwlock一起加锁和解锁。当preempt_count为0时表示可以进行内核抢占调度。内核抢占调度的入口为preempt_schedule()，它将当前进程标记为TASK_PREEMPTED状态再调用schedule()；在TASK_PREEMPTED状态下，schedule()不会将进程从运行队列中删除。

　　下面是内核抢占补丁的主要代码示意：

　　arch/i386/kernel/entry.S:

　　preempt_count = 4 # 将task_struct中的flags用作preempt_count,flags被移到了别的位置

　　ret_from_exception: # 从异常返回

　　#ifdef CONFIG_SMP

　　GET_CURRENT(%ebx)

　　movl processor(%ebx),%eax

　　shll $CONFIG_X86_L1_CACHE_SHIFT,%eax

　　movl SYMBOL_NAME(irq_stat)(,%eax),%ecx # softirq_active

　　testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx # softirq_mask

　　#else

　　movl SYMBOL_NAME(irq_stat),%ecx # softirq_active

　　testl SYMBOL_NAME(irq_stat)+4,%ecx # softirq_mask

　　#endif

　　jne handle_softirq

　　#ifdef CONFIG_PREEMPT

　　cli

　　incl preempt_count(%ebx) # 异常的入口没有禁止内核调度的指令,与ret_from_intr匹配一下

　　#endif

　　ENTRY(ret_from_intr) # 硬件中断的返回

　　GET_CURRENT(%ebx)

　　#ifdef CONFIG_PREEMPT

　　cli

　　decl preempt_count(%ebx) # 恢复内核抢占标志

　　#endif

　　movl EFLAGS(%esp),%eax # mix EFLAGS and CS

　　movb CS(%esp),%al

　　testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor?

　　jne ret_with_reschedule

　　#ifdef CONFIG_PREEMPT

　　cmpl $0,preempt_count(%ebx)

　　jnz restore_all # 如果preempt_count非零则表示禁止内核抢占

　　cmpl $0,need_resched(%ebx)

　　jz restore_all #

　　movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx

　　addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx

　　jnz restore_all

　　incl preempt_count(%ebx)

　　sti

　　call SYMBOL_NAME(preempt_schedule)

　　jmp ret_from_intr # 新进程返回,返回ret_from_intr恢复抢占标志后再返回

　　#else

　　jmp restore_all

　　#endif

　　ALIGN

　　handle_softirq:

　　#ifdef CONFIG_PREEMPT

　　cli

　　GET_CURRENT(%ebx)

　　incl preempt_count(%ebx)

　　sti

　　#endif

　　call SYMBOL_NAME(do_softirq)

　　jmp ret_from_intr

　　ALIGN

　　reschedule:

　　call SYMBOL_NAME(schedule) # test

　　jmp ret_from_sys_call

　　include/asm/hw_irq.h:

　　...

　　#ifdef CONFIG_PREEMPT

　　#define BUMP_CONTEX_SWITCH_LOCK \

　　GET_CURRENT \

　　"incl 4(%ebx)\n\t"

　　#else

　　#define BUMP_CONTEX_SWITCH_LOCK

　　#endif

　　#define SAVE_ALL /* 硬件中断保护入口现场 */ \

　　"cld\n\t" \

　　"pushl %es\n\t" \

　　"pushl %ds\n\t" \

　　"pushl %eax\n\t" \

　　"pushl %ebp\n\t" \

　　"pushl %edi\n\t" \

　　"pushl %esi\n\t" \

　　"pushl %edx\n\t" \

　　"pushl %ecx\n\t" \

　　"pushl %ebx\n\t" \

　　"movl $" STR(__KERNEL_DS) ",%edx\n\t" \

　　"movl %edx,%ds\n\t" \

　　"movl %edx,%es\n\t" \

　　BUMP_CONTEX_SWITCH_LOCK /* 硬件中断的入口禁止内核抢占 */

　　include/linux/spinlock.h:

　　#ifdef CONFIG_PREEMPT

　　#define switch_lock_count() current->preempt_count

　　#define in_ctx_sw_off() (switch_lock_count().counter) /* 判断当前进程的抢占计数是否非零 */

　　#define atomic_ptr_in_ctx_sw_off() (&switch_lock_count())

　　#define ctx_sw_off() /* 禁止内核抢占 */ \

　　do { \

　　atomic_inc(atomic_ptr_in_ctx_sw_off()); /* 当前进程的内核抢占计数增1 */ \

　　} while (0)

　　#define ctx_sw_on_no_preempt() /* 允许内核抢占 */ \

　　do { \

　　atomic_dec(atomic_ptr_in_ctx_sw_off()); /* 当前进程的内核抢占计数减1 */ \

　　} while (0)

　　#define ctx_sw_on() /* 允许并完成内核抢占 */ \

　　do { \

　　if (atomic_dec_and_test(atomic_ptr_in_ctx_sw_off()) && \

　　current->need_resched) \

　　preempt_schedule(); \

　　} while (0)

　　#define spin_lock(lock) \

　　do { \

　　ctx_sw_off(); /* 进入自旋锁时禁止抢占 */ \

　　_raw_spin_lock(lock); \

　　} while(0)

　　/* 锁定并测试原来是否上锁 */

　　#define spin_trylock(lock) ({ctx_sw_off(); _raw_spin_trylock(lock) ? \

　　1 : ({ctx_sw_on(); 0;});})

　　#define spin_unlock(lock) \

　　do { \

　　_raw_spin_unlock(lock); \

　　ctx_sw_on(); /* 离开自旋锁时允许并完成内核抢占 */ \

　　} while (0)

　　#define read_lock(lock) ({ctx_sw_off(); _raw_read_lock(lock);})

　　#define read_unlock(lock) ({_raw_read_unlock(lock); ctx_sw_on();})

　　#define write_lock(lock) ({ctx_sw_off(); _raw_write_lock(lock);})

　　#define write_unlock(lock) ({_raw_write_unlock(lock); ctx_sw_on();})

　　#define write_trylock(lock) ({ctx_sw_off(); _raw_write_trylock(lock) ? \

　　1 : ({ctx_sw_on(); 0;});})

　　...

　　include/asm/softirq.h:

　　#define cpu_bh_disable(cpu) do { ctx_sw_off(); local_bh_count(cpu)++; barrier(); } while (0)

　　#define cpu_bh_enable(cpu) do { barrier(); local_bh_count(cpu)--; ctx_sw_on(); } while (0)

　　kernel/schedule.c:

　　#ifdef CONFIG_PREEMPT

　　asmlinkage void preempt_schedule(void)

　　{

　　while (current->need_resched) {

　　ctx_sw_off();

　　current->state |= TASK_PREEMPTED;

　　schedule();

　　current->state &= ~TASK_PREEMPTED;

　　ctx_sw_on_no_preempt();

　　}

　　}

　　#endif

　　asmlinkage void schedule(void)

　　{

　　struct schedule_data * sched_data;

　　struct task_struct *prev, *next, *p;

　　struct list_head *tmp;

　　int this_CPU, c;

　　#ifdef CONFIG_PREEMPT

　　ctx_sw_off();

　　#endif

　　if (!current->active_mm) BUG();

　　need_resched_back:

　　prev = current;

　　this_CPU = prev->processor;

　　if (in_interrupt())

　　goto scheduling_in_interrupt;

　　release_kernel_lock(prev, this_CPU);

　　/* Do "administrative" work here while we don't hold any locks */

　　if (softirq_active(this_CPU) & softirq_mask(this_CPU))

　　goto handle_softirq;

　　handle_softirq_back:

　　/*

　　* 'sched_data' is protected by the fact that we can run

　　* only one process per CPU.

　　*/

　　sched_data = & aligned_data[this_CPU].schedule_data;

　　spin_lock_irq(&runqueue_lock);

　　/* move an exhausted RR process to be last.. */

　　if (prev->policy == SCHED_RR)

　　goto move_rr_last;

　　move_rr_back:

　　switch (prev->state) {

　　case TASK_INTERRUPTIBLE:

　　if (signal_pending(prev)) {

　　prev->state = TASK_RUNNING;

　　break;

　　}

　　default:

　　#ifdef CONFIG_PREEMPT

　　if (prev->state & TASK_PREEMPTED)

　　break; /* 如果是内核抢占调度,则保留运行队列 */

　　#endif

　　del_from_runqueue(prev);

　　#ifdef CONFIG_PREEMPT

　　case TASK_PREEMPTED:

　　#endif

　　case TASK_RUNNING:;

　　}

　　prev->need_resched = 0;

　　/*

　　* this is the scheduler proper:

　　*/

　　repeat_schedule:

　　/*

　　* Default process to select..

　　*/

　　next = idle_task(this_CPU);

　　c = -1000;

　　if (task_on_runqueue(prev))

　　goto still_running;

　　still_running_back:

　　list_for_each(tmp, &runqueue_head) {

　　p = list_entry(tmp, struct task_struct, run_list);

　　if (can_schedule(p, this_CPU)) {

　　int weight = goodness(p, this_CPU, prev->active_mm);

　　if (weight > c)

　　c = weight, next = p;

　　}

　　}

　　/* Do we need to re-calculate counters? */

　　if (!c)

　　goto recalculate;

　　/*

　　* from this point on nothing can prevent us from

　　* switching to the next task, save this fact in

　　* sched_data.

　　*/

　　sched_data->curr = next;

　　#ifdef CONFIG_SMP

　　next->has_CPU = 1;

　　next->processor = this_CPU;

　　#endif

　　spin_unlock_irq(&runqueue_lock);

　　if (prev == next)

　　goto same_process;

　　#ifdef CONFIG_SMP

　　/*

　　* maintain the per-process 'last schedule' value.

　　* (this has to be recalculated even if we reschedule to

　　* the same process) Currently this is only used on SMP,

　　* and it's approximate, so we do not have to maintain

　　* it while holding the runqueue spinlock.

　　*/

　　sched_data->last_schedule = get_cycles();

　　/*

　　* We drop the scheduler lock early (it's a global spinlock),

　　* thus we have to lock the previous process from getting

　　* rescheduled during switch_to().

　　*/

　　#endif /* CONFIG_SMP */

　　kstat.context_swtch++;

　　/*

　　* there are 3 processes which are affected by a context switch:

　　* prev == .... ==> (last => next)

　　* It's the 'much more previous' 'prev' that is on next's stack,

　　* but prev is set to (the just run) 'last' process by switch_to().

　　* This might sound slightly confusing but makes tons of sense.

　　*/

　　prepare_to_switch();

　　{

　　struct mm_struct *mm = next->mm;

　　struct mm_struct *oldmm = prev->active_mm;

　　if (!mm) {

　　if (next->active_mm) BUG();

　　next->active_mm = oldmm;

　　atomic_inc(&oldmm->mm_count);

　　enter_lazy_tlb(oldmm, next, this_CPU);

　　} else {

　　if (next->active_mm != mm) BUG();

　　switch_mm(oldmm, mm, next, this_CPU);

　　}

　　if (!prev->mm) {

　　prev->active_mm = NULL;

　　mmdrop(oldmm);

　　}

　　}

　　/*

　　* This just switches the register state and the

　　* stack.

　　*/

　　switch_to(prev, next, prev);

　　__schedule_tail(prev);

　　same_process:

　　reacquire_kernel_lock(current);

　　if (current->need_resched)

　　goto need_resched_back;

　　#ifdef CONFIG_PREEMPT

　　ctx_sw_on_no_preempt();

　　#endif

　　return;

　　recalculate:

　　{

　　struct task_struct *p;

　　spin_unlock_irq(&runqueue_lock);

　　read_lock(&tasklist_lock);

　　for_each_task(p)

　　p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);

　　read_unlock(&tasklist_lock);

　　spin_lock_irq(&runqueue_lock);

　　}

　　goto repeat_schedule;

　　still_running:

　　c = goodness(prev, this_CPU, prev->active_mm);

　　next = prev;

　　goto still_running_back;

　　handle_softirq:

　　do_softirq();

　　goto handle_softirq_back;

　　move_rr_last:

　　if (!prev->counter) {

　　prev->counter = NICE_TO_TICKS(prev->nice);

　　move_last_runqueue(prev);

　　}

　　goto move_rr_back;

　　scheduling_in_interrupt:

　　printk("Scheduling in interrupt\n");

　　BUG();

　　return;

　　}

　　void schedule_tail(struct task_struct *prev)

　　{

　　__schedule_tail(prev);

　　#ifdef CONFIG_PREEMPT

　　ctx_sw_on();

　　#endif

　　}

本文来源：中国IT实验室作者：佚名

上一篇文章：别等Win7 SP2，Windows Blue预览版已开发中

下一篇文章： Win8版Skype重要更新：增文件和图片传输功能

LinuxONE中国社区成立促进企业级Linux应用创新   广东Linux中心发布新支点服务器操作系统   盘点Linux操作系统对用户说的七个“不”
Linux下用shntool通过cue实现无损音乐的分割   Linux操作系统挂载U盘和硬盘光驱的问题   如何在Linux使用HAProxy配置HTTP负载均衡系统
4MLinux 10.1 发布小型 Linux 系统   微软宣布ASP.NET5开源，跨Win10、Mac和Linux  深藏功与名：生活中被忽略的Linux系统
模块化笔记本诞生：Win10/安卓/Linux任意选择

聚合推荐

2022年网吧恢复营业时间网吧卫生网吧电脑配置开网吧网吧键盘网吧配置网吧GHOST ROS 网吧软件故障解决网众无盘网吧游戏菜单网吧活动网吧优化网吧精品网吧新手

声明

声明：本站所发表的文章、评论及图片仅代表作者本人观点，与本站立场无关。若文章侵犯了您的相关权益，请及时与我们联系，我们会及时处理，感谢您对本站的支持！联系email:support@txwb.com，系统开号，技术支持，服务联系QQ：1175525021本站所有有注明来源为天下网吧或天下网吧论坛的原创作品，各位转载时请注明来源链接！

天下网吧·网吧天下

Linux系统内核抢占补丁的原理

推荐文章

最新文章