1 概述

本篇为 Linux 调试篇,主要是观察 Linux 常见的各种调用过程.

这里我简单书写了一个内核模块.

// helloworld.c — minimal loadable kernel module used as the tracing target.
#include <linux/init.h>
#include <linux/module.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jvle");
MODULE_DESCRIPTION("Hello World Kernel Module for Linux 6.6.109");

/* Runs once at insmod time; returns 0 so the load succeeds. */
static int __init helloworld_start(void)
{
	pr_info("Hello, kernel world!\n");
	return 0;
}

/* Runs once at rmmod time. */
static void __exit helloworld_stop(void)
{
	pr_info("Goodbye, kernel world!\n");
}

module_init(helloworld_start);
module_exit(helloworld_stop);

2 观察

当执行完 insmod helloworld.ko 之后,我们来到了 init_module.

// umod is the user-space pointer to the raw .ko binary image
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
int err;
// module metadata is progressively filled into this load_info structure
struct load_info info = { };

// early permission gate (presumably CAP_SYS_MODULE / modules_disabled — see may_init_module)
err = may_init_module();
if (err)
return err;

pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);

// copy the module image from user space into kernel memory
err = copy_module_from_user(umod, len, &info);
if (err) {
mod_stat_inc(&failed_kreads);
mod_stat_add_long(len, &invalid_kread_bytes);
return err;
}

// hand off to the core loader
return load_module(&info, uargs, 0);
}


/*
 * Scratch state describing the module image while it is being loaded;
 * released at the end of load_module().
 */
struct load_info {
const char *name; // module name
/* pointer to module in temporary copy, freed at end of load_module() */
struct module *mod; // the struct module eventually registered with the kernel
// ELF view of the .ko file
Elf_Ehdr *hdr;
unsigned long len;
Elf_Shdr *sechdrs;
char *secstrings, *strtab; // section-name string table and symbol string table
unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
bool sig_ok;
#ifdef CONFIG_KALLSYMS
unsigned long mod_kallsyms_init_off;
#endif
#ifdef CONFIG_MODULE_DECOMPRESS
#ifdef CONFIG_MODULE_STATS
unsigned long compressed_len;
#endif
struct page **pages;
unsigned int max_pages;
unsigned int used_pages;
#endif
// indices of the notable ELF sections
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
};

load_module 是模块加载的核心.

| 阶段 | 关键函数 |
| --- | --- |
| 模块签名与 ELF 校验 | module_sig_check() / elf_validity_cache_copy() |
| 布局与内存分配 | layout_and_allocate() |
| 模块注册准备 | add_unformed_module() / module_unload_init() |
| 符号处理与重定位 | simplify_symbols() / apply_relocations() |
| 参数解析与 sysfs 挂载 | parse_args() / mod_sysfs_setup() |
| 模块初始化 | do_init_module() |
| 错误回滚与清理 | 多层级 goto cleanup |
| 成功返回 | 模块进入 MODULE_STATE_LIVE 状态 |
/*
 * Core module loader: verifies, lays out, links and registers the module,
 * then runs its init. On any failure, unwinds via the layered goto
 * cleanup chain below (each label undoes one earlier stage).
 */
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
bool module_allocated = false;
long err = 0;
char *after_dashes;

/*
 * Do the signature check (if any) first. All that
 * the signature check needs is info->len, it does
 * not need any of the section info. That can be
 * set up later. This will minimize the chances
 * of a corrupt module causing problems before
 * we even get to the signature check.
 *
 * The check will also adjust info->len by stripping
 * off the sig length at the end of the module, making
 * checks against info->len more correct.
 */
err = module_sig_check(info, flags);
if (err)
goto free_copy;

/*
 * Do basic sanity checks against the ELF header and
 * sections. Cache useful sections and set the
 * info->mod to the userspace passed struct module.
 */
err = elf_validity_cache_copy(info, flags);
if (err)
goto free_copy;

err = early_mod_check(info, flags);
if (err)
goto free_copy;

/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}

module_allocated = true;

audit_log_kern_module(info->name);

/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
goto free_module;

/*
 * We are tainting your kernel if your module gets into
 * the modules linked list somehow.
 */
module_augment_kernel_taints(mod, info);

/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
if (err)
goto unlink_mod;

/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
goto unlink_mod;

init_param_lock(mod);

/*
 * Now we've got everything in the final locations, we can
 * find optional sections.
 */
err = find_module_sections(mod, info);
if (err)
goto free_unload;

err = check_export_symbol_versions(mod);
if (err)
goto free_unload;

/* Set up MODINFO_ATTR fields */
setup_modinfo(mod, info);

/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;

err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;

err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;

flush_module_icache(mod);

/* Now copy in args */
mod->args = strndup_user(uargs, ~0UL >> 1);
if (IS_ERR(mod->args)) {
err = PTR_ERR(mod->args);
goto free_arch_cleanup;
}

init_build_id(mod, info);

/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);

/* Finally it's fully formed, ready to start executing. */
err = complete_formation(mod, info);
if (err)
goto ddebug_cleanup;

err = prepare_coming_module(mod);
if (err)
goto bug_cleanup;

mod->async_probe_requested = async_probe;

/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
goto coming_cleanup;
} else if (after_dashes) {
pr_warn("%s: parameters '%s' after `--' ignored\n",
mod->name, after_dashes);
}

/* Link in to sysfs. */
// creates the /sys/module/<name> hierarchy
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
goto coming_cleanup;

if (is_livepatch_module(mod)) {
err = copy_module_elf(mod, info);
if (err < 0)
goto sysfs_cleanup;
}

/* Get rid of temporary copy. */
free_copy(info, flags);

/* Done! */
trace_module_load(mod);

// invoke the module's init function (the one registered via module_init())
return do_init_module(mod);

sysfs_cleanup:
mod_sysfs_teardown(mod);
coming_cleanup:
mod->state = MODULE_STATE_GOING;
destroy_params(mod->kp, mod->num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
bug_cleanup:
mod->state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);

ddebug_cleanup:
ftrace_release_mod(mod);
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
module_arch_cleanup(mod);
free_modinfo:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
unlink_mod:
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
mod_tree_remove(mod);
wake_up_all(&module_wq);
/* Wait for RCU-sched synchronizing before releasing mod->list. */
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
mod_stat_bump_invalid(info, flags);
/* Free lock-classes; relies on the preceding sync_rcu() */
for_class_mod_mem_type(type, core_data) {
lockdep_free_key_range(mod->mem[type].base,
mod->mem[type].size);
}

module_deallocate(mod, info);
free_copy:
/*
 * The info->len is always set. We distinguish between
 * failures once the proper module was allocated and
 * before that.
 */
if (!module_allocated) {
audit_log_kern_module(info->name ? info->name : "?");
mod_stat_bump_becoming(info, flags);
}
free_copy(info, flags);
return err;
}

此时, ko 的各段已经被解析并加载到了内存当中.

然后重点看 do_init_module,这是 module 真正被执行发生的地方.

这里主要是对模块的 module 结构体进行解析.

其间通过 do_one_initcall(mod->init) 真正调用模块的 init 函数.

struct module {
// enum module_state {
// MODULE_STATE_LIVE, // module is up and running
// MODULE_STATE_COMING, // module is being initialized
// MODULE_STATE_GOING, // module is being unloaded
// MODULE_STATE_UNFORMED // just allocated, not fully set up
// };
enum module_state state; // current state of this module

/* Member of list of modules */
struct list_head list; // linked into the global module list

/* Unique handle for this module */
char name[MODULE_NAME_LEN]; // module name
...
/* Sysfs stuff. */
struct module_kobject mkobj;
struct module_attribute *modinfo_attrs;
const char *version;
const char *srcversion;
struct kobject *holders_dir; // directory of modules depending on this one (/sys/module/<name>/holders/)

/* Exported symbols */
const struct kernel_symbol *syms;
const s32 *crcs;
unsigned int num_syms;
...
struct kernel_param *kp;
unsigned int num_kp;

/* GPL-only exported symbols. */
unsigned int num_gpl_syms;
const struct kernel_symbol *gpl_syms;
const s32 *gpl_crcs;
bool using_gplonly_symbols;
...
bool async_probe_requested;

/* Exception table */
unsigned int num_exentries;
struct exception_table_entry *extable;

/* Startup function. */
// module entry point, i.e. the function registered with module_init()
int (*init)(void);

// per-type memory layout of the module
struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align;

/* Arch-specific module values */
struct mod_arch_specific arch;

unsigned long taints; /* same bits as kernel:taint_flags */

...
/* The command line arguments (may be mangled). People like
keeping pointers to this stuff */
char *args;
...
void *noinstr_text_start;
unsigned int noinstr_text_size;
...
} ____cacheline_aligned __randomize_layout;

/*
 * do_init_module: runs the module's init function, transitions the module
 * to MODULE_STATE_LIVE, and queues its __init memory for freeing.
 */
static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;
#if defined(CONFIG_MODULE_STATS)
unsigned int text_size = 0, total_size = 0;

for_each_mod_mem_type(type) {
const struct module_memory *mod_mem = &mod->mem[type];
if (mod_mem->size) {
total_size += mod_mem->size;
if (type == MOD_TEXT || type == MOD_INIT_TEXT)
text_size += mod_mem->size;
}
}
#endif

freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
if (!freeinit) {
ret = -ENOMEM;
goto fail;
}
freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;

do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
// this is where the module's init function is actually executed
ret = do_one_initcall(mod->init);
if (ret < 0) {
goto fail_free_freeinit;
}
if (ret > 0) {
pr_warn("%s: '%s'->init suspiciously returned %d, it should "
"follow 0/-E convention\n"
"%s: loading module anyway...\n",
__func__, mod->name, ret, __func__);
dump_stack();
}

/* Now it's a first class citizen! */
mod->state = MODULE_STATE_LIVE;
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);

/* Delay uevent until module has finished its init routine */
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);

/*
 * We need to finish all async code before the module init sequence
 * is done. This has potential to deadlock if synchronous module
 * loading is requested from async (which is not allowed!).
 *
 * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
 * request_module() from async workers") for more details.
 */
if (!mod->async_probe_requested)
async_synchronize_full();

ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
for_class_mod_mem_type(type, init) {
mod->mem[type].base = NULL;
mod->mem[type].size = 0;
}

#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
mod->btf_data = NULL;
#endif
/*
 * We want to free module_init, but be aware that kallsyms may be
 * walking this with preempt disabled. In all the failure paths, we
 * call synchronize_rcu(), but we don't want to slow down the success
 * path. module_memfree() cannot be called in an interrupt, so do the
 * work and call synchronize_rcu() in a work queue.
 *
 * Note that module_alloc() on most architectures creates W+X page
 * mappings which won't be cleaned up until do_free_init() runs. Any
 * code such as mark_rodata_ro() which depends on those mappings to
 * be cleaned up needs to sync with the queued work by invoking
 * flush_module_init_free_work().
 */
if (llist_add(&freeinit->node, &init_free_list))
schedule_work(&init_free_wq);

mutex_unlock(&module_mutex);
wake_up_all(&module_wq);

mod_stat_add_long(text_size, &total_text_size);
mod_stat_add_long(total_size, &total_mod_size);

mod_stat_inc(&modcount);

return 0;

fail_free_freeinit:
kfree(freeinit);
fail:
/* Try to protect us from buggy refcounters. */
mod->state = MODULE_STATE_GOING;
synchronize_rcu();
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);

return ret;
}