-
Notifications
You must be signed in to change notification settings - Fork 0
/
k-init.cc
378 lines (306 loc) · 12.4 KB
/
k-init.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
#include "kernel.hh"
#include "lib.hh"
#include "k-ahci.hh"
#include "k-apic.hh"
#include "k-devices.hh"
#include "elf.h"
// sata_disk: pointer to the first SATA disk found
//    Assigned in `init_hardware()` from the result of `ahcistate::find()`.
//    `init_hardware()` tests it for null before use; presumably it stays
//    null when no disk is found (confirm in `ahcistate::find()`).
ahcistate* sata_disk;
// init_hardware
// Initialize hardware. Calls other functions below.
//
// The steps run in a fixed order: early memory structures, console
// cursor, interrupt descriptors, global constructors, the boot CPU
// (`cpus[0]`), the `physical_ranges` bookkeeping, the kernel allocator,
// the other processors, and finally the keyboard IRQ and the SATA disk.
static void init_early_memory();
static void init_interrupts();
static void init_constructors();
static void init_physical_ranges();
static void init_other_processors();
void init_hardware() {
// initialize early-stage virtual memory structures
init_early_memory();
// initialize console position
// (3 * CONSOLE_COLUMNS: presumably the start of the fourth console row)
cursorpos = 3 * CONSOLE_COLUMNS;
// initialize interrupt descriptors and controller
init_interrupts();
// call C++ constructors for global objects
// (NB none of these constructors may allocate memory: the kernel
// allocator is not initialized until `init_kalloc()` below)
init_constructors();
// initialize this CPU; it is the only CPU counted so far
ncpu = 1;
cpus[0].init();
// initialize the `physical_ranges` object that tracks
// kernel and reserved physical memory
init_physical_ranges();
// initialize kernel allocator
init_kalloc();
// initialize other CPUs
init_other_processors();
#if HAVE_SANITIZERS
// after CPUs initialize, enable address sanitization
enable_asan();
#endif
// enable the keyboard IRQ on CPU 0
cpus[0].enable_irq(IRQ_KEYBOARD);
// initialize SATA drive; if one was found and it reports a positive
// IRQ number, enable that IRQ on the highest-numbered CPU
sata_disk = ahcistate::find();
if (sata_disk && sata_disk->irq_ > 0) {
cpus[ncpu - 1].enable_irq(sata_disk->irq_);
}
}
// init_early_memory
// Set up early-stage segment registers and page table.
//
// The early-stage segment registers and global descriptors are
// used during hardware and secondary-processor initialization.
// Once a CPU boots, it sets up its own segment registers and
// global descriptors; see cpustate::init_cpu_hardware(). The
// early-stage page table is used whenever no appropriate process
// page table exists.
//
// The interrupt descriptor table tells the processor where to jump
// when an interrupt or exception happens. See k-interrupt.S.
//
// The layouts of these types are defined by the hardware.
// set_app_segment(segment, type, dpl)
//    Write an 8-byte code/data segment descriptor to `*segment`. The
//    descriptor combines the caller's `type` bits with the code/data
//    flag (`X86SEG_S`), the privilege level `dpl` at bit 45, and the
//    present flag (`X86SEG_P`).
static void set_app_segment(uint64_t* segment, uint64_t type, int dpl) {
    uint64_t descriptor = type;
    descriptor |= X86SEG_S;                 // code/data segment
    descriptor |= (uint64_t) dpl << 45;     // descriptor privilege level
    descriptor |= X86SEG_P;                 // segment present
    *segment = descriptor;
}
// set_sys_segment(segment, addr, size, type, dpl)
//    Write a 16-byte system segment descriptor (for example, a task
//    state segment descriptor) to `segment[0]` and `segment[1]`.
//
//    `addr` is the segment's base address and `size` its size in bytes;
//    the descriptor stores the limit `size - 1`. `type` supplies the
//    descriptor type bits and `dpl` the privilege level (bit 45).
//
//    Field placement in `segment[0]`:
//      limit 15:0  -> bits 15:0      base 23:0  -> bits 39:16
//      limit 19:16 -> bits 51:48     base 31:24 -> bits 63:56
//    `segment[1]` holds base 63:32.
static void set_sys_segment(uint64_t* segment, uintptr_t addr, size_t size,
                            uint64_t type, int dpl) {
    const uint64_t limit = size - 1;
    segment[0] = ((addr & 0x0000000000FFFFFFUL) << 16)
        | ((addr & 0x00000000FF000000UL) << 32)
        | (limit & 0x0FFFFUL)
        // Limit bits 19:16 move up by 32 to land in bits 51:48. (A shift
        // of 48 would push them past bit 63 and silently drop them.)
        | ((limit & 0xF0000UL) << 32)
        | type
        | ((uint64_t) dpl << 45)
        | X86SEG_P;                         // segment present
    segment[1] = addr >> 32;
}
// set_gate(gate, addr, type, dpl, ist)
//    Fill in the interrupt gate descriptor `*gate` so that it refers to
//    the handler at `addr` in the kernel code segment.
//
//    `type` must be < 16, `dpl` < 4, and `ist` < 8 (asserted).
//    `gd_low` receives handler bits 15:0 (bits 15:0), the kernel code
//    selector (from bit 16), `ist` (from bit 32), `type` (from bit 40),
//    `dpl` (from bit 45), the present flag, and handler bits 31:16
//    (bits 63:48). `gd_high` receives handler bits 63:32.
static void set_gate(x86_64_gatedescriptor* gate, uintptr_t addr,
                     int type, int dpl, int ist) {
    assert(unsigned(type) < 16 && unsigned(dpl) < 4 && unsigned(ist) < 8);

    // split the handler address into the three pieces the layout wants
    uint64_t offset_low = addr & 0x000000000000FFFFUL;
    uint64_t offset_mid = addr & 0x00000000FFFF0000UL;
    uint64_t offset_high = addr >> 32;

    uint64_t low = offset_low;
    low |= SEGSEL_KERN_CODE << 16;
    low |= uint64_t(ist) << 32;
    low |= uint64_t(type) << 40;
    low |= uint64_t(dpl) << 45;
    low |= X86SEG_P;
    low |= offset_mid << 32;

    gate->gd_low = low;
    gate->gd_high = offset_high;
}
// Early-stage structures, all placed in the `.lowdata` section and
// filled in by `init_early_memory()`.
// early_pagetable: [0] is the level-4 page table; [1] and [2] are the
//    two level-3 page tables it points to.
x86_64_pagetable __section(".lowdata") early_pagetable[3];
// early_gdt_segments: slot 0 is zero; the kernel code and kernel data
//    descriptors go at `SEGSEL_KERN_CODE >> 3` and `SEGSEL_KERN_DATA >> 3`.
uint64_t __section(".lowdata") early_gdt_segments[3];
// early_gdt: the limit/base pseudo-descriptor passed to `lgdt`.
x86_64_pseudodescriptor __section(".lowdata") early_gdt;
// init_early_memory
//    Build and install the early-stage global descriptor table
//    (`early_gdt_segments`, loaded via `early_gdt`) and the early-stage
//    page table (`early_pagetable`, loaded into `%cr3`).
void init_early_memory() {
    // initialize segment descriptors for kernel code and data
    early_gdt_segments[0] = 0;
    set_app_segment(&early_gdt_segments[SEGSEL_KERN_CODE >> 3],
                    X86SEG_X | X86SEG_L, 0);
    set_app_segment(&early_gdt_segments[SEGSEL_KERN_DATA >> 3],
                    X86SEG_W, 0);
    early_gdt.limit = sizeof(early_gdt_segments) - 1;
    early_gdt.base = (uint64_t) early_gdt_segments;
    // `lgdt` reads the whole pseudo-descriptor (limit and base), and the
    // processor then consults the descriptors it points to, but the only
    // operand named here is `early_gdt.limit`. The "memory" clobber
    // tells the compiler that all the stores above must be complete
    // before the instruction executes (same as the `lgdt` in
    // `cpustate::init_cpu_hardware()`).
    asm volatile("lgdt %0" : : "m" (early_gdt.limit) : "memory");

    // initialize early page table
    memset(early_pagetable, 0, sizeof(early_pagetable));

    // level-4 page table:
    // - entry 0 maps first 512GiB of low canonical addresses
    // - entry 256 maps first 512GiB of high canonical addresses
    // - entry 511 maps last 2GiB of high canonical addresses (kernel text)
    early_pagetable->entry[0] = ktext2pa(&early_pagetable[1]) | PTE_P | PTE_W;
    early_pagetable->entry[256] = early_pagetable->entry[0];
    early_pagetable->entry[511] = ktext2pa(&early_pagetable[2]) | PTE_P | PTE_W;

    // first level-3 page table maps first 512GiB of physical memory
    // (low canonical and high canonical); each entry is a 1GiB mapping
    // (`p << 30` with `PTE_PS`)
    for (uintptr_t p = 0; p < 512; ++p) {
        early_pagetable[1].entry[p] = (p << 30) | PTE_P | PTE_W | PTE_PS;
    }

    // second level-3 page table maps its last 2 slots to the first 2GiB
    // of physical memory (kernel text)
    early_pagetable[2].entry[510] = early_pagetable[1].entry[0];
    early_pagetable[2].entry[511] = early_pagetable[1].entry[1];

    wrcr3(ktext2pa(early_pagetable));

    // Now that boot-time structures (pagetable and global descriptor
    // table) have been replaced, we can reuse boot-time memory.
}
extern x86_64_gatedescriptor interrupt_descriptors[256];

// init_interrupts
//    Convert every entry of `interrupt_descriptors[]` into a proper
//    interrupt gate, check that the machine has an enabled local APIC at
//    the expected address and an IOAPIC, and mask both legacy
//    programmable interrupt controllers.
void init_interrupts() {
    // Macros in `k-exception.S` initialized `interrupt_descriptors[]` with
    // function pointers in the `gd_low` members. Repack each entry into
    // the gate format x86-64 expects.
    for (int vec = 0; vec != 256; ++vec) {
        // only the INT_BP gate gets descriptor privilege level 3
        int gate_dpl = 0;
        if (vec == INT_BP) {
            gate_dpl = 3;
        }
        // the INT_DB, INT_NM, and INT_MC gates use interrupt stack 1
        int gate_ist = 0;
        if (vec == INT_DB || vec == INT_NM || vec == INT_MC) {
            gate_ist = 1;
        }
        uintptr_t handler = interrupt_descriptors[vec].gd_low;
        set_gate(&interrupt_descriptors[vec], handler,
                 X86GATE_INTERRUPT, gate_dpl, gate_ist);
    }

    // ensure machine has an enabled APIC
    assert(cpuid(1).edx & (1 << 9));
    uint64_t apic_base = rdmsr(MSR_IA32_APIC_BASE);
    assert(apic_base & IA32_APIC_BASE_ENABLED);
    assert((apic_base & 0xFFFFFFFFF000) == lapicstate::lapic_pa);

    // ensure machine has an IOAPIC
    auto& ioapic = ioapicstate::get();
    uint32_t ioapic_ver = ioapic.read(ioapic.reg_ver);
    assert((ioapic_ver & 0xFF) == 0x11 || (ioapic_ver & 0xFF) == 0x20);
    assert((ioapic_ver >> 16) >= 0x17);

    // disable the old programmable interrupt controller by writing 0xFF
    // to port base + 1 of each controller
#define IO_PIC1 0x20 // Master (IRQs 0-7)
#define IO_PIC2 0xA0 // Slave (IRQs 8-15)
    outb(IO_PIC1 + 1, 0xFF);
    outb(IO_PIC2 + 1, 0xFF);
}
// init_constructors
//    Call, in order, every function pointer stored in the array that
//    runs from `__init_array_start` up to (not including)
//    `__init_array_end`.
void init_constructors() {
    typedef void (*constructor_function)();
    extern constructor_function __init_array_start[];
    extern constructor_function __init_array_end[];

    constructor_function* next = __init_array_start;
    while (next != __init_array_end) {
        constructor_function ctor = *next;
        ctor();
        ++next;
    }
}
memrangeset<16> physical_ranges(0x100000000UL);
void init_physical_ranges() {
// [0, MEMSIZE_PHYSICAL) starts out available
physical_ranges.set(0, MEMSIZE_PHYSICAL, mem_available);
// 0 page is reserved (because nullptr)
physical_ranges.set(0, PAGESIZE, mem_reserved);
// I/O memory is reserved (except the console is `mem_console`)
physical_ranges.set(PA_IOLOWMIN, PA_IOLOWEND, mem_reserved);
physical_ranges.set(PA_IOHIGHMIN, PA_IOHIGHEND, mem_reserved);
physical_ranges.set(ktext2pa(console), ktext2pa(console) + PAGESIZE,
mem_console);
// kernel text and data is owned by the kernel
extern unsigned char _low_data_start[], _low_data_end[];
physical_ranges.set(round_down(ktext2pa(_low_data_start), PAGESIZE),
round_up(ktext2pa(_low_data_end), PAGESIZE),
mem_kernel);
extern unsigned char _kernel_start[], _kernel_end[];
physical_ranges.set(round_down(ktext2pa(_kernel_start), PAGESIZE),
round_up(ktext2pa(_kernel_end), PAGESIZE),
mem_kernel);
// reserve memory for debugging facilities
extern elf_symtabref symtab;
if (symtab.size) {
auto sympa = ktext2pa(symtab.sym);
physical_ranges.set(round_down(sympa, PAGESIZE),
round_up(sympa + symtab.size, PAGESIZE),
mem_kernel);
}
#if HAVE_SANITIZERS
init_sanitizers();
#endif
// `physical_ranges` is constant after this point.
}
extern "C" { void syscall_entry(); }
// cpustate::init_cpu_hardware
//    Set up the hardware state of the CPU this `cpustate` describes:
//    per-CPU segment descriptors and task state, the descriptor-table
//    and task registers, `%fs`/`%gs`, `%cr0`, the syscall MSRs, and the
//    local APIC (timer, LINT lines, error vector).
void cpustate::init_cpu_hardware() {
// initialize per-CPU segments: a zero descriptor, kernel code and data
// (DPL 0), application code and data (DPL 3), and the task state
// segment covering `taskstate_`
gdt_segments_[0] = 0;
set_app_segment(&gdt_segments_[SEGSEL_KERN_CODE >> 3],
X86SEG_X | X86SEG_L, 0);
set_app_segment(&gdt_segments_[SEGSEL_KERN_DATA >> 3],
X86SEG_W, 0);
set_app_segment(&gdt_segments_[SEGSEL_APP_CODE >> 3],
X86SEG_X | X86SEG_L, 3);
set_app_segment(&gdt_segments_[SEGSEL_APP_DATA >> 3],
X86SEG_W, 3);
set_sys_segment(&gdt_segments_[SEGSEL_TASKSTATE >> 3],
(uintptr_t) &taskstate_, sizeof(taskstate_),
X86SEG_TSS, 0);
// clear the task state, then record two stack addresses, both computed
// as offsets from the address of this cpustate. `init_interrupts()`
// assigns interrupt stack index 1 to the INT_DB, INT_NM, and INT_MC
// gates, which corresponds to `ts_ist[1]` here.
memset(&taskstate_, 0, sizeof(taskstate_));
taskstate_.ts_rsp[0] = (uintptr_t) this + CPUSTACK_SIZE;
taskstate_.ts_ist[1] = (uintptr_t) this + CPUALTSTACK_SIZE;
// pseudo-descriptors (limit + base) for the per-CPU GDT and the shared
// `interrupt_descriptors` table
x86_64_pseudodescriptor gdt, idt;
gdt.limit = sizeof(gdt_segments_) - 1;
gdt.base = (uint64_t) gdt_segments_;
idt.limit = sizeof(interrupt_descriptors) - 1;
idt.base = (uint64_t) interrupt_descriptors;
// load segment descriptor tables and the task register; the "memory"
// clobber keeps the stores above ahead of these instructions
asm volatile("lgdt %0; ltr %1; lidt %2"
:
: "m" (gdt.limit),
"r" ((uint16_t) SEGSEL_TASKSTATE),
"m" (idt.limit)
: "memory", "cc");
// initialize segments, including `%gs`, which points at this cpustate
// (the GS base MSR is written after `%gs` itself is loaded;
// NOTE(review): loading the selector is expected to reset the base, so
// this order presumably matters; confirm against the architecture manual)
asm volatile("movw %%ax, %%fs; movw %%ax, %%gs"
: : "a" ((uint16_t) SEGSEL_KERN_DATA));
wrmsr(MSR_IA32_GS_BASE, reinterpret_cast<uint64_t>(this));
// set up control registers: OR the listed flags into the current %cr0
uint32_t cr0 = rdcr0();
cr0 |= CR0_PE | CR0_PG | CR0_WP | CR0_AM | CR0_MP | CR0_NE;
wrcr0(cr0);
// set up syscall/sysret:
// - KERNEL_GS_BASE starts at 0
// - STAR holds the kernel code selector at bit 32 and the application
//   code selector at bit 48
// - LSTAR holds the address of `syscall_entry`
// - FMASK lists the flag bits to be masked on `syscall`
wrmsr(MSR_IA32_KERNEL_GS_BASE, 0);
wrmsr(MSR_IA32_STAR, (uintptr_t(SEGSEL_KERN_CODE) << 32)
| (uintptr_t(SEGSEL_APP_CODE) << 48));
wrmsr(MSR_IA32_LSTAR, reinterpret_cast<uint64_t>(syscall_entry));
wrmsr(MSR_IA32_FMASK, EFLAGS_TF | EFLAGS_DF | EFLAGS_IF
| EFLAGS_IOPL_MASK | EFLAGS_AC | EFLAGS_NT);
// initialize local APIC (interrupt controller), using vector
// INT_IRQ + IRQ_SPURIOUS for spurious interrupts, and remember its ID
auto& lapic = lapicstate::get();
lapic.enable_lapic(INT_IRQ + IRQ_SPURIOUS);
lapic_id_ = lapic.id();
// periodic lapic timer on vector INT_IRQ + IRQ_TIMER, with divide
// setting `timer_divide_1` and initial count 1000000000 / HZ
// (the original comment says this fires every 0.01s; the real period
// depends on HZ and the timer's input clock, TODO confirm)
lapic.write(lapic.reg_timer_divide, lapic.timer_divide_1);
lapic.write(lapic.reg_lvt_timer,
lapic.timer_periodic | (INT_IRQ + IRQ_TIMER));
lapic.write(lapic.reg_timer_initial_count, 1000000000 / HZ);
// disable logical interrupt lines
lapic.write(lapic.reg_lvt_lint0, lapic.lvt_masked);
lapic.write(lapic.reg_lvt_lint1, lapic.lvt_masked);
// set LVT error handling entry: LAPIC errors use vector
// INT_IRQ + IRQ_ERROR
lapic.write(lapic.reg_lvt_error, INT_IRQ + IRQ_ERROR);
// clear error status by reading the error;
// acknowledge any outstanding interrupts
lapic.error();
lapic.ack();
}
// microdelay(amount)
//    Busy-wait until the timestamp counter has advanced by
//    `amount * 10000` ticks from its value on entry, executing `pause`
//    between checks. The signed comparison of the remaining tick count
//    keeps the wait correct across counter wraparound.
static void microdelay(int amount) {
    const uint64_t deadline = rdtsc() + (uint64_t) amount * 10000;
    for (;;) {
        int64_t remaining = (int64_t) (deadline - rdtsc());
        if (remaining <= 0) {
            break;
        }
        asm volatile("pause");
    }
}
// Symbols shared with the processor startup path (defined elsewhere):
// - ap_entry: entry point whose physical address
//   `init_other_processors()` sends in the startup IPIs
// - ap_entry_lock: released by `cpustate::init_ap()` and taken by
//   `init_other_processors()` while it clears `ap_init_allowed`
// - ap_init_allowed: true only while `init_other_processors()` is
//   letting other processors initialize
extern "C" {
extern void ap_entry();
extern spinlock ap_entry_lock;
extern bool ap_init_allowed;
}
// cpustate::init_ap
//    Initialize this cpustate with `init()`, release `ap_entry_lock`,
//    and then call `schedule(nullptr)`.
//    NOTE(review): presumably entered on a newly started processor with
//    `ap_entry_lock` already held by the startup code; confirm in the
//    `ap_entry` implementation.
void cpustate::init_ap() {
init();
// unlock only after `init()` has finished
ap_entry_lock.unlock_noirq();
schedule(nullptr);
}
// init_other_processors
//    Start the other processors by broadcasting an INIT-SIPI-SIPI
//    sequence through the local APIC, wait for them to come up, then
//    clear `ap_init_allowed` and log every CPU's LAPIC ID.
void init_other_processors() {
// convert entry point to an 8-bit vector: the startup IPI carries
// `ap_entry_pa >> 12`. The assertion requires that only bits 12-19 of
// the physical address are set, i.e. the entry point is page-aligned
// and below 1MiB, so the shifted value fits in 8 bits.
uintptr_t ap_entry_pa = ktext2pa(ap_entry);
assert((ap_entry_pa & 0xFFFFFFFFFFF00FFF) == 0);
// (this CPU was already set up as `cpus[0]` by `init_hardware()`)
// mark APs as initializing
ap_init_allowed = true;
// XXX CMOS shutdown code, warm reset vector
// broadcast INIT-SIPI-SIPI; after each IPI, delay and then spin until
// the LAPIC no longer reports an IPI pending
auto& lapic = lapicstate::get();
lapic.ipi_others(lapic.ipi_init);
microdelay(10000);
while (lapic.ipi_pending()) {
}
// first startup IPI
lapic.ipi_others(lapic.ipi_startup, ap_entry_pa >> 12);
microdelay(200);
while (lapic.ipi_pending()) {
}
// second startup IPI
lapic.ipi_others(lapic.ipi_startup, ap_entry_pa >> 12);
// wait for processors to start up
microdelay(20000);
while (lapic.ipi_pending()) {
}
// Clear the flag while holding `ap_entry_lock`.
// NOTE(review): presumably starting processors test `ap_init_allowed`
// under this same lock; confirm in the `ap_entry` implementation.
ap_entry_lock.lock_noirq();
ap_init_allowed = false;
ap_entry_lock.unlock_noirq();
// Now that `ap_init_allowed` is false, no further CPUs will
// initialize.
for (int i = 0; i < ncpu; ++i) {
log_printf("CPU %d: LAPIC ID %d\n", i, cpus[i].lapic_id_);
}
}