// mem/pagetable.cc -- Generic page table implementation
//
// Most architectures should be able to use this as is, though
// architectures with weird paging hardware can provide their own implementation.
//
// OPT: Dynamically adjust the number of pagetable levels for PTEs that
// support it (mainly for generic-pte).
//
// This software is copyright (c) 2006 Scott Wood <scott@buserror.net>.
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors or contributors be held liable for any damages
// arising from the use of this software.
//
// Permission is hereby granted to everyone, free of charge, to use, copy,
// modify, prepare derivative works of, publish, distribute, perform,
// sublicense, and/or sell copies of the Software, provided that the above
// copyright notice and disclaimer of warranty be included in all copies or
// substantial portions of this software.

#include <kern/pagealloc.h>
#include <kern/generic-pagetable.h>
#include <kern/generic-pte.h>
#include <lowlevel/atomic.h>
#include <util/misc.h>

PageTable *kernel_page_table;

// The architecture must specify at least one data structure
// representing one PTE. Non-directory PTEs must support the |
// operation, either through overloading or providing a constructor and
// conversion operator for an integral type. It is assumed that a PTE
// of zero (by using bzero on the table) is a reasonable invalid PTE,
// but a PTE-specific bulk zero method could be added if necessary.
//
// Eventually multiple PTE formats will need to be supported in
// order to dynamically choose between PAE and non-PAE on 32-bit
// x86. When that happens, the arch will instantiate its choice
// of PageTableImpl template rather than export Arch::PTE.
//
// A PTE must support typedefs PTE::PhysAddr and PTE::VirtAddr, which
// refer to integer types of the same size as the supported physical
// and virtual addresses, respectively.
//
// A PTE must support a typedef PTE::DirPTE which is used for
// non-final page tables. DirPTE may be the same as PTE. DirPTE must
// support valid_pte, set_pte, and shift_per_level as in a normal PTE,
// and must implement the following methods:
//
// void *get_table()
//
// A function to return the virtual address of the table pointed to
// by this DirPTE entry.
//
// static DirPTE set_table(void *table)
//
// A function to return a DirPTE pointing to the specified virtual
// address.
//
// A normal PTE must support the following methods:
//
// static void flags_to_pte(Mem::PTEFlags flagsin,
//                          Mem::PTEFlags maskin,
//                          PTE &pteout, PTE &maskout)
//
// A function to turn Mem::PTEFlags into a PTE. It also produces
// a mask which can be used to produce a new PTE by calling
// oldpte.set_flags(mask, newpteflags).
//
// PTE set_flags(PTE mask, PTE flags)
//
// Apply the flags to the PTE according to the provided mask, and
// return the previous contents of the PTE.
//
// Mem::PTEFlags pte_to_flags()
//
// A function to turn a PTE into Mem::PTEFlags.
//
// static uint addr_to_offset(VirtAddr addr, int shift)
//
// A function to take a virtual address and a shift level and return
// the offset into the page table in entries.
//
// PhysAddr pte_to_addr()
//
// A function to take a PTE and return the physical address contained
// in it.
//
// static PTE addr_to_pte(PhysAddr phys)
//
// A function to take a physical address and return a PTE with that
// address and no flags set.
//
// bool valid_pte(), bool dirty_pte()
//
// A function to return whether the PTE is valid/dirty or not. This
// is a shortcut to keep from having to do a pte_to_flags repeatedly.
// It would have been slightly cleaner to make this a method of PTE,
// but that would require that PTE be implemented as a complex type,
// and I'd rather leave that up to the architecture.
//
// void set_pte(PTE *table, uint offset)
//
// A function to set a PTE in a page table. Normally, this is just
// a simple assignment, but some architectures may need to do something
// unusual (such as ensure atomicity if the PTE size is greater than
// the word size).
//
// PTE xchg_pte(PTE *table, uint offset)
//
// As set_pte, but atomically reads the old PTE while setting the
// new PTE, and returns the old one.
//
// A PTE must have the following constants:
//
// shift_per_level:
// The number of bits of virtual address space represented by one
// level of page tables. This is log2(number of pages per table).
//
// num_levels:
// The number of page table levels; this is used, but not imported,
// due to a conflict with PageTable::num_levels.
//
// page_size: the size of a page
// page_shift: log2(page_size)
//
// kmap_start, kmap_end:
// The kernel mappings portion of the address space is mapped into all
// address spaces, using shared page tables. This sharing occurs at
// the top-level page table (hopefully this will work for all
// architectures; it can be made configurable, at the cost of some
// complexity). kmap_start and kmap_end are indices into the top
// level page table that define which region is shared. These are only
// relevant for process address spaces.
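//
// Purely as an illustration (this is not any real architecture's PTE
// format, and nothing in this file uses it), a minimal 32-bit PTE
// satisfying the contract above might look roughly like this:
//
//	struct ExamplePTE {
//		typedef u32 PhysAddr;
//		typedef u32 VirtAddr;
//		typedef ExamplePTE DirPTE;
//
//		u32 raw;
//
//		ExamplePTE() : raw(0) {}
//		ExamplePTE(u32 init) : raw(init) {}
//		operator u32() const { return raw; }
//
//		static const ulong page_size = 4096;
//		static const int page_shift = 12;
//		static const int shift_per_level = 10;
//		static const int num_levels = 2;
//
//		// e.g. a 3G/1G user/kernel split on a 1024-entry top level,
//		// so the top quarter of every top-level table is shared:
//		static const uint kmap_start = 768, kmap_end = 1023;
//
//		static uint addr_to_offset(VirtAddr addr, int shift) {
//			return (addr >> shift) & ((1 << shift_per_level) - 1);
//		}
//
//		PhysAddr pte_to_addr() { return raw & ~(u32)(page_size - 1); }
//
//		static ExamplePTE addr_to_pte(PhysAddr phys) {
//			return ExamplePTE(phys & ~(u32)(page_size - 1));
//		}
//
//		bool valid_pte() { return raw & 1; }
//
//		void set_pte(ExamplePTE *table, uint offset) {
//			table[offset] = *this;
//		}
//
//		// ...plus dirty_pte, addronly_pte, xchg_pte, set_flags,
//		// flags_to_pte, pte_to_flags, get_table, and set_table,
//		// as described above.
//	};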

using Util::round_up;

template<typename PTE>
void PageTableImpl<PTE>::end_map(RegionWithOffset region, PTE flags,
                                 void *table)
{
	uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
	uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

	Page *page = kvirt_to_page(table);

	assert(start < pages_per_table());
	assert(end < pages_per_table());
	assert(page->flags & Page::InUse);

	PTE *ptable = static_cast<PTE *>(table);

	for (uint i = start; i <= end; i++) {
		PTE newpte = PTE::addr_to_pte(region.offset) | flags;
		PTE oldpte = newpte.xchg_pte(ptable, i);

		if (!newpte.addronly_pte())
			retain_if_phys(region.offset);

		if (oldpte.valid_pte()) {
			// vaddr is only used for process aspaces, so don't worry
			// about the (ulong) cast truncating it.

			ulong vaddr = (ulong)region.start +
			              ((i - start) << PTE::page_shift);

			kill_pte(vaddr, oldpte.pte_to_addr(),
			         oldpte.dirty_pte(), oldpte.valid_pte(),
			         oldpte.addronly_pte());
		}

		region.offset += PTE::page_size;
	}
}

template<typename PTE>
void PageTableImpl<PTE>::rec_map(RegionWithOffset region, PTE flags,
                                 void *table, int shift)
{
	if (shift < lastlevel_shift) {
		assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
		end_map(region, flags, table);
		return;
	}

	Page *page = kvirt_to_page(table);

	DirPTE *dtable = static_cast<DirPTE *>(table);
	uint start = DirPTE::addr_to_offset(region.start, shift);
	uint end = DirPTE::addr_to_offset(region.end, shift);
	u64 orig_end = region.end;

	assert(start < pages_per_dtable());
	assert(end < pages_per_dtable());
	assert(page->flags & Page::InUse);
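
	// Each pass through the loop below covers the slice of the region
	// that falls under one entry of this table: region.end is first
	// clamped to the end of the first entry's coverage, and after each
	// recursion the window advances by one entry's span (1UL << shift),
	// with the last slice ending at the caller's original end address.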
	if (start != end)
		region.end = round_up(region.start + 1, shift) - 1;

	for (uint i = start; i <= end; i++) {
		void *subtable;

		if (!dtable[i].valid_pte()) {
			subtable = Mem::alloc_pages(1);
			bzero(subtable, PTE::page_size);
			DirPTE newpte = DirPTE::set_table(subtable);
			newpte.set_pte(dtable, i);
		} else {
			subtable = dtable[i].get_table();
		}

		rec_map(region, flags, subtable, shift - DirPTE::shift_per_level);

		region.offset += region.end - region.start + 1;
		region.start = region.end + 1;

		if (i + 1 == end)
			region.end = orig_end;
		else
			region.end += 1UL << shift;
	}
}

template <typename PTE>
void PageTableImpl<PTE>::end_unmap(Region region, void *table)
{
	Page *page = kvirt_to_page(table);
	uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
	uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

	assert(start < pages_per_table());
	assert(end < pages_per_table());
	assert(page->flags & Page::InUse);

	PTE *ptable = static_cast<PTE *>(table);

	for (uint i = start; i <= end; i++) {
		PTE oldpte = PTE().xchg_pte(ptable, i);

		if (oldpte.valid_pte()) {
			// vaddr is only used for process aspaces, so don't worry
			// about the (ulong) cast truncating it.

			ulong vaddr = (ulong)region.start +
			              ((i - start) << PTE::page_shift);

			kill_pte(vaddr, oldpte.pte_to_addr(),
			         oldpte.dirty_pte(), oldpte.valid_pte(),
			         oldpte.addronly_pte());

			assert(page->inuse.refcount > 1);
		}
	}
}

template <typename PTE>
void PageTableImpl<PTE>::rec_unmap(Region region, void *table, int shift)
{
	if (shift < lastlevel_shift) {
		assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
		end_unmap(region, table);
		return;
	}

	Page *page = kvirt_to_page(table);
	uint start = DirPTE::addr_to_offset(region.start, shift);
	uint end = DirPTE::addr_to_offset(region.end, shift);
	u64 orig_end = region.end;

	assert(start < pages_per_dtable());
	assert(end < pages_per_dtable());
	assert(page->flags & Page::InUse);

	DirPTE *dtable = static_cast<DirPTE *>(table);

	if (start != end)
		region.end = round_up(region.start + 1, shift) - 1;

	for (uint i = start; i <= end; i++) {
		if (dtable[i].valid_pte()) {
			void *subtable = dtable[i].get_table();

			rec_unmap(region, subtable, shift - DirPTE::shift_per_level);

			Page *subpage = kvirt_to_page(subtable);
			assert(subpage->flags & Page::InUse);
			assert(subpage->inuse.refcount > 0);

			if (subpage->inuse.refcount == 1) {
				DirPTE().set_pte(dtable, i);

				assert(page->inuse.refcount > 1);
			}
		}

		region.start = region.end + 1;

		if (i + 1 == end)
			region.end = orig_end;
		else
			region.end += 1UL << shift;
	}
}

template <typename PTE>
void PageTableImpl<PTE>::end_set_flags(Region region, PTE flags,
                                       PTE mask, void *table)
{
	uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
	uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

	assert(start < pages_per_table());
	assert(end < pages_per_table());

	PTE *ptable = static_cast<PTE *>(table);

	for (uint i = start; i <= end; i++) {
		PTE oldpte = ptable[i].set_flags(mask, flags);

		if (oldpte.valid_pte()) {
			// vaddr is only used for process aspaces, so don't worry
			// about the (ulong) cast truncating it.

			ulong vaddr = (ulong)region.start +
			              ((i - start) << PTE::page_shift);

			kill_pte(vaddr, oldpte.pte_to_addr(),
			         oldpte.dirty_pte(), oldpte.valid_pte(), true);
		}
	}
}

template <typename PTE>
void PageTableImpl<PTE>::rec_set_flags(Region region, PTE flags,
                                       PTE mask, void *table,
                                       int shift)
{
	if (shift < lastlevel_shift) {
		assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
		shift = PTE::page_shift;
		end_set_flags(region, flags, mask, table);
		return;
	}

	uint start = DirPTE::addr_to_offset(region.start, shift);
	uint end = DirPTE::addr_to_offset(region.end, shift);
	u64 orig_end = region.end;

	assert(start < pages_per_dtable());
	assert(end < pages_per_dtable());

	DirPTE *dtable = static_cast<DirPTE *>(table);

	if (start != end)
		region.end = round_up(region.start + 1, shift) - 1;

	for (uint i = start; i <= end; i++) {
		if (dtable[i].valid_pte()) {
			void *subtable = dtable[i].get_table();

			rec_set_flags(region, flags, mask, subtable,
			              shift - DirPTE::shift_per_level);
		}

		region.start = region.end + 1;

		if (i + 1 == end)
			region.end = orig_end;
		else
			region.end += 1UL << shift;
	}
}

template <typename PTE>
void PageTableImpl<PTE>::map(RegionWithOffset region, Flags flags)
{
	Lock::AutoLock autolock(lock);
	PTE pte, ptemask;
	PTE::flags_to_pte(flags, ~0UL, pte, ptemask);
	PTE *table = static_cast<PTE *>(toplevel);
	rec_map(region, pte, table, toplevel_shift);
}

template <typename PTE>
void PageTableImpl<PTE>::unmap(Region region)
{
	Lock::AutoLock autolock(lock);
	PTE *table = static_cast<PTE *>(toplevel);
	rec_unmap(region, table, toplevel_shift);
}

template <typename PTE>
void PageTableImpl<PTE>::set_flags(Region region, Flags flags, Flags mask)
{
	Lock::AutoLock autolock(lock);
	PTE pte, ptemask;
	PTE::flags_to_pte(flags, mask, pte, ptemask);
	PTE *table = static_cast<PTE *>(toplevel);
	rec_set_flags(region, pte, ptemask, table, toplevel_shift);
}

template <typename PTE>
void PageTableImpl<PTE>::get_mapping(u64 addr, u64 *phys, Flags *flags)
{
	Lock::AutoLock autolock(lock);
	int shift = toplevel_shift;

	void *table = toplevel;

	while (shift >= lastlevel_shift) {
		DirPTE *dtable = static_cast<DirPTE *>(table);
		DirPTE dpte = dtable[DirPTE::addr_to_offset(addr, shift)];
		if (!dpte.valid_pte()) {
			*phys = 0;
			*flags = Flags();
			return;
		}

		table = dpte.get_table();
		shift -= DirPTE::shift_per_level;
	}

	assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);

	PTE *ptable = static_cast<PTE *>(table);

	uint off = PTE::addr_to_offset(addr, PTE::page_shift);

	*phys = ptable[off].pte_to_addr();
	*flags = ptable[off].pte_to_flags();
}

template <typename PTE>
PageTableImpl<PTE>::PageTableImpl(bool process) : PageTable(process)
{
	toplevel = Mem::alloc_pages(1);
	PTE *table = static_cast<PTE *>(toplevel);

	if (process) {
		num_levels = PTE::num_levels;

		if (PTE::kmap_start != 0)
			bzero(table, PTE::kmap_start * sizeof(PTE));

		if (PTE::kmap_end != pages_per_dtable() - 1)
			bzero(table + PTE::kmap_end + 1,
			      (pages_per_dtable() - PTE::kmap_end - 1) * sizeof(PTE));

		PTE *ktable = static_cast<PTE *>(kernel_page_table->toplevel);

		memcpy(table + PTE::kmap_start, ktable + PTE::kmap_start,
		       (PTE::kmap_end - PTE::kmap_start + 1) * sizeof(PTE));
	} else {
		// FIXME: growable levels
		num_levels = PTE::num_levels;
		bzero(table, PTE::page_size);
	}

	toplevel_shift = lastlevel_shift = PTE::page_shift;
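
	// Illustrative numbers only: with 4 KiB pages and 1024-entry tables
	// (page_shift = 12, shift_per_level = 10) a two-level arch ends up
	// with lastlevel_shift = toplevel_shift = 22, while a four-level
	// arch with 512-entry tables (shift_per_level = 9) ends up with
	// lastlevel_shift = 21 and toplevel_shift = 12 + 9 + 2 * 9 = 39.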
	if (num_levels > 1) {
		lastlevel_shift += PTE::shift_per_level;
		toplevel_shift += PTE::shift_per_level +
		                  (num_levels - 2) * DirPTE::shift_per_level;
	}
}

template <typename PTE>
PageTableImpl<PTE>::PageTableImpl(void *table) : PageTable(true)
{
	assert(!kernel_page_table);

	toplevel = table;
	num_levels = PTE::num_levels;
	kernel_page_table = this;
}

template <typename PTE>
PageTableImpl<PTE>::~PageTableImpl()
{
	assert(this != kernel_page_table);

	if (is_process) {
		Region region1 = { 0, ((VirtAddr)PTE::kmap_start << toplevel_shift) - 1 };
		Region region2 = { ((VirtAddr)PTE::kmap_end + 1) << toplevel_shift, ~0UL };

		if (PTE::kmap_start != 0)
			unmap(region1);

		if (PTE::kmap_end != pages_per_dtable() - 1)
			unmap(region2);
	} else {
		Region region = { 0, ~0UL };

		unmap(region);
	}

	Page *page = kvirt_to_page(toplevel);
	assert(page->flags & Page::InUse);
	assert(page->inuse.refcount == 1);

	Mem::free_pages(toplevel, 1);
}

template class PageTableImpl<Arch::PTE>;
template class PageTableImpl<GenPTE>;
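
// Illustrative usage sketch only: "vbase", "pbase", "len", and "flags"
// are made-up names, and this assumes callers reach map()/unmap() through
// the PageTable base class, as the kernel_page_table pointer above suggests.
//
//	RegionWithOffset rwo;
//	rwo.start = vbase;            // first virtual address to map
//	rwo.end = vbase + len - 1;    // last virtual address to map
//	rwo.offset = pbase;           // physical address backing rwo.start
//	kernel_page_table->map(rwo, flags);
//
//	Region r = { vbase, vbase + len - 1 };
//	kernel_page_table->unmap(r);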