// mem/pagetable.cc -- Generic page table implementation
//
// Most architectures should be able to use this as-is, though
// architectures with weird paging hardware can provide their own
// implementation.
//
// OPT: Dynamically adjust the number of pagetable levels for PTEs that
// support it (mainly for generic-pte).
//
// This software is copyright (c) 2006 Scott Wood <scott@buserror.net>.
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors or contributors be held liable for any damages
// arising from the use of this software.
//
// Permission is hereby granted to everyone, free of charge, to use, copy,
// modify, prepare derivative works of, publish, distribute, perform,
// sublicense, and/or sell copies of the Software, provided that the above
// copyright notice and disclaimer of warranty be included in all copies or
// substantial portions of this software.

#include <kern/pagealloc.h>
#include <kern/generic-pagetable.h>
#include <kern/generic-pte.h>
#include <lowlevel/atomic.h>
#include <util/misc.h>

PageTable *kernel_page_table;

// The architecture must specify at least one data structure
// representing one PTE. Non-directory PTEs must support the |
// operation, either through overloading or by providing a constructor
// and conversion operator for an integral type. It is assumed that a
// PTE of zero (as produced by using bzero on the table) is a
// reasonable invalid PTE, but a PTE-specific bulk zero method could be
// added if necessary.
//
// Eventually, multiple PTE formats will need to be supported in
// order to dynamically choose between PAE and non-PAE on 32-bit
// x86. When that happens, the arch will instantiate its choice
// of PageTableImpl template rather than export Arch::PTE.
//
// A PTE must support typedefs PTE::PhysAddr and PTE::VirtAddr, which
// refer to integer types of the same size as the supported physical
// and virtual addresses, respectively.
//
// A PTE must support a typedef PTE::DirPTE which is used for
// non-final page tables. DirPTE may be the same as PTE. DirPTE must
// support valid_pte, set_pte, and shift_per_level as in a normal PTE,
// and must implement the following methods:
//
// void *get_table()
//
// A function to return the virtual address of the table pointed to
// by this DirPTE entry.
//
// static DirPTE set_table(void *table)
//
// A function to return a DirPTE pointing to the specified virtual
// address.
//
// A normal PTE must support the following methods:
//
// static void flags_to_pte(Mem::PTEFlags flagsin,
//                          Mem::PTEFlags maskin,
//                          PTE &pte, PTE &ptemask)
//
// A function to turn Mem::PTEFlags into a PTE. It also produces
// a mask which can be used to produce a new PTE by calling
// oldpte.set_flags(mask, newpteflags).
//
// PTE set_flags(PTE mask, PTE flags)
//
// Apply the flags to the PTE according to the provided mask, and
// return the old PTE.
//
// Mem::PTEFlags pte_to_flags()
//
// A function to turn a PTE into Mem::PTEFlags.
//
// static uint addr_to_offset(VirtAddr addr, int shift)
//
// A function to take a virtual address and a shift level and return
// the offset into the page table, in entries.
//
// PhysAddr pte_to_addr()
//
// A function to take a PTE and return the physical address contained
// in it.
//
// static PTE addr_to_pte(PhysAddr phys)
//
// A function to take a physical address and return a PTE with that
// address and no flags set.
//
// bool valid_pte(), bool dirty_pte()
//
// Functions to return whether the PTE is valid/dirty or not. These
// are shortcuts to keep from having to do a pte_to_flags repeatedly.
// It would have been slightly cleaner to make this a method of PTE,
// but that would require that PTE be implemented as a complex type,
// and I'd rather leave that up to the architecture.
//
// void set_pte(PTE *table, uint offset)
//
// A function to set a PTE in a page table. Normally, this is just
// a simple assignment, but some architectures may need to do
// something unusual (such as ensuring atomicity if the PTE size is
// greater than the word size).
//
// PTE xchg_pte(PTE *table, uint offset)
//
// As set_pte, but atomically reads the old PTE while setting the
// new PTE, and returns the old one.
//
// A PTE must have the following constants:
//
// shift_per_level:
// The number of bits of virtual address space represented by one
// level of page tables. This is log2(number of pages per table).
//
// num_levels:
// The number of page table levels; this is used, but not imported,
// due to a conflict with PageTable::num_levels.
//
// page_size: the size of a page
// page_shift: log2(page_size)
//
// kmap_start, kmap_end:
// The kernel mappings portion of the address space is mapped into all
// address spaces, using shared page tables. This sharing occurs at
// the top-level page table (hopefully this will work for all
// architectures; it can be made configurable, at the cost of some
// complexity). kmap_start and kmap_end are indices into the top-level
// page table that define which region is shared. These are only
// relevant for process address spaces.
//
// An illustrative sketch of a PTE satisfying this interface follows
// this comment block.
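
// As an illustration only (a sketch, not part of this file's build;
// every name in it is hypothetical rather than taken from a real
// arch), a minimal PTE for an architecture with 4 KiB pages,
// 512-entry tables, and one entry format shared by directory and
// leaf levels might look roughly like this:
//
//   struct ExamplePTE {
//       typedef u64 PhysAddr;
//       typedef ulong VirtAddr;
//       typedef ExamplePTE DirPTE;   // directories use the same format
//
//       u64 raw;
//
//       static const ulong page_size = 4096;
//       static const int page_shift = 12;
//       static const int shift_per_level = 9;   // log2(512)
//       static const int num_levels = 4;
//       static const uint kmap_start = 256, kmap_end = 511;
//
//       ExamplePTE() : raw(0) {}
//       ExamplePTE(u64 init) : raw(init) {}
//       operator u64() { return raw; }
//
//       static uint addr_to_offset(VirtAddr addr, int shift) {
//           return (addr >> shift) & 511;
//       }
//
//       PhysAddr pte_to_addr() { return raw & ~(u64)4095; }
//       static ExamplePTE addr_to_pte(PhysAddr phys) {
//           return ExamplePTE(phys);
//       }
//
//       bool valid_pte() { return raw & 1; }
//       bool dirty_pte() { return raw & 2; }
//
//       // hypothetical phys<->kvirt conversion helpers
//       void *get_table() { return phys_to_kvirt(pte_to_addr()); }
//       static ExamplePTE set_table(void *table) {
//           return ExamplePTE(kvirt_to_phys(table) | 1);
//       }
//
//       void set_pte(ExamplePTE *table, uint offset) {
//           table[offset] = *this;
//       }
//       ExamplePTE xchg_pte(ExamplePTE *table, uint offset) {
//           // a real arch would use an atomic exchange here
//           ExamplePTE old = table[offset];
//           table[offset] = *this;
//           return old;
//       }
//   };
//
// flags_to_pte, set_flags, and pte_to_flags are omitted above; they
// are pure bit-shuffling between Mem::PTEFlags and the raw entry.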

using Util::round_up;

template<typename PTE>
void PageTableImpl<PTE>::end_map(RegionWithOffset region, PTE flags,
                                 void *table)
{
    uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
    uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

    Page *page = kvirt_to_page(table);

    assert(start < pages_per_table());
    assert(end < pages_per_table());
    assert(page->flags & Page::InUse);

    PTE *ptable = static_cast<PTE *>(table);

    for (uint i = start; i <= end; i++) {
        PTE newpte = PTE::addr_to_pte(region.offset) | flags;
        PTE oldpte = newpte.xchg_pte(ptable, i);

        retain_if_phys(region.offset);

        if (oldpte) {
            // vaddr is only for process aspaces, so don't worry
            // about it not fitting in a ulong.

            ulong vaddr = (ulong)region.start +
                          ((i - start) << PTE::page_shift);

            kill_pte(vaddr, oldpte.pte_to_addr(),
                     oldpte.dirty_pte(), oldpte.valid_pte());
        } else {
            // A previously-empty slot is now in use; take a
            // reference on this page table's page.
            page->retain();
        }

        region.offset += PTE::page_size;
    }
}

template<typename PTE>
void PageTableImpl<PTE>::rec_map(RegionWithOffset region, PTE flags,
                                 void *table, int shift)
{
    if (shift < lastlevel_shift) {
        assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
        end_map(region, flags, table);
        return;
    }

    Page *page = kvirt_to_page(table);

    DirPTE *dtable = static_cast<DirPTE *>(table);
    uint start = DirPTE::addr_to_offset(region.start, shift);
    uint end = DirPTE::addr_to_offset(region.end, shift);
    u64 orig_end = region.end;

    assert(start < pages_per_dtable());
    assert(end < pages_per_dtable());
    assert(page->flags & Page::InUse);

    // If the region spans more than one entry at this level, clamp the
    // first subregion at the next boundary.
    if (start != end)
        region.end = round_up(region.start + 1, shift) - 1;

    for (uint i = start; i <= end; i++) {
        void *subtable;

        if (!dtable[i].valid_pte()) {
            subtable = Mem::alloc_pages(1);
            bzero(subtable, PTE::page_size);
            DirPTE newpte = DirPTE::set_table(subtable);
            newpte.set_pte(dtable, i);
            page->retain();
        } else {
            subtable = dtable[i].get_table();
        }

        rec_map(region, flags, subtable, shift - DirPTE::shift_per_level);

        region.offset += region.end - region.start + 1;
        region.start = region.end + 1;

        if (i + 1 == end)
            region.end = orig_end;
        else
            region.end += 1UL << shift;
    }
}
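
// To illustrate the subregion arithmetic above with made-up numbers:
// suppose shift is 22 (each directory entry covers 4 MiB) and the
// caller asks for 0x003ff000-0x00801fff. Then start = 0, end = 2,
// and region.end is first clamped to 0x003fffff. Iteration 0 maps
// 0x003ff000-0x003fffff; iteration 1 maps 0x00400000-0x007fffff (one
// full entry, region.end having been advanced by 1 << 22); iteration
// 2 restores orig_end and maps 0x00800000-0x00801fff. region.offset
// advances by the size of each subregion, so every recursive call
// sees the physical offset of its own first page.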

template <typename PTE>
void PageTableImpl<PTE>::end_unmap(Region region, void *table)
{
    Page *page = kvirt_to_page(table);
    uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
    uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

    assert(start < pages_per_table());
    assert(end < pages_per_table());
    assert(page->flags & Page::InUse);

    PTE *ptable = static_cast<PTE *>(table);

    for (uint i = start; i <= end; i++) {
        PTE oldpte = PTE().xchg_pte(ptable, i);

        if (oldpte) {
            // vaddr is only for process aspaces, so don't worry
            // about it not fitting in a ulong.

            ulong vaddr = (ulong)region.start +
                          ((i - start) << PTE::page_shift);

            kill_pte(vaddr, oldpte.pte_to_addr(),
                     oldpte.dirty_pte(), oldpte.valid_pte());

            // The slot is now empty; drop the reference it held on
            // this page table's page.
            assert(page->inuse.refcount > 1);
            page->release();
        }
    }
}

template <typename PTE>
void PageTableImpl<PTE>::rec_unmap(Region region, void *table, int shift)
{
    if (shift < lastlevel_shift) {
        assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
        end_unmap(region, table);
        return;
    }

    Page *page = kvirt_to_page(table);
    uint start = DirPTE::addr_to_offset(region.start, shift);
    uint end = DirPTE::addr_to_offset(region.end, shift);
    u64 orig_end = region.end;

    assert(start < pages_per_dtable());
    assert(end < pages_per_dtable());
    assert(page->flags & Page::InUse);

    DirPTE *dtable = static_cast<DirPTE *>(table);

    if (start != end)
        region.end = round_up(region.start + 1, shift) - 1;

    for (uint i = start; i <= end; i++) {
        if (dtable[i].valid_pte()) {
            void *subtable = dtable[i].get_table();

            rec_unmap(region, subtable, shift - DirPTE::shift_per_level);

            Page *subpage = kvirt_to_page(subtable);
            assert(subpage->flags & Page::InUse);
            assert(subpage->inuse.refcount > 0);

            // If only the table's own reference remains, the subtable
            // is now empty; clear the directory entry and free it.
            if (subpage->inuse.refcount == 1) {
                DirPTE().set_pte(dtable, i);
                Mem::free_pages(subtable, 1);

                assert(page->inuse.refcount > 1);
                page->release();
            }
        }

        region.start = region.end + 1;

        if (i + 1 == end)
            region.end = orig_end;
        else
            region.end += 1UL << shift;
    }
}

template <typename PTE>
void PageTableImpl<PTE>::end_set_flags(Region region, PTE flags,
                                       PTE mask, void *table)
{
    uint start = PTE::addr_to_offset(region.start, PTE::page_shift);
    uint end = PTE::addr_to_offset(region.end, PTE::page_shift);

    assert(start < pages_per_table());
    assert(end < pages_per_table());

    PTE *ptable = static_cast<PTE *>(table);

    for (uint i = start; i <= end; i++) {
        PTE oldpte = ptable[i].set_flags(mask, flags);

        // vaddr is only for process aspaces, so don't worry
        // about it not fitting in a ulong.

        ulong vaddr = (ulong)region.start +
                      ((i - start) << PTE::page_shift);

        kill_pte(vaddr, oldpte.pte_to_addr(),
                 oldpte.dirty_pte(), oldpte.valid_pte(), true);
    }
}

template <typename PTE>
void PageTableImpl<PTE>::rec_set_flags(Region region, PTE flags,
                                       PTE mask, void *table,
                                       int shift)
{
    if (shift < lastlevel_shift) {
        assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);
        end_set_flags(region, flags, mask, table);
        return;
    }

    uint start = DirPTE::addr_to_offset(region.start, shift);
    uint end = DirPTE::addr_to_offset(region.end, shift);
    u64 orig_end = region.end;

    assert(start < pages_per_dtable());
    assert(end < pages_per_dtable());

    DirPTE *dtable = static_cast<DirPTE *>(table);

    if (start != end)
        region.end = round_up(region.start + 1, shift) - 1;

    for (uint i = start; i <= end; i++) {
        if (dtable[i].valid_pte()) {
            void *subtable = dtable[i].get_table();

            rec_set_flags(region, flags, mask, subtable,
                          shift - DirPTE::shift_per_level);
        }

        region.start = region.end + 1;

        if (i + 1 == end)
            region.end = orig_end;
        else
            region.end += 1UL << shift;
    }
}

template <typename PTE>
void PageTableImpl<PTE>::map(RegionWithOffset region, Flags flags)
{
    Lock::AutoLock autolock(lock);
    PTE pte, ptemask;
    PTE::flags_to_pte(flags, ~0UL, pte, ptemask);
    PTE *table = static_cast<PTE *>(toplevel);
    rec_map(region, pte, table, toplevel_shift);
}

template <typename PTE>
void PageTableImpl<PTE>::unmap(Region region)
{
    Lock::AutoLock autolock(lock);
    PTE *table = static_cast<PTE *>(toplevel);
    rec_unmap(region, table, toplevel_shift);
}

template <typename PTE>
void PageTableImpl<PTE>::set_flags(Region region, Flags flags, Flags mask)
{
    Lock::AutoLock autolock(lock);
    PTE pte, ptemask;
    PTE::flags_to_pte(flags, mask, pte, ptemask);
    PTE *table = static_cast<PTE *>(toplevel);
    rec_set_flags(region, pte, ptemask, table, toplevel_shift);
}
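
// A typical call sequence, as a sketch (the PTEFlags field names and
// the addresses here are assumptions for illustration, not taken from
// this codebase):
//
//   RegionWithOffset r;
//   r.start = 0x10000000;    // first virtual address
//   r.end = 0x10003fff;      // last virtual address (four 4 KiB pages)
//   r.offset = 0x49000000;   // physical address of the first page
//
//   Mem::PTEFlags f = 0;
//   f.Valid = 1;
//   f.Writeable = 1;
//
//   kernel_page_table->map(r, f);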

template <typename PTE>
void PageTableImpl<PTE>::get_mapping(u64 addr, u64 *phys, Flags *flags)
{
    Lock::AutoLock autolock(lock);
    int shift = toplevel_shift;

    void *table = toplevel;

    while (shift >= lastlevel_shift) {
        DirPTE *dtable = static_cast<DirPTE *>(table);
        DirPTE dpte = dtable[DirPTE::addr_to_offset(addr, shift)];

        if (!dpte.valid_pte()) {
            // No mapping at this level; report an invalid mapping.
            *flags = 0;
            return;
        }

        table = dpte.get_table();
        shift -= DirPTE::shift_per_level;
    }

    assert(shift + DirPTE::shift_per_level - PTE::shift_per_level == PTE::page_shift);

    PTE *ptable = static_cast<PTE *>(table);

    uint off = PTE::addr_to_offset(addr, PTE::page_shift);

    *phys = ptable[off].pte_to_addr();
    *flags = ptable[off].pte_to_flags();
}
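
// For example (a sketch; the Valid field is an assumption about
// Mem::PTEFlags):
//
//   u64 phys;
//   Mem::PTEFlags fl;
//
//   kernel_page_table->get_mapping(0xc0001000, &phys, &fl);
//
//   if (fl.Valid)
//       // vaddr 0xc0001000 is mapped at physical address "phys"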

template <typename PTE>
PageTableImpl<PTE>::PageTableImpl(bool process) : PageTable(process)
{
    toplevel = Mem::alloc_pages(1);
    PTE *table = static_cast<PTE *>(toplevel);

    if (process) {
        num_levels = PTE::num_levels;

        // Zero the process portions of the top-level table, and share
        // the kmap region with the kernel page table.
        if (PTE::kmap_start != 0)
            bzero(table, PTE::kmap_start * sizeof(PTE));

        if (PTE::kmap_end != pages_per_dtable() - 1)
            bzero(table + PTE::kmap_end + 1,
                  (pages_per_dtable() - PTE::kmap_end - 1) * sizeof(PTE));

        PTE *ktable = static_cast<PTE *>(kernel_page_table->toplevel);

        memcpy(table + PTE::kmap_start, ktable + PTE::kmap_start,
               (PTE::kmap_end - PTE::kmap_start + 1) * sizeof(PTE));
    } else {
        // FIXME: growable levels
        num_levels = PTE::num_levels;
        bzero(table, PTE::page_size);
    }

    toplevel_shift = lastlevel_shift = PTE::page_shift;

    if (num_levels > 1) {
        lastlevel_shift += PTE::shift_per_level;
        toplevel_shift += PTE::shift_per_level +
                          (num_levels - 2) * DirPTE::shift_per_level;
    }
}
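
// Worked example of the shift arithmetic above, for a hypothetical
// PAE-like format with page_shift = 12, shift_per_level = 9 (for both
// PTE and DirPTE), and num_levels = 3: lastlevel_shift becomes
// 12 + 9 = 21, so a last-level directory entry covers 2 MiB, and
// toplevel_shift becomes 12 + 9 + (3 - 2) * 9 = 30, so a top-level
// entry covers 1 GiB.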

template <typename PTE>
PageTableImpl<PTE>::PageTableImpl(void *table) : PageTable(true)
{
    assert(!kernel_page_table);

    toplevel = table;
    num_levels = PTE::num_levels;
    kernel_page_table = this;
}

template <typename PTE>
PageTableImpl<PTE>::~PageTableImpl()
{
    assert(this != kernel_page_table);

    if (is_process) {
        // Unmap the process (non-kmap) portions of the address space;
        // the kmap region's subtables are shared with the kernel page
        // table and must not be torn down here.
        Region region1 = { 0, (((VirtAddr)PTE::kmap_start) << toplevel_shift) - 1 };
        Region region2 = { ((VirtAddr)PTE::kmap_end + 1) << toplevel_shift, ~0UL };

        if (PTE::kmap_start != 0)
            unmap(region1);

        if (PTE::kmap_end != pages_per_dtable() - 1)
            unmap(region2);
    } else {
        Region region = { 0, ~0UL };
        unmap(region);
    }

    Page *page = kvirt_to_page(toplevel);
    assert(page->flags & Page::InUse);
    assert(page->inuse.refcount == 1);

    Mem::free_pages(toplevel, 1);
}

template class PageTableImpl<Arch::PTE>;
template class PageTableImpl<GenPTE>;