File-copy from v4.4.100
This is the result of 'cp' from a linux-stable tree with the 'v4.4.100'
tag checked out (commit 26d6298789e695c9f627ce49a7bbd2286405798a) on
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
Please refer to that tree for all history prior to this point.
Change-Id: I8a9ee2aea93cd29c52c847d0ce33091a73ae6afe
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 0000000..65c47fd
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,35 @@
+obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o
+
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_physaddr.o := $(nostackp)
+CFLAGS_setup_nx.o := $(nostackp)
+
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
+obj-$(CONFIG_X86_PAT) += pat_rbtree.o
+obj-$(CONFIG_SMP) += tlb.o
+
+obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
+
+obj-$(CONFIG_HIGHMEM) += highmem_32.o
+
+obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
+
+obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
+mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
+
+obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
+obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
+obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
+
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
new file mode 100644
index 0000000..2ca15b5
--- /dev/null
+++ b/arch/x86/mm/amdtopology.c
@@ -0,0 +1,196 @@
+/*
+ * AMD NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * This version reads it directly from the AMD northbridge.
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <linux/acpi.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/amd_nb.h>
+
+static unsigned char __initdata nodeids[8];
+
+static __init int find_northbridge(void)
+{
+ int num;
+
+ for (num = 0; num < 32; num++) {
+ u32 header;
+
+ header = read_pci_config(0, num, 0, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+ continue;
+
+ header = read_pci_config(0, num, 1, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+ continue;
+ return num;
+ }
+
+ return -ENOENT;
+}
+
+static __init void early_get_boot_cpu_id(void)
+{
+ /*
+ * need to get the APIC ID of the BSP so can use that to
+ * create apicid_to_node in amd_scan_nodes()
+ */
+#ifdef CONFIG_X86_MPPARSE
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+#endif
+}
+
+int __init amd_numa_init(void)
+{
+ u64 start = PFN_PHYS(0);
+ u64 end = PFN_PHYS(max_pfn);
+ unsigned numnodes;
+ u64 prevbase;
+ int i, j, nb;
+ u32 nodeid, reg;
+ unsigned int bits, cores, apicid_base;
+
+ if (!early_pci_allowed())
+ return -EINVAL;
+
+ nb = find_northbridge();
+ if (nb < 0)
+ return nb;
+
+ pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
+
+ reg = read_pci_config(0, nb, 0, 0x60);
+ numnodes = ((reg >> 4) & 0xF) + 1;
+ if (numnodes <= 1)
+ return -ENOENT;
+
+ pr_info("Number of physical nodes %d\n", numnodes);
+
+ prevbase = 0;
+ for (i = 0; i < 8; i++) {
+ u64 base, limit;
+
+ base = read_pci_config(0, nb, 1, 0x40 + i*8);
+ limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+ nodeids[i] = nodeid = limit & 7;
+ if ((base & 3) == 0) {
+ if (i < numnodes)
+ pr_info("Skipping disabled node %d\n", i);
+ continue;
+ }
+ if (nodeid >= numnodes) {
+ pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
+ base, limit);
+ continue;
+ }
+
+ if (!limit) {
+ pr_info("Skipping node entry %d (base %Lx)\n",
+ i, base);
+ continue;
+ }
+ if ((base >> 8) & 3 || (limit >> 8) & 3) {
+ pr_err("Node %d using interleaving mode %Lx/%Lx\n",
+ nodeid, (base >> 8) & 3, (limit >> 8) & 3);
+ return -EINVAL;
+ }
+ if (node_isset(nodeid, numa_nodes_parsed)) {
+ pr_info("Node %d already present, skipping\n",
+ nodeid);
+ continue;
+ }
+
+ limit >>= 16;
+ limit++;
+ limit <<= 24;
+
+ if (limit > end)
+ limit = end;
+ if (limit <= base)
+ continue;
+
+ base >>= 16;
+ base <<= 24;
+
+ if (base < start)
+ base = start;
+ if (limit > end)
+ limit = end;
+ if (limit == base) {
+ pr_err("Empty node %d\n", nodeid);
+ continue;
+ }
+ if (limit < base) {
+ pr_err("Node %d bogus settings %Lx-%Lx.\n",
+ nodeid, base, limit);
+ continue;
+ }
+
+ /* Could sort here, but pun for now. Should not happen anyroads. */
+ if (prevbase > base) {
+ pr_err("Node map not sorted %Lx,%Lx\n",
+ prevbase, base);
+ return -EINVAL;
+ }
+
+ pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
+ nodeid, base, limit);
+
+ prevbase = base;
+ numa_add_memblk(nodeid, base, limit);
+ node_set(nodeid, numa_nodes_parsed);
+ }
+
+ if (!nodes_weight(numa_nodes_parsed))
+ return -ENOENT;
+
+ /*
+ * We seem to have valid NUMA configuration. Map apicids to nodes
+ * using the coreid bits from early_identify_cpu.
+ */
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ apicid_base = 0;
+
+ /* get the APIC ID of the BSP early for systems with apicid lifting */
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0) {
+ pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+ apicid_base = boot_cpu_physical_apicid;
+ }
+
+ for_each_node_mask(i, numa_nodes_parsed)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ set_apicid_to_node((i << bits) + j, i);
+
+ return 0;
+}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
new file mode 100644
index 0000000..0f1c6fc
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables.c
@@ -0,0 +1,484 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+ int level;
+ pgprot_t current_prot;
+ unsigned long start_address;
+ unsigned long current_address;
+ const struct addr_marker *marker;
+ unsigned long lines;
+ bool to_dmesg;
+ bool check_wx;
+ unsigned long wx_pages;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+ unsigned long max_lines;
+};
+
+/* indices for address_markers; keep sync'd w/ address_markers below */
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
+#ifdef CONFIG_X86_64
+ KERNEL_SPACE_NR,
+ LOW_KERNEL_NR,
+ VMALLOC_START_NR,
+ VMEMMAP_START_NR,
+# ifdef CONFIG_X86_ESPFIX64
+ ESPFIX_START_NR,
+# endif
+ HIGH_KERNEL_NR,
+ MODULES_VADDR_NR,
+ MODULES_END_NR,
+#else
+ KERNEL_SPACE_NR,
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+# ifdef CONFIG_HIGHMEM
+ PKMAP_BASE_NR,
+# endif
+ FIXADDR_START_NR,
+#endif
+};
+
+/* Address space markers hints */
+static struct addr_marker address_markers[] = {
+ { 0, "User Space" },
+#ifdef CONFIG_X86_64
+ { 0x8000000000000000UL, "Kernel Space" },
+ { PAGE_OFFSET, "Low Kernel Mapping" },
+ { VMALLOC_START, "vmalloc() Area" },
+ { VMEMMAP_START, "Vmemmap" },
+# ifdef CONFIG_X86_ESPFIX64
+ { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+# endif
+# ifdef CONFIG_EFI
+ { EFI_VA_END, "EFI Runtime Services" },
+# endif
+ { __START_KERNEL_map, "High Kernel Mapping" },
+ { MODULES_VADDR, "Modules" },
+ { MODULES_END, "End Modules" },
+#else
+ { PAGE_OFFSET, "Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/*VMALLOC_END*/, "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+ { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
+# endif
+ { 0/*FIXADDR_START*/, "Fixmap Area" },
+#endif
+ { -1, NULL } /* End of list */
+};
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+{
+ pgprotval_t pr = pgprot_val(prot);
+ static const char * const level_name[] =
+ { "cr3", "pgd", "pud", "pmd", "pte" };
+
+ if (!pgprot_val(prot)) {
+ /* Not present */
+ pt_dump_cont_printf(m, dmsg, " ");
+ } else {
+ if (pr & _PAGE_USER)
+ pt_dump_cont_printf(m, dmsg, "USR ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_RW)
+ pt_dump_cont_printf(m, dmsg, "RW ");
+ else
+ pt_dump_cont_printf(m, dmsg, "ro ");
+ if (pr & _PAGE_PWT)
+ pt_dump_cont_printf(m, dmsg, "PWT ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_PCD)
+ pt_dump_cont_printf(m, dmsg, "PCD ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+
+ /* Bit 7 has a different meaning on level 3 vs 4 */
+ if (level <= 3 && pr & _PAGE_PSE)
+ pt_dump_cont_printf(m, dmsg, "PSE ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if ((level == 4 && pr & _PAGE_PAT) ||
+ ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+ pt_dump_cont_printf(m, dmsg, "PAT ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_GLOBAL)
+ pt_dump_cont_printf(m, dmsg, "GLB ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_NX)
+ pt_dump_cont_printf(m, dmsg, "NX ");
+ else
+ pt_dump_cont_printf(m, dmsg, "x ");
+ }
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+#ifdef CONFIG_X86_64
+ return (signed long)(u << 16) >> 16;
+#else
+ return u;
+#endif
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+ pgprot_t new_prot, int level)
+{
+ pgprotval_t prot, cur;
+ static const char units[] = "BKMGTPE";
+
+ /*
+ * If we have a "break" in the series, we need to flush the state that
+ * we have now. "break" is either changing perms, levels or
+ * address space marker.
+ */
+ prot = pgprot_val(new_prot);
+ cur = pgprot_val(st->current_prot);
+
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->level = level;
+ st->marker = address_markers;
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
+ } else if (prot != cur || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ const char *unit = units;
+ unsigned long delta;
+ int width = sizeof(unsigned long) * 2;
+ pgprotval_t pr = pgprot_val(st->current_prot);
+
+ if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
+ WARN_ONCE(1,
+ "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
+ (void *)st->start_address,
+ (void *)st->start_address);
+ st->wx_pages += (st->current_address -
+ st->start_address) / PAGE_SIZE;
+ }
+
+ /*
+ * Now print the actual finished series
+ */
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
+ }
+ st->lines++;
+
+ /*
+ * We print markers for special areas of address space,
+ * such as the start of vmalloc space etc.
+ * This helps in the interpretation.
+ */
+ if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
+ st->marker++;
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
+ }
+
+ st->start_address = st->current_address;
+ st->current_prot = new_prot;
+ st->level = level;
+ }
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+ unsigned long P)
+{
+ int i;
+ pte_t *start;
+ pgprotval_t prot;
+
+ start = (pte_t *) pmd_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ prot = pte_flags(*start);
+ st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+ note_page(m, st, __pgprot(prot), 4);
+ start++;
+ }
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+ unsigned long P)
+{
+ int i;
+ pmd_t *start;
+ pgprotval_t prot;
+
+ start = (pmd_t *) pud_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+ if (!pmd_none(*start)) {
+ if (pmd_large(*start) || !pmd_present(*start)) {
+ prot = pmd_flags(*start);
+ note_page(m, st, __pgprot(prot), 3);
+ } else {
+ walk_pte_level(m, st, *start,
+ P + i * PMD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 3);
+ start++;
+ }
+}
+
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ unsigned long P)
+{
+ int i;
+ pud_t *start;
+ pgprotval_t prot;
+
+ start = (pud_t *) pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+ if (!pud_none(*start)) {
+ if (pud_large(*start) || !pud_present(*start)) {
+ prot = pud_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pmd_level(m, st, *start,
+ P + i * PUD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#endif
+
+#ifdef CONFIG_X86_64
+static inline bool is_hypervisor_range(int idx)
+{
+ /*
+ * ffff800000000000 - ffff87ffffffffff is reserved for
+ * the hypervisor.
+ */
+ return paravirt_enabled() &&
+ (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+ (idx < pgd_index(__PAGE_OFFSET));
+}
+#else
+static inline bool is_hypervisor_range(int idx) { return false; }
+#endif
+
+static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+ bool checkwx)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+ pgd_t *start = swapper_pg_dir;
+#endif
+ pgprotval_t prot;
+ int i;
+ struct pg_state st = {};
+
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = true;
+ }
+
+ st.check_wx = checkwx;
+ if (checkwx)
+ st.wx_pages = 0;
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+ if (!pgd_none(*start) && !is_hypervisor_range(i)) {
+ if (pgd_large(*start) || !pgd_present(*start)) {
+ prot = pgd_flags(*start);
+ note_page(m, &st, __pgprot(prot), 1);
+ } else {
+ walk_pud_level(m, &st, *start,
+ i * PGD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, &st, __pgprot(0), 1);
+
+ start++;
+ }
+
+ /* Flush out the last page */
+ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+ note_page(m, &st, __pgprot(0), 0);
+ if (!checkwx)
+ return;
+ if (st.wx_pages)
+ pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
+ st.wx_pages);
+ else
+ pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+}
+
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+{
+ ptdump_walk_pgd_level_core(m, pgd, false);
+}
+
+void ptdump_walk_pgd_level_checkwx(void)
+{
+ ptdump_walk_pgd_level_core(NULL, NULL, true);
+}
+
+#ifdef CONFIG_X86_PTDUMP
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ ptdump_walk_pgd_level(m, NULL);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static int pt_dump_init(void)
+{
+#ifdef CONFIG_X86_PTDUMP
+ struct dentry *pe;
+#endif
+
+#ifdef CONFIG_X86_32
+ /* Not a compile-time constant on x86-32 */
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+ address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
+# endif
+ address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+#endif
+
+#ifdef CONFIG_X86_PTDUMP
+ pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
+ &ptdump_fops);
+ if (!pe)
+ return -ENOMEM;
+#endif
+
+ return 0;
+}
+
+__initcall(pt_dump_init);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 0000000..903ec1e
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,169 @@
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+
+static inline unsigned long
+ex_insn_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->insn + x->insn;
+}
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->fixup + x->fixup;
+}
+
+int fixup_exception(struct pt_regs *regs)
+{
+ const struct exception_table_entry *fixup;
+ unsigned long new_ip;
+
+#ifdef CONFIG_PNPBIOS
+ if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+ extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+ extern u32 pnp_bios_is_utter_crap;
+ pnp_bios_is_utter_crap = 1;
+ printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+ __asm__ volatile(
+ "movl %0, %%esp\n\t"
+ "jmp *%1\n\t"
+ : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+ panic("do_trap: can't hit this");
+ }
+#endif
+
+ fixup = search_exception_tables(regs->ip);
+ if (fixup) {
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* Special hack for uaccess_err */
+ current_thread_info()->uaccess_err = 1;
+ new_ip -= 0x7ffffff0;
+ }
+ regs->ip = new_ip;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Restricted version used during very early boot */
+int __init early_fixup_exception(unsigned long *ip)
+{
+ const struct exception_table_entry *fixup;
+ unsigned long new_ip;
+
+ fixup = search_exception_tables(*ip);
+ if (fixup) {
+ new_ip = ex_fixup_addr(fixup);
+
+ if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+ /* uaccess handling not supported during early boot */
+ return 0;
+ }
+
+ *ip = new_ip;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Search one exception table for an entry corresponding to the
+ * given instruction address, and return the address of the entry,
+ * or NULL if none is found.
+ * We use a binary search, and thus we assume that the table is
+ * already sorted.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long value)
+{
+ while (first <= last) {
+ const struct exception_table_entry *mid;
+ unsigned long addr;
+
+ mid = ((last - first) >> 1) + first;
+ addr = ex_insn_addr(mid);
+ if (addr < value)
+ first = mid + 1;
+ else if (addr > value)
+ last = mid - 1;
+ else
+ return mid;
+ }
+ return NULL;
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+ const struct exception_table_entry *x = a, *y = b;
+
+ /*
+ * This value will always end up fittin in an int, because on
+ * both i386 and x86-64 the kernel symbol-reachable address
+ * space is < 2 GiB.
+ *
+ * This compare is only valid after normalization.
+ */
+ return x->insn - y->insn;
+}
+
+void sort_extable(struct exception_table_entry *start,
+ struct exception_table_entry *finish)
+{
+ struct exception_table_entry *p;
+ int i;
+
+ /* Convert all entries to being relative to the start of the section */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn += i;
+ i += 4;
+ p->fixup += i;
+ i += 4;
+ }
+
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, NULL);
+
+ /* Denormalize all entries */
+ i = 0;
+ for (p = start; p < finish; p++) {
+ p->insn -= i;
+ i += 4;
+ p->fixup -= i;
+ i += 4;
+ }
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 0000000..e830c71
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,1342 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
+ */
+#include <linux/sched.h> /* test_thread_flag(), ... */
+#include <linux/kdebug.h> /* oops_begin/end, ... */
+#include <linux/module.h> /* search_exception_table */
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
+#include <linux/mmiotrace.h> /* kmmio_handler, ... */
+#include <linux/perf_event.h> /* perf_sw_event */
+#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <linux/prefetch.h> /* prefetchw */
+#include <linux/context_tracking.h> /* exception_enter(), ... */
+#include <linux/uaccess.h> /* faulthandler_disabled() */
+
+#include <asm/traps.h> /* dotraplinkage, ... */
+#include <asm/pgalloc.h> /* pgd_*(), ... */
+#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
+#include <asm/vm86.h> /* struct vm86 */
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+
+ PF_PROT = 1 << 0,
+ PF_WRITE = 1 << 1,
+ PF_USER = 1 << 2,
+ PF_RSVD = 1 << 3,
+ PF_INSTR = 1 << 4,
+};
+
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
+static nokprobe_inline int
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
+ return -1;
+ return 0;
+}
+
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
+{
+ int ret = 0;
+
+ /* kprobe_running() needs smp_processor_id() */
+ if (kprobes_built_in() && !user_mode(regs)) {
+ preempt_disable();
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ ret = 1;
+ preempt_enable();
+ }
+
+ return ret;
+}
+
+/*
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
+ *
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
+ */
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+ unsigned char opcode, int *prefetch)
+{
+ unsigned char instr_hi = opcode & 0xf0;
+ unsigned char instr_lo = opcode & 0x0f;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ return (!user_mode(regs) || user_64bit_mode(regs));
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ return (instr_lo & 0xC) == 0x4;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ return !instr_lo || (instr_lo>>1) == 1;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ if (probe_kernel_address(instr, opcode))
+ return 0;
+
+ *prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+{
+ unsigned char *max_instr;
+ unsigned char *instr;
+ int prefetch = 0;
+
+ /*
+ * If it was a exec (instruction fetch) fault on NX page, then
+ * do not ignore the fault:
+ */
+ if (error_code & PF_INSTR)
+ return 0;
+
+ instr = (void *)convert_ip_to_linear(current, regs);
+ max_instr = instr + 15;
+
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
+ return 0;
+
+ while (instr < max_instr) {
+ unsigned char opcode;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+
+ instr++;
+
+ if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
+ break;
+ }
+ return prefetch;
+}
+
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+ struct task_struct *tsk, int fault)
+{
+ unsigned lsb = 0;
+ siginfo_t info;
+
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+ info.si_addr_lsb = lsb;
+
+ force_sig_info(si_signo, &info, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_pud.
+ */
+ pud = pud_offset(pgd, address);
+ pud_k = pud_offset(pgd_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+ if (!pmd_present(*pmd_k))
+ return NULL;
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, *pmd_k);
+ else
+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+
+ return pmd_k;
+}
+
+void vmalloc_sync_all(void)
+{
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE && address < FIXADDR_TOP;
+ address += PMD_SIZE) {
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ spinlock_t *pgt_lock;
+ pmd_t *ret;
+
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ ret = vmalloc_sync_one(page_address(page), address);
+ spin_unlock(pgt_lock);
+
+ if (!ret)
+ break;
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
+/*
+ * 32-bit:
+ *
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ WARN_ON_ONCE(in_nmi());
+
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+
+ if (pmd_huge(*pmd_k))
+ return 0;
+
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+
+ return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+#ifdef CONFIG_VM86
+ unsigned long bit;
+
+ if (!v8086_mode(regs) || !tsk->thread.vm86)
+ return;
+
+ bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
+}
+
+static bool low_pfn(unsigned long pfn)
+{
+ return pfn < max_low_pfn;
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ pgd_t *base = __va(read_cr3());
+ pgd_t *pgd = &base[pgd_index(address)];
+ pmd_t *pmd;
+ pte_t *pte;
+
+#ifdef CONFIG_X86_PAE
+ printk("*pdpt = %016Lx ", pgd_val(*pgd));
+ if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
+ goto out;
+#endif
+ pmd = pmd_offset(pud_offset(pgd, address), address);
+ printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already:
+ */
+ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
+
+ pte = pte_offset_kernel(pmd, address);
+ printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+out:
+ printk("\n");
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_all(void)
+{
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
+}
+
+/*
+ * 64-bit:
+ *
+ * Handle a fault on the vmalloc area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ pgd_t *pgd, *pgd_ref;
+ pud_t *pud, *pud_ref;
+ pmd_t *pmd, *pmd_ref;
+ pte_t *pte, *pte_ref;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ WARN_ON_ONCE(in_nmi());
+
+ /*
+ * Copy kernel mappings over when needed. This can also
+ * happen within a race in page table update. In the later
+ * case just flush:
+ */
+ pgd = pgd_offset(current->active_mm, address);
+ pgd_ref = pgd_offset_k(address);
+ if (pgd_none(*pgd_ref))
+ return -1;
+
+ if (pgd_none(*pgd)) {
+ set_pgd(pgd, *pgd_ref);
+ arch_flush_lazy_mmu_mode();
+ } else {
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
+
+ /*
+ * Below here mismatches are bugs because these lower tables
+ * are shared:
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_ref = pud_offset(pgd_ref, address);
+ if (pud_none(*pud_ref))
+ return -1;
+
+ if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
+ BUG();
+
+ if (pud_huge(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ pmd_ref = pmd_offset(pud_ref, address);
+ if (pmd_none(*pmd_ref))
+ return -1;
+
+ if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
+ BUG();
+
+ if (pmd_huge(*pmd))
+ return 0;
+
+ pte_ref = pte_offset_kernel(pmd_ref, address);
+ if (!pte_present(*pte_ref))
+ return -1;
+
+ pte = pte_offset_kernel(pmd, address);
+
+ /*
+ * Don't use pte_page here, because the mappings can point
+ * outside mem_map, and the NUMA hash lookup cannot handle
+ * that:
+ */
+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+ BUG();
+
+ return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+#ifdef CONFIG_CPU_SUP_AMD
+static const char errata93_warning[] =
+KERN_ERR
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd_t *pgd = base + pgd_index(address);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (bad_address(pgd))
+ goto bad;
+
+ printk("PGD %lx ", pgd_val(*pgd));
+
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (bad_address(pud))
+ goto bad;
+
+ printk("PUD %lx ", pud_val(*pud));
+ if (!pud_present(*pud) || pud_large(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (bad_address(pmd))
+ goto bad;
+
+ printk("PMD %lx ", pmd_val(*pmd));
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (bad_address(pte))
+ goto bad;
+
+ printk("PTE %lx", pte_val(*pte));
+out:
+ printk("\n");
+ return;
+bad:
+ printk("BAD\n");
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ *
+ * A lot of BIOS that didn't get tested properly miss this.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * Try to work around it here.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+ || boot_cpu_data.x86 != 0xf)
+ return 0;
+
+ if (address != regs->ip)
+ return 0;
+
+ if ((address >> 32) != 0)
+ return 0;
+
+ address |= 0xffffffffUL << 32;
+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ (address >= MODULES_VADDR && address <= MODULES_END)) {
+ printk_once(errata93_warning);
+ regs->ip = address;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return. Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
+ return 1;
+#endif
+ return 0;
+}
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ unsigned long nr;
+
+ /*
+ * Pentium F0 0F C7 C8 bug workaround:
+ */
+ if (boot_cpu_has_bug(X86_BUG_F00F)) {
+ nr = (address - idt_descr.address) >> 3;
+
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+static const char nx_warning[] = KERN_CRIT
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ if (!oops_may_print())
+ return;
+
+ if (error_code & PF_INSTR) {
+ unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+
+ pte = lookup_address_in_pgd(pgd, address, &level);
+
+ if (pte && pte_present(*pte) && !pte_exec(*pte))
+ printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (__read_cr4() & X86_CR4_SMEP))
+ printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
+ }
+
+ printk(KERN_ALERT "BUG: unable to handle kernel ");
+ if (address < PAGE_SIZE)
+ printk(KERN_CONT "NULL pointer dereference");
+ else
+ printk(KERN_CONT "paging request");
+
+ printk(KERN_CONT " at %p\n", (void *) address);
+ printk(KERN_ALERT "IP:");
+ printk_address(regs->ip);
+
+ dump_pagetable(address);
+}
+
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct task_struct *tsk;
+ unsigned long flags;
+ int sig;
+
+ flags = oops_begin();
+ tsk = current;
+ sig = SIGKILL;
+
+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+ tsk->comm, address);
+ dump_pagetable(address);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code;
+
+ if (__die("Bad pagetable", regs, error_code))
+ sig = 0;
+
+ oops_end(flags, regs, sig);
+}
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+ int sig;
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current_thread_info()->sig_on_uaccess_error && signal) {
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.cr2 = address;
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address, tsk, 0);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * 32-bit:
+ *
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * 64-bit:
+ *
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice:
+ */
+ flags = oops_begin();
+
+ show_fault_oops(regs, error_code, address);
+
+ if (task_stack_end_corrupted(tsk))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code;
+
+ sig = SIGKILL;
+ if (__die("Oops", regs, error_code))
+ sig = 0;
+
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_DEFAULT "CR2: %016lx\n", address);
+
+ oops_end(flags, regs, sig);
+}
+
+/*
+ * Print out info about fatal segfaults, if the show_unhandled_signals
+ * sysctl is set:
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ if (!unhandled_signal(tsk, SIGSEGV))
+ return;
+
+ if (!printk_ratelimit())
+ return;
+
+ printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+
+ printk(KERN_CONT "\n");
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation.
+ */
+ if (unlikely((error_code & PF_INSTR) &&
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+ /* Kernel addresses are always protection faults: */
+ if (address >= TASK_SIZE)
+ error_code |= PF_PROT;
+
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address, SIGSEGV, si_code);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int si_code)
+{
+ struct mm_struct *mm = current->mm;
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+ unsigned int fault)
+{
+ struct task_struct *tsk = current;
+ int code = BUS_ADRERR;
+
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & PF_USER)) {
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ return;
+ }
+
+ /* User-space => ok to do another page fault: */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+#ifdef CONFIG_MEMORY_FAILURE
+ if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ printk(KERN_ERR
+ "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ tsk->comm, tsk->pid, address);
+ code = BUS_MCEERR_AR;
+ }
+#endif
+ force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+}
+
+static noinline void
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, unsigned int fault)
+{
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ no_context(regs, error_code, address, 0, 0);
+ return;
+ }
+
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & PF_USER)) {
+ no_context(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
+ else
+ BUG();
+ }
+}
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+ if ((error_code & PF_WRITE) && !pte_write(*pte))
+ return 0;
+
+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry. Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static noinline int
+spurious_fault(unsigned long error_code, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int ret;
+
+ /*
+ * Only writes to RO or instruction fetches from NX may cause
+ * spurious faults.
+ *
+ * These could be from user or supervisor accesses but the TLB
+ * is only lazily flushed after a kernel mapping protection
+ * change, so user accesses are not expected to cause spurious
+ * faults.
+ */
+ if (error_code != (PF_WRITE | PF_PROT)
+ && error_code != (PF_INSTR | PF_PROT))
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_large(*pud))
+ return spurious_fault_check(error_code, (pte_t *) pud);
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return spurious_fault_check(error_code, (pte_t *) pmd);
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+
+ ret = spurious_fault_check(error_code, pte);
+ if (!ret)
+ return 0;
+
+ /*
+ * Make sure we have permissions in PMD.
+ * If not, then there's a bug in the page tables:
+ */
+ ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+ return ret;
+}
+NOKPROBE_SYMBOL(spurious_fault);
+
+int show_unhandled_signals = 1;
+
+static inline int
+access_error(unsigned long error_code, struct vm_area_struct *vma)
+{
+ if (error_code & PF_WRITE) {
+ /* write, present and write, not present: */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
+
+ /* read, present: */
+ if (unlikely(error_code & PF_PROT))
+ return 1;
+
+ /* read, not present: */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return 1;
+
+ return 0;
+}
+
+static int fault_in_kernel_space(unsigned long address)
+{
+ return address >= TASK_SIZE_MAX;
+}
+
+static inline bool smap_violation(int error_code, struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ if (error_code & PF_USER)
+ return false;
+
+ if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
+ return false;
+
+ return true;
+}
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ int fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ tsk = current;
+ mm = tsk->mm;
+
+ /*
+ * Detect and handle instructions that would cause a page fault for
+ * both a tracked kernel page and a userspace page.
+ */
+ if (kmemcheck_active(regs))
+ kmemcheck_hide(regs);
+ prefetchw(&mm->mmap_sem);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /*
+ * We fault-in kernel-space virtual memory on-demand. The
+ * 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
+ *
+ * This verifies that the fault happens in kernel space
+ * (error_code & 4) == 0, and that the fault was not a
+ * protection error (error_code & 9) == 0.
+ */
+ if (unlikely(fault_in_kernel_space(address))) {
+ if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+
+ if (kmemcheck_fault(regs, address, error_code))
+ return;
+ }
+
+ /* Can handle a stale RO->RW TLB: */
+ if (spurious_fault(error_code, address))
+ return;
+
+ /* kprobes don't want to hook the spurious faults: */
+ if (kprobes_fault(regs))
+ return;
+ /*
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock:
+ */
+ bad_area_nosemaphore(regs, error_code, address);
+
+ return;
+ }
+
+ /* kprobes don't want to hook the spurious faults: */
+ if (unlikely(kprobes_fault(regs)))
+ return;
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(regs, error_code, address);
+
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in a region with pagefaults disabled then we must not take the fault
+ */
+ if (unlikely(faulthandler_disabled() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet:
+ */
+ if (user_mode(regs)) {
+ local_irq_enable();
+ error_code |= PF_USER;
+ flags |= FAULT_FLAG_USER;
+ } else {
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ if (error_code & PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+
+ /*
+ * When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in
+ * the kernel and should generate an OOPS. Unfortunately, in the
+ * case of an erroneous fault occurring in a code path which already
+ * holds mmap_sem we will deadlock attempting to validate the fault
+ * against the address space. Luckily the kernel only validly
+ * references user space from well defined areas of code, which are
+ * listed in the exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibility of a
+ * deadlock. Attempt to lock the address space, if we cannot we then
+ * validate the source. If this is invalid we can skip the address
+ * space check, thus avoiding the deadlock:
+ */
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if ((error_code & PF_USER) == 0 &&
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
+ }
+retry:
+ down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
+ }
+
+ vma = find_vma(mm, address);
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
+ goto good_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (error_code & PF_USER) {
+ /*
+ * Accessing the stack below %sp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535, $31" pushes
+ * 32 pointers and then decrements %sp by 65535.)
+ */
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ }
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ */
+ fault = handle_mm_fault(mm, vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
+
+ /*
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
+ */
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ return;
+ }
+
+ up_read(&mm->mmap_sem);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, fault);
+ return;
+ }
+
+ /*
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
+ */
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ }
+
+ check_v8086_mode(regs, address, tsk);
+}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+dotraplinkage void notrace
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = read_cr2(); /* Get the faulting address */
+ enum ctx_state prev_state;
+
+ /*
+ * We must have this function tagged with __kprobes, notrace and call
+ * read_cr2() before calling anything else. To avoid calling any kind
+ * of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contain all sorts of tracepoints.
+ */
+
+ prev_state = exception_enter();
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
+
+#ifdef CONFIG_TRACING
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(address, regs, error_code);
+ else
+ trace_page_fault_kernel(address, regs, error_code);
+}
+
+dotraplinkage void notrace
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ /*
+ * The exception_enter and tracepoint processing could
+ * trigger another page faults (user space callchain
+ * reading) and destroy the original cr2 value, so read
+ * the faulting address now.
+ */
+ unsigned long address = read_cr2();
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ trace_page_fault_entries(address, regs, error_code);
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(trace_do_page_fault);
+#endif /* CONFIG_TRACING */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 0000000..ae9a37b
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,403 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+ return READ_ONCE(*ptep);
+#else
+ /*
+ * With get_user_pages_fast, we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a pte will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees l iff pte_high
+ * sees h. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have got rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+ * we're safe.
+ *
+ * gup_get_pte should not be used or copied outside gup.c without being
+ * very careful -- it does not atomically load the pte or anything that
+ * is likely to be useful for you.
+ */
+ pte_t pte;
+
+retry:
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ if (unlikely(pte.pte_low != ptep->pte_low))
+ goto retry;
+
+ return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ pte_t *ptep;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+
+ ptep = pte_offset_map(&pmd, addr);
+ do {
+ pte_t pte = gup_get_pte(ptep);
+ struct page *page;
+
+ /* Similar to the PMD case, NUMA hinting must take slow path */
+ if (pte_protnone(pte)) {
+ pte_unmap(ptep);
+ return 0;
+ }
+
+ if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ pte_unmap(ptep);
+ return 0;
+ }
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ get_page(page);
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(ptep - 1);
+
+ return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+ VM_BUG_ON_PAGE(page != compound_head(page), page);
+ VM_BUG_ON_PAGE(page_count(page) == 0, page);
+ atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pmd_flags(pmd) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+
+ refs = 0;
+ head = pmd_page(pmd);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return 0;
+ if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_protnone(pmd))
+ return 0;
+ if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ struct page *head, *page;
+ int refs;
+
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pud_flags(pud) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
+ VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
+
+ refs = 0;
+ head = pud_page(pud);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+ get_head_page_multiple(head, refs);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+ return 0;
+ } else {
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ }
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ unsigned long flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ break;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+
+ end = start + len;
+ if (end < start)
+ goto slow_irqon;
+
+#ifdef CONFIG_X86_64
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ goto slow_irqon;
+#endif
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_disable();
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT,
+ write, 0, pages);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 0000000..a6d7392
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,133 @@
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
+
+void *kmap(struct page *page)
+{
+ might_sleep();
+ if (!PageHighMem(page))
+ return page_address(page);
+ return kmap_high(page);
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+ if (in_interrupt())
+ BUG();
+ if (!PageHighMem(page))
+ return;
+ kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+ unsigned long vaddr;
+ int idx, type;
+
+ preempt_disable();
+ pagefault_disable();
+
+ if (!PageHighMem(page))
+ return page_address(page);
+
+ type = kmap_atomic_idx_push();
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *kmap_atomic(struct page *page)
+{
+ return kmap_atomic_prot(page, kmap_prot);
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn)
+{
+ return kmap_atomic_prot_pfn(pfn, kmap_prot);
+}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
+
+void __kunmap_atomic(void *kvaddr)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+ kmap_atomic_idx_pop();
+ arch_flush_lazy_mmu_mode();
+ }
+#ifdef CONFIG_DEBUG_HIGHMEM
+ else {
+ BUG_ON(vaddr < PAGE_OFFSET);
+ BUG_ON(vaddr >= (unsigned long)high_memory);
+ }
+#endif
+
+ pagefault_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
+
+void __init set_highmem_pages_init(void)
+{
+ struct zone *zone;
+ int nid;
+
+ /*
+ * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+ * is invoked before free_all_bootmem()
+ */
+ reset_all_zones_managed_pages();
+ for_each_zone(zone) {
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ zone_start_pfn = zone->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+ nid = zone_to_nid(zone);
+ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+ add_highpages_with_active_regions(nid, zone_start_pfn,
+ zone_end_pfn);
+ }
+}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 0000000..39bdaf3
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,186 @@
+/*
+ * IA-32 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+#if 0 /* This is just for testing */
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+ unsigned long start = address;
+ int length = 1;
+ int nr;
+ struct page *page;
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, addr);
+ if (!vma || !is_vm_hugetlb_page(vma))
+ return ERR_PTR(-EINVAL);
+
+ pte = huge_pte_offset(mm, address);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageHead(page));
+
+ return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
+
+#else
+
+/*
+ * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
+int pmd_huge(pmd_t pmd)
+{
+ return !pmd_none(pmd) &&
+ (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
+}
+
+int pud_huge(pud_t pud)
+{
+ return !!(pud_val(pud) & _PAGE_PSE);
+}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_legacy_base;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+ unsigned long addr0, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+ unsigned long addr;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = current->mm->mmap_base;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED) {
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ return addr;
+ }
+
+ if (addr) {
+ addr = ALIGN(addr, huge_page_size(h));
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ return addr;
+ }
+ if (mm->get_unmapped_area == arch_get_unmapped_area)
+ return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
+ else
+ return hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long ps = memparse(opt, &opt);
+ if (ps == PMD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE && cpu_has_gbpages) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+ ps >> 20);
+ return 0;
+ }
+ return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_CMA
+static __init int gigantic_pages_init(void)
+{
+ /* With CMA we can allocate gigantic pages at runtime */
+ if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
+#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 0000000..3aebbd6
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,771 @@
+#include <linux/gfp.h>
+#include <linux/initrd.h>
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
+
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
+#include "mm_internal.h"
+
+/*
+ * Tables translating between page_cache_type_t and pte encoding.
+ *
+ * The default values are defined statically as minimal supported mode;
+ * WC and WT fall back to UC-. pat_init() updates these values to support
+ * more cache modes, WC and WT, when it is safe to do so. See pat_init()
+ * for the details. Note, __early_ioremap() used during early boot-time
+ * takes pgprot_t (pte encoding) and does not use these tables.
+ *
+ * Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ */
+uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+ [_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
+ [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
+};
+EXPORT_SYMBOL(__cachemode2pte_tbl);
+
+uint8_t __pte2cachemode_tbl[8] = {
+ [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
+ [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
+};
+EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
+
+static unsigned long min_pfn_mapped;
+
+static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
+__ref void *alloc_low_pages(unsigned int num)
+{
+ unsigned long pfn;
+ int i;
+
+ if (after_bootmem) {
+ unsigned int order;
+
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+ __GFP_ZERO, order);
+ }
+
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret;
+ if (min_pfn_mapped >= max_pfn_mapped)
+ panic("alloc_low_pages: ran out of memory");
+ ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ if (!ret)
+ panic("alloc_low_pages: can not alloc memory");
+ memblock_reserve(ret, PAGE_SIZE * num);
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+ pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+ }
+
+ for (i = 0; i < num; i++) {
+ void *adr;
+
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
+ }
+
+ return __va(pfn << PAGE_SHIFT);
+}
+
+/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
+
+ base = __pa(extend_brk(tables, PAGE_SIZE));
+
+ pgt_buf_start = base >> PAGE_SHIFT;
+ pgt_buf_end = pgt_buf_start;
+ pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
+{
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
+ /*
+ * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (cpu_has_pse)
+ page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+ /* Enable PSE if available */
+ if (cpu_has_pse)
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ if (cpu_has_pge) {
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ } else
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+
+ /* Enable 1 GB linear kernel mappings if available: */
+ if (direct_gbpages && cpu_has_gbpages) {
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ } else {
+ direct_gbpages = 0;
+ }
+}
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+/*
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
+ */
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
+{
+ int i;
+
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
+
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
+#endif
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
+ }
+}
+
+static const char *page_size_string(struct map_range *mr)
+{
+ static const char str_1g[] = "1G";
+ static const char str_2m[] = "2M";
+ static const char str_4m[] = "4M";
+ static const char str_4k[] = "4k";
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_1G))
+ return str_1g;
+ /*
+ * 32-bit without PAE has a 4M large page size.
+ * PG_LEVEL_2M is misnamed, but we can at least
+ * print out the right size in the string.
+ */
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ !IS_ENABLED(CONFIG_X86_PAE) &&
+ mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_4m;
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_2m;
+
+ return str_4k;
+}
+
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
+
+ limit_pfn = PFN_DOWN(end);
+
+ /* head if not big page alignment ? */
+ pfn = start_pfn = PFN_DOWN(start);
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
+ else
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#endif
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pfn = end_pfn;
+ }
+
+ /* big page (2M) range */
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#ifdef CONFIG_X86_32
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pfn = end_pfn;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pfn = end_pfn;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pfn = end_pfn;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ pr_debug(" [mem %#010lx-%#010lx] page %s\n",
+ mr[i].start, mr[i].end - 1,
+ page_size_string(&mr[i]));
+
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
+
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
+ return ret >> PAGE_SHIFT;
+}
+
+/*
+ * We need to iterate through the E820 memory map and create direct mappings
+ * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * create direct mappings for all pfns from [0 to max_low_pfn) and
+ * [4GB to max_pfn) because of possible memory holes in high addresses
+ * that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result
+ * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ *
+ * init_mem_mapping() calls init_range_memory_mapping() with big range.
+ * That range would have hole in the middle or ends, and only ram parts
+ * will be mapped in init_range_memory_mapping().
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
+
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
+
+ return mapped_ram_size;
+}
+
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+ /*
+ * Initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * In worse case, when we cross the 1G boundary, and
+ * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+ * to map 1G range with PTE. Hence we use one less than the
+ * difference of page table level shifts.
+ *
+ * Don't need to worry about overflow in the top-down case, on 32bit,
+ * when step_size is 0, round_down() returns 0 for start, and that
+ * turns it into 0x100000000ULL.
+ * In the bottom-up case, round_up(x, 0) returns 0 though too, which
+ * needs to be taken into consideration by the code below.
+ */
+ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+
+ /* xen has big range in reserved near end of ram, skip it at first.*/
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+
+ /*
+ * We start from the top (end of memory) and go to the bottom.
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (last_start > map_start) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < map_start)
+ start = map_start;
+ } else
+ start = map_start;
+ mapped_ram_size += init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ if (mapped_ram_size >= step_size)
+ step_size = get_new_step_size(step_size);
+ }
+
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, start;
+ unsigned long mapped_ram_size = 0;
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (start < map_end) {
+ if (step_size && map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else {
+ next = map_end;
+ }
+
+ mapped_ram_size += init_range_memory_mapping(start, next);
+ start = next;
+
+ if (mapped_ram_size >= step_size)
+ step_size = get_new_step_size(step_size);
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+ * we need two separate calls here. This is because we want to
+ * allocate page tables above the kernel. So we first map
+ * [kernel_end, end) to make memory above the kernel be mapped
+ * as soon as possible. And then use page tables allocated above
+ * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ * On x86, access has to be given to the first megabyte of RAM because that
+ * area traditionally contains BIOS code and data regions used by X, dosemu,
+ * and similar apps. Since they map the entire memory range, the whole range
+ * must be allowed (for mapping), but any areas that would otherwise be
+ * disallowed are flagged as being "zero filled" instead of rejected.
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (page_is_ram(pagenr)) {
+ /*
+ * For disallowed memory regions in the low 1MB range,
+ * request that the page be shown as all zeros.
+ */
+ if (pagenr < 256)
+ return 2;
+
+ return 0;
+ }
+
+ /*
+ * This must follow RAM test, since System RAM is considered a
+ * restricted resource under CONFIG_STRICT_IOMEM.
+ */
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
+ /* Low 1MB bypasses iomem restrictions. */
+ if (pagenr < 256)
+ return 1;
+
+ return 0;
+ }
+
+ return 1;
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+ unsigned long begin_aligned, end_aligned;
+
+ /* Make sure boundaries are page aligned */
+ begin_aligned = PAGE_ALIGN(begin);
+ end_aligned = end & PAGE_MASK;
+
+ if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+ begin = begin_aligned;
+ end = end_aligned;
+ }
+
+ if (begin >= end)
+ return;
+
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable and non-executable first.
+ */
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+ free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
+#endif
+}
+
+void free_initmem(void)
+{
+ free_init_pages("unused kernel",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+ /*
+ * Remember, initrd memory may contain microcode or other useful things.
+ * Before we lose initrd mem, we need to find a place to hold them
+ * now that normal virtual memory is enabled.
+ */
+ save_microcode_in_initrd();
+
+ /*
+ * end could be not aligned, and We can not align that,
+ * decompresser could be confused by aligned initrd_end
+ * We already reserve the end partial page before in
+ * - i386_start_kernel()
+ * - x86_64_start_kernel()
+ * - relocate_initrd()
+ * So here We can do PAGE_ALIGN() safely to get partial page to be freed
+ */
+ free_init_pages("initrd", start, PAGE_ALIGN(end));
+}
+#endif
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
+{
+ /* entry 0 MUST be WB (hardwired to speed up translations) */
+ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
+
+ __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
+ __pte2cachemode_tbl[entry] = cache;
+}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 0000000..cb4ef3d
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,964 @@
+/*
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+
+#include <asm/asm.h>
+#include <asm/bios_ebda.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/bugs.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/init.h>
+
+#include "mm_internal.h"
+
+unsigned long highstart_pfn, highend_pfn;
+
+static noinline int do_test_wp_bit(void);
+
+bool __read_mostly __vmalloc_start_set = false;
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+ pud_t *pud;
+ pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *)alloc_low_page();
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ pud = pud_offset(pgd, 0);
+ BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
+ }
+#endif
+ pud = pud_offset(pgd, 0);
+ pmd_table = pmd_offset(pud, 0);
+
+ return pmd_table;
+}
+
+/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry:
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
+ pte_t *page_table = (pte_t *)alloc_low_page();
+
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+ }
+
+ return pte_offset_kernel(pmd, 0);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ int pgd_idx = pgd_index(vaddr);
+ int pmd_idx = pmd_index(vaddr);
+
+ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ int pte_idx = pte_index(vaddr);
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return one_page_table_init(pmd) + pte_idx;
+}
+
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
+{
+#ifdef CONFIG_HIGHMEM
+ /*
+ * Something (early fixmap) may already have put a pte
+ * page here, which causes the page table allocation
+ * to become nonlinear. Attempt to fix it, and if it
+ * is still nonlinear then we have to bug.
+ */
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+ if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+ && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
+ pte_t *newpte;
+ int i;
+
+ BUG_ON(after_bootmem);
+ newpte = *adr;
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
+
+ paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+ BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+ __flush_tlb_all();
+
+ paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+ pte = newpte;
+ }
+ BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+ && vaddr > fix_to_virt(FIX_KMAP_END)
+ && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+ return pte;
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
+ pgd = pgd_base + pgd_idx;
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+ pmd = pmd + pmd_index(vaddr);
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd++, pmd_idx++) {
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+ pmd, vaddr, pte, &adr);
+
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
+ */
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
+{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long last_map_addr = end;
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
+ int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long pfn;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
+
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
+ /*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
+
+ if (!cpu_has_pse)
+ use_pse = 0;
+
+repeat:
+ pages_2m = pages_4k = 0;
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+
+ if (pfn >= end_pfn)
+ continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pmd += pmd_idx;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
+ pmd++, pmd_idx++) {
+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
+ */
+ if (use_pse) {
+ unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
+
+ pfn &= PMD_MASK >> PAGE_SHIFT;
+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+ PAGE_OFFSET + PAGE_SIZE-1;
+
+ if (is_kernel_text(addr) ||
+ is_kernel_text(addr2))
+ prot = PAGE_KERNEL_LARGE_EXEC;
+
+ pages_2m++;
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
+
+ pfn += PTRS_PER_PTE;
+ continue;
+ }
+ pte = one_page_table_init(pmd);
+
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+ pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
+
+ if (is_kernel_text(addr))
+ prot = PAGE_KERNEL_EXEC;
+
+ pages_4k++;
+ if (mapping_iter == 1) {
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+ } else
+ set_pte(pte, pfn_pte(pfn, prot));
+ }
+ }
+ }
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
+
+ /*
+ * local global flush tlb, which will flush the previous
+ * mappings present in both small and large page TLB's.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
+ return last_map_addr;
+}
+
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+ vaddr), vaddr), vaddr);
+}
+
+static void __init kmap_init(void)
+{
+ unsigned long kmap_vstart;
+
+ /*
+ * Cache the first kmap pte:
+ */
+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+ kmap_prot = PAGE_KERNEL;
+}
+
+#ifdef CONFIG_HIGHMEM
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = PKMAP_BASE;
+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ pud = pud_offset(pgd, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ pte = pte_offset_kernel(pmd, vaddr);
+ pkmap_page_table = pte;
+}
+
+void __init add_highpages_with_active_regions(int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
+ unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+ start_pfn, end_pfn);
+ unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+ start_pfn, end_pfn);
+ for ( ; pfn < e_pfn; pfn++)
+ if (pfn_valid(pfn))
+ free_highmem_page(pfn_to_page(pfn));
+ }
+}
+#else
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+#endif /* CONFIG_HIGHMEM */
+
+void __init native_pagetable_init(void)
+{
+ unsigned long pfn, va;
+ pgd_t *pgd, *base = swapper_pg_dir;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /*
+ * Remove any mappings which extend past the end of physical
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
+ */
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+ pgd = base + pgd_index(va);
+ if (!pgd_present(*pgd))
+ break;
+
+ pud = pud_offset(pgd, va);
+ pmd = pmd_offset(pud, va);
+ if (!pmd_present(*pmd))
+ break;
+
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
+ pte = pte_offset_kernel(pmd, va);
+ if (!pte_present(*pte))
+ break;
+
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
+ pte_clear(NULL, va, pte);
+ }
+ paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
+ paging_init();
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings. Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir. In any case,
+ * paravirt_pagetable_init() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+void __init early_ioremap_page_table_range_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long vaddr, end;
+
+ /*
+ * Fixed mappings, only the page table structure has to be
+ * created - mappings will be set by set_fixmap():
+ */
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+ page_table_range_init(vaddr, end, pgd_base);
+ early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+
+ permanent_kmaps_init(pgd_base);
+}
+
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
+/*
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
+ */
+static void __init lowmem_pfn_init(void)
+{
+ /* max_low_pfn is 0, we already have early_res support */
+ max_low_pfn = max_pfn;
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+static void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+ sparse_memory_present_with_active_regions(0);
+
+#ifdef CONFIG_FLATMEM
+ max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
+#endif
+ __vmalloc_start_set = true;
+
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+}
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+ pagetable_init();
+
+ __flush_tlb_all();
+
+ kmap_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ olpc_dt_build_devicetree();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
+ zone_sizes_init();
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
+ */
+static void __init test_wp_bit(void)
+{
+ printk(KERN_INFO
+ "Checking if this processor honours the WP bit even in supervisor mode...");
+
+ /* Any page-aligned address will do, the test is non-destructive */
+ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
+ boot_cpu_data.wp_works_ok = do_test_wp_bit();
+ clear_fixmap(FIX_WP_TEST);
+
+ if (!boot_cpu_data.wp_works_ok) {
+ printk(KERN_CONT "No.\n");
+ panic("Linux doesn't support CPUs with broken WP.");
+ } else {
+ printk(KERN_CONT "Ok.\n");
+ }
+}
+
+void __init mem_init(void)
+{
+ pci_iommu_alloc();
+
+#ifdef CONFIG_FLATMEM
+ BUG_ON(!mem_map);
+#endif
+ /*
+ * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+ * be done before free_all_bootmem(). Memblock use free low memory for
+ * temporary data (see find_range_array()) and for this purpose can use
+ * pages that was already passed to the buddy allocator, hence marked as
+ * not accessible in the page tables when compiled with
+ * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+ * important here.
+ */
+ set_highmem_pages_init();
+
+ /* this will put all low memory onto the freelists */
+ free_all_bootmem();
+
+ after_bootmem = 1;
+
+ mem_init_print_info(NULL);
+ printk(KERN_INFO "virtual kernel memory layout:\n"
+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#endif
+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+ FIXADDR_START, FIXADDR_TOP,
+ (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+ (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+ VMALLOC_START, VMALLOC_END,
+ (VMALLOC_END - VMALLOC_START) >> 20,
+
+ (unsigned long)__va(0), (unsigned long)high_memory,
+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
+ ((unsigned long)&__init_end -
+ (unsigned long)&__init_begin) >> 10,
+
+ (unsigned long)&_etext, (unsigned long)&_edata,
+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+ (unsigned long)&_text, (unsigned long)&_etext,
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+ BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
+
+ if (boot_cpu_data.wp_works_ok < 0)
+ test_wp_bit();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+{
+ struct pglist_data *pgdata = NODE_DATA(nid);
+ struct zone *zone = pgdata->node_zones +
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ return __add_pages(nid, zone, start_pfn, nr_pages);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
+#endif
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section. Put this after the callers, so that it cannot be inlined.
+ */
+static noinline int do_test_wp_bit(void)
+{
+ char tmp_reg;
+ int flag;
+
+ __asm__ __volatile__(
+ " movb %0, %1 \n"
+ "1: movb %1, %0 \n"
+ " xorl %2, %2 \n"
+ "2: \n"
+ _ASM_EXTABLE(1b,2b)
+ :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+ "=q" (tmp_reg),
+ "=r" (flag)
+ :"2" (1)
+ :"memory");
+
+ return flag;
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly __read_mostly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, start+size);
+
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, start+size);
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext should be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
+
+ kernel_set_to_readonly = 1;
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
+
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ size >> 10);
+ rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
+ mark_nxdata_nx();
+ if (__supported_pte_mask & _PAGE_NX)
+ debug_checkwx();
+}
+#endif
+
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 0000000..ec081fe
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,1378 @@
+/*
+ * linux/arch/x86_64/mm/init.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
+#include <linux/gfp.h>
+#include <linux/kcore.h>
+
+#include <asm/processor.h>
+#include <asm/bios_ebda.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/cacheflush.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (!pmd_present(*pmd))
+ set_pmd(pmd, __pmd(addr | pmd_flag));
+ }
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+ int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pud_t *pud;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ pud = pud_offset(pgd, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+pteval_t __supported_pte_mask __read_mostly = ~0;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+/*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end, int removed)
+{
+ unsigned long address;
+
+ for (address = start; address <= end; address += PGDIR_SIZE) {
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ struct page *page;
+
+ /*
+ * When it is called after memory hot remove, pgd_none()
+ * returns true. In this case (removed == 1), we must clear
+ * the PGD entries in the local PGD level page.
+ */
+ if (pgd_none(*pgd_ref) && !removed)
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ BUG_ON(pgd_page_vaddr(*pgd)
+ != pgd_page_vaddr(*pgd_ref));
+
+ if (removed) {
+ if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ pgd_clear(pgd);
+ } else {
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ }
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
+{
+ void *ptr;
+
+ if (after_bootmem)
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ else
+ ptr = alloc_bootmem_pages(PAGE_SIZE);
+
+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+ panic("set_pte_phys: cannot allocate page data %s\n",
+ after_bootmem ? "after bootmem" : "");
+ }
+
+ pr_debug("spp_getpage %p\n", ptr);
+
+ return ptr;
+}
+
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+{
+ if (pgd_none(*pgd)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ pgd_populate(&init_mm, pgd, pud);
+ if (pud != pud_offset(pgd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+ pud, pud_offset(pgd, 0));
+ }
+ return pud_offset(pgd, vaddr);
+}
+
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+ if (pud_none(*pud)) {
+ pmd_t *pmd = (pmd_t *) spp_getpage();
+ pud_populate(&init_mm, pud, pmd);
+ if (pmd != pmd_offset(pud, 0))
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pmd, pmd_offset(pud, 0));
+ }
+ return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+ if (pmd_none(*pmd)) {
+ pte_t *pte = (pte_t *) spp_getpage();
+ pmd_populate_kernel(&init_mm, pmd, pte);
+ if (pte != pte_offset_kernel(pmd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ }
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = pud_page + pud_index(vaddr);
+ pmd = fill_pmd(pud, vaddr);
+ pte = fill_pte(pmd, vaddr);
+
+ set_pte(pte, new_pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+ pgd_t *pgd;
+ pud_t *pud_page;
+
+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+
+ pgd = pgd_offset_k(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ return;
+ }
+ pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+ set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(vaddr);
+ pud = fill_pud(pgd, vaddr);
+ return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return fill_pte(pmd, vaddr);
+}
+
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ enum page_cache_mode cache)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pgprot_t prot;
+
+ pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+ pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ pud = (pud_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pud = pud_offset(pgd, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+ }
+}
+
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
+}
+
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
+}
+
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_base holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _brk_end. _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+ unsigned long vaddr = __START_KERNEL_map;
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
+ unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt;
+
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
+ for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
+ }
+}
+
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+ pgprot_t prot)
+{
+ unsigned long pages = 0, next;
+ unsigned long last_map_addr = end;
+ int i;
+
+ pte_t *pte = pte_page + pte_index(addr);
+
+ for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+ next = (addr & PAGE_MASK) + PAGE_SIZE;
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
+ }
+
+ /*
+ * We will re-use the existing mapping.
+ * Xen for example has some special requirements, like mapping
+ * pagetable pages as RO. So assume someone who pre-setup
+ * these mappings are more intelligent.
+ */
+ if (pte_val(*pte)) {
+ if (!after_bootmem)
+ pages++;
+ continue;
+ }
+
+ if (0)
+ printk(" pte=%p addr=%lx pte=%016lx\n",
+ pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ pages++;
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+ last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ }
+
+ update_page_count(PG_LEVEL_4K, pages);
+
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+ unsigned long page_size_mask, pgprot_t prot)
+{
+ unsigned long pages = 0, next;
+ unsigned long last_map_addr = end;
+
+ int i = pmd_index(address);
+
+ for (; i < PTRS_PER_PMD; i++, address = next) {
+ pmd_t *pmd = pmd_page + pmd_index(address);
+ pte_t *pte;
+ pgprot_t new_prot = prot;
+
+ next = (address & PMD_MASK) + PMD_SIZE;
+ if (address >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
+ }
+
+ if (pmd_val(*pmd)) {
+ if (!pmd_large(*pmd)) {
+ spin_lock(&init_mm.page_table_lock);
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ last_map_addr = phys_pte_init(pte, address,
+ end, prot);
+ spin_unlock(&init_mm.page_table_lock);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_2M mapping, then we will
+ * use the existing mapping,
+ *
+ * Otherwise, we will split the large page mapping but
+ * use the same existing protection bits except for
+ * large page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_2M)) {
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
+ continue;
+ }
+ new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
+ }
+
+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pmd,
+ pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ spin_unlock(&init_mm.page_table_lock);
+ last_map_addr = next;
+ continue;
+ }
+
+ pte = alloc_low_page();
+ last_map_addr = phys_pte_init(pte, address, end, new_prot);
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_populate_kernel(&init_mm, pmd, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ update_page_count(PG_LEVEL_2M, pages);
+ return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+ unsigned long page_size_mask)
+{
+ unsigned long pages = 0, next;
+ unsigned long last_map_addr = end;
+ int i = pud_index(addr);
+
+ for (; i < PTRS_PER_PUD; i++, addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+ pgprot_t prot = PAGE_KERNEL;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (addr >= end) {
+ if (!after_bootmem &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+ !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ set_pud(pud, __pud(0));
+ continue;
+ }
+
+ if (pud_val(*pud)) {
+ if (!pud_large(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ last_map_addr = phys_pmd_init(pmd, addr, end,
+ page_size_mask, prot);
+ __flush_tlb_all();
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_1G mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the gbpage mapping but use
+ * the same existing protection bits except for large
+ * page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_1G)) {
+ if (!after_bootmem)
+ pages++;
+ last_map_addr = next;
+ continue;
+ }
+ prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
+ }
+
+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pud,
+ pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
+ last_map_addr = next;
+ continue;
+ }
+
+ pmd = alloc_low_page();
+ last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+ prot);
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_populate(&init_mm, pud, pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ __flush_tlb_all();
+
+ update_page_count(PG_LEVEL_1G, pages);
+
+ return last_map_addr;
+}
+
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
+{
+ bool pgd_changed = false;
+ unsigned long next, last_map_addr = end;
+ unsigned long addr;
+
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+ addr = start;
+
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ pud_t *pud;
+
+ next = (start & PGDIR_MASK) + PGDIR_SIZE;
+
+ if (pgd_val(*pgd)) {
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ last_map_addr = phys_pud_init(pud, __pa(start),
+ __pa(end), page_size_mask);
+ continue;
+ }
+
+ pud = alloc_low_page();
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ pgd_populate(&init_mm, pgd, pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(addr, end - 1, 0);
+
+ __flush_tlb_all();
+
+ return last_map_addr;
+}
+
+#ifndef CONFIG_NUMA
+void __init initmem_init(void)
+{
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+}
+#endif
+
+void __init paging_init(void)
+{
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
+
+ /*
+ * clear the default setting with node 0
+ * note: don't use nodes_clear here, that is really clearing when
+ * numa support is not compiled in, and later node_set_state
+ * will not set it back.
+ */
+ node_clear_state(0, N_MEMORY);
+ if (N_MEMORY != N_NORMAL_MEMORY)
+ node_clear_state(0, N_NORMAL_MEMORY);
+
+ zone_sizes_init();
+}
+
+/*
+ * Memory hotplug specific functions
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+ unsigned long end_pfn = PFN_UP(start + size);
+
+ if (end_pfn > max_pfn) {
+ max_pfn = end_pfn;
+ max_low_pfn = end_pfn;
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ }
+}
+
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones +
+ zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
+
+ init_memory_mapping(start, start + size);
+
+ ret = __add_pages(nid, zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start, size);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+ /* bootmem page has reserved flag */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+
+ magic = (unsigned long)page->lru.next;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (pte_val(*pte))
+ return;
+ }
+
+ /* free a pte talbe */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (pmd_val(*pmd))
+ return;
+ }
+
+ /* free a pmd talbe */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (pud_val(*pud))
+ return false;
+ }
+
+ /* free a pud table */
+ free_pagetable(pgd_page(*pgd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pgd_clear(pgd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+ * We mapped [0,1G) memory as identity mapping when
+ * initializing, in arch/x86/kernel/head_64.S. These
+ * pagetables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (IS_ALIGNED(addr, PAGE_SIZE) &&
+ IS_ALIGNED(next, PAGE_SIZE)) {
+ /*
+ * Do not free direct mapping pages since they were
+ * freed when offlining, or simplely not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+ * If we are not removing the whole page, it means
+ * other page structs in this page are being used and
+ * we canot remove them. So fill the unused page_structs
+ * with 0xFD, and remove the page when it is wholly
+ * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_pagetable(pmd_page(*pmd),
+ get_order(PMD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+ remove_pmd_table(pmd_base, addr, next, direct);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ unsigned long next;
+ unsigned long addr;
+ pgd_t *pgd;
+ pud_t *pud;
+ bool pgd_changed = false;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ pud = (pud_t *)pgd_page_vaddr(*pgd);
+ remove_pud_table(pud, addr, next, direct);
+ if (free_pud_table(pud, pgd))
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(start, end - 1, 1);
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end)
+{
+ remove_pagetable(start, end, false);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true);
+}
+
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct zone *zone;
+ int ret;
+
+ zone = page_zone(pfn_to_page(start_pfn));
+ kernel_physical_mapping_remove(start, start + size);
+ ret = __remove_pages(zone, start_pfn, nr_pages);
+ WARN_ON_ONCE(ret);
+
+ return ret;
+}
+#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static struct kcore_list kcore_vsyscall;
+
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
+void __init mem_init(void)
+{
+ pci_iommu_alloc();
+
+ /* clear_bss() already clear the empty_zero_page */
+
+ register_page_bootmem_info();
+
+ /* this will put all memory onto the freelists */
+ free_all_bootmem();
+ after_bootmem = 1;
+
+ /* Register memory areas for /proc/kcore */
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+ PAGE_SIZE, KCORE_OTHER);
+
+ mem_init_print_info(NULL);
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, end);
+
+ /*
+ * Make the kernel identity mapping for text RW. Kernel text
+ * mapping will always be RO. Refer to the comment in
+ * static_protections() in pageattr.c
+ */
+ set_memory_rw(start, (end - start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, end);
+
+ /*
+ * Set the kernel identity mapping for text RO.
+ */
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
+ unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end;
+
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+ kernel_set_to_readonly = 1;
+
+ /*
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
+ *
+ * We align all_end to PMD_SIZE because the existing mapping
+ * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+ * split the PMD and the reminder between _brk_end and the end
+ * of the PMD will remain mapped executable.
+ *
+ * Any PMD which was setup after the one which covers _brk_end
+ * has been zapped already via cleanup_highmem().
+ */
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+
+ rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: again\n");
+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
+
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(text_end)),
+ (unsigned long) __va(__pa_symbol(rodata_start)));
+ free_init_pages("unused kernel",
+ (unsigned long) __va(__pa_symbol(rodata_end)),
+ (unsigned long) __va(__pa_symbol(_sdata)));
+
+ debug_checkwx();
+}
+
+#endif
+
+int kern_addr_valid(unsigned long addr)
+{
+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (above != 0 && above != -1UL)
+ return 0;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud))
+ return 0;
+
+ if (pud_large(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return pfn_valid(pmd_pfn(*pmd));
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ return 0;
+
+ return pfn_valid(pte_pfn(*pte));
+}
+
+static unsigned long probe_memory_block_size(void)
+{
+ /* start from 2g */
+ unsigned long bz = 1UL<<31;
+
+ if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+ pr_info("Using 2GB memory block size for large-memory system\n");
+ return 2UL * 1024 * 1024 * 1024;
+ }
+
+ /* less than 64g installed */
+ if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+ return MIN_MEMORY_BLOCK_SIZE;
+
+ /* get the tail size */
+ while (bz > MIN_MEMORY_BLOCK_SIZE) {
+ if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+ break;
+ bz >>= 1;
+ }
+
+ printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+ return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+ unsigned long end, int node)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(pgd, addr, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ if (p) {
+ pte_t entry;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ continue;
+ }
+ } else if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
+ }
+ pr_warn_once("vmemmap: falling back to regular page backing\n");
+ if (vmemmap_populate_basepages(addr, next, node))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+ int err;
+
+ if (cpu_has_pse)
+ err = vmemmap_populate_hugepages(start, end, node);
+ else
+ err = vmemmap_populate_basepages(start, end, node);
+ if (!err)
+ sync_global_pgds(start, end - 1, 0);
+ return err;
+}
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long size)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + size);
+ unsigned long next;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!cpu_has_pse) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pages = 1 << (get_order(PMD_SIZE));
+ page = pmd_page(*pmd);
+ while (nr_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
+#endif
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
new file mode 100644
index 0000000..9c0ff04
--- /dev/null
+++ b/arch/x86/mm/iomap_32.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/iomap.h>
+#include <asm/pat.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
+ /* There is no way to map greater than 1 << 32 address without PAE */
+ if (base + size > 0x100000000ULL)
+ return 0;
+#endif
+ return 1;
+}
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
+ int ret;
+
+ if (!is_io_mapping_possible(base, size))
+ return -EINVAL;
+
+ ret = io_reserve_memtype(base, base + size, &pcm);
+ if (ret)
+ return ret;
+
+ *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void iomap_free(resource_size_t base, unsigned long size)
+{
+ io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
+
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+ unsigned long vaddr;
+ int idx, type;
+
+ preempt_disable();
+ pagefault_disable();
+
+ type = kmap_atomic_idx_push();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using protections 'prot'
+ */
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+ /*
+ * For non-PAT systems, translate non-WB request to UC- just in
+ * case the caller set the PWT bit to prot directly without using
+ * pgprot_writecombine(). UC- translates to uncached if the MTRR
+ * is UC or WC. UC- gets the real intention, of the user, which is
+ * "WC if the MTRR is WC, UC if you can't do that."
+ */
+ if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
+ prot = __pgprot(__PAGE_KERNEL |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+
+ return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
+}
+EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
+
+void
+iounmap_atomic(void __iomem *kvaddr)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+ kmap_atomic_idx_pop();
+ }
+
+ pagefault_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 0000000..b9c78f3
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,501 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pat.h>
+
+#include "physaddr.h"
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ enum page_cache_mode pcm)
+{
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ int err;
+
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
+ default:
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ err = _set_memory_wt(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
+ break;
+ }
+
+ return err;
+}
+
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+ void *arg)
+{
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, enum page_cache_mode pcm, void *caller)
+{
+ unsigned long offset, vaddr;
+ resource_size_t pfn, last_pfn, last_addr;
+ const resource_size_t unaligned_phys_addr = phys_addr;
+ const unsigned long unaligned_size = size;
+ struct vm_struct *area;
+ enum page_cache_mode new_pcm;
+ pgprot_t prot;
+ int retval;
+ void __iomem *ret_addr;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
+ return NULL;
+
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /*
+ * Don't remap the low PCI/ISA area, it's always mapped..
+ */
+ if (is_ISA_range(phys_addr, last_addr))
+ return (__force void __iomem *)phys_to_virt(phys_addr);
+
+ /*
+ * Don't allow anybody to remap normal RAM that we're using..
+ */
+ pfn = phys_addr >> PAGE_SHIFT;
+ last_pfn = last_addr >> PAGE_SHIFT;
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1) {
+ WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+ &phys_addr, &last_addr);
+ return NULL;
+ }
+
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PHYSICAL_PAGE_MASK;
+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+ pcm, &new_pcm);
+ if (retval) {
+ printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+ return NULL;
+ }
+
+ if (pcm != new_pcm) {
+ if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) {
+ printk(KERN_ERR
+ "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ pcm, new_pcm);
+ goto err_free_memtype;
+ }
+ pcm = new_pcm;
+ }
+
+ prot = PAGE_KERNEL_IO;
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
+ default:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC));
+ break;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+ break;
+ case _PAGE_CACHE_MODE_WC:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ break;
+ }
+
+ /*
+ * Ok, go for it..
+ */
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
+ if (!area)
+ goto err_free_memtype;
+ area->phys_addr = phys_addr;
+ vaddr = (unsigned long) area->addr;
+
+ if (kernel_map_sync_memtype(phys_addr, size, pcm))
+ goto err_free_area;
+
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
+
+ ret_addr = (void __iomem *) (vaddr + offset);
+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+ /*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+ KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
+ return ret_addr;
+err_free_area:
+ free_vm_area(area);
+err_free_memtype:
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
+}
+
+/**
+ * ioremap_nocache - map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+ /*
+ * Ideally, this should be:
+ * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+ *
+ * Till we fix all X drivers to use ioremap_wc(), we will use
+ * UC MINUS. Drivers that are certain they need or can already
+ * be converted over to strong UC can use ioremap_uc().
+ */
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * ioremap_uc - map bus memory into CPU space as strongly uncachable
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC. This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+/**
+ * ioremap_wt - map memory into CPU space write through
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return __ioremap_caller(phys_addr, size,
+ pgprot2cachemode(__pgprot(prot_val)),
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+ struct vm_struct *p, *o;
+
+ if ((void __force *)addr <= high_memory)
+ return;
+
+ /*
+ * __ioremap special-cases the PCI/ISA range by not instantiating a
+ * vm_area and by simply returning an address into the kernel mapping
+ * of ISA space. So handle that here.
+ */
+ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
+ (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+ return;
+
+ addr = (volatile void __iomem *)
+ (PAGE_MASK & (unsigned long __force)addr);
+
+ mmiotrace_iounmap(addr);
+
+ /* Use the vm area unlocked, assuming the caller
+ ensures there isn't another iounmap for the same address
+ in parallel. Reuse of the virtual address is prevented by
+ leaving it in the global lists until we're done with it.
+ cpa takes care of the direct mappings. */
+ p = find_vm_area((void __force *)addr);
+
+ if (!p) {
+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
+ dump_stack();
+ return;
+ }
+
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
+ /* Finally remove it */
+ o = remove_vm_area((void __force *)addr);
+ BUG_ON(p != o || o == NULL);
+ kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+int __init arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+ return cpu_has_gbpages;
+#else
+ return 0;
+#endif
+}
+
+int __init arch_ioremap_pmd_supported(void)
+{
+ return cpu_has_pse;
+}
+
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(phys_addr_t phys)
+{
+ unsigned long start = phys & PAGE_MASK;
+ unsigned long offset = phys & ~PAGE_MASK;
+ void *vaddr;
+
+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+ if (page_is_ram(start >> PAGE_SHIFT))
+ return __va(phys);
+
+ vaddr = ioremap_cache(start, PAGE_SIZE);
+ /* Only add the offset on success and return NULL if the ioremap() failed: */
+ if (vaddr)
+ vaddr += offset;
+
+ return vaddr;
+}
+
+void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
+{
+ if (page_is_ram(phys >> PAGE_SHIFT))
+ return;
+
+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+}
+
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
+
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+ /* Don't assume we're using swapper_pg_dir at this point */
+ pgd_t *base = __va(read_cr3());
+ pgd_t *pgd = &base[pgd_index(addr)];
+ pud_t *pud = pud_offset(pgd, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ return pmd;
+}
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+ return &bm_pte[pte_index(addr)];
+}
+
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+ return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
+void __init early_ioremap_init(void)
+{
+ pmd_t *pmd;
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
+
+ early_ioremap_setup();
+
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ memset(bm_pte, 0, sizeof(bm_pte));
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+ /*
+ * The boot-ioremap range spans multiple pmds, for which
+ * we are not prepared:
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+ != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+ WARN_ON(1);
+ printk(KERN_WARNING "pmd %p != %p\n",
+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+ fix_to_virt(FIX_BTMAP_BEGIN));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
+ fix_to_virt(FIX_BTMAP_END));
+
+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
+ FIX_BTMAP_BEGIN);
+ }
+}
+
+void __init __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ pte = early_ioremap_pte(addr);
+
+ if (pgprot_val(flags))
+ set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
+ else
+ pte_clear(&init_mm, addr, pte);
+ __flush_tlb_one(addr);
+}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 0000000..4e5ac46
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,131 @@
+#define DISABLE_BRANCH_PROFILING
+#define pr_fmt(fmt) "kasan: " fmt
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+static int __init map_range(struct range *range)
+{
+ unsigned long start;
+ unsigned long end;
+
+ start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+ end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+ /*
+ * end + 1 here is intentional. We check several shadow bytes in advance
+ * to slightly speed up fastpath. In some rare cases we could cross
+ * boundary of mapped shadow, so we just map some more here.
+ */
+ return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+}
+
+static void __init clear_pgds(unsigned long start,
+ unsigned long end)
+{
+ for (; start < end; start += PGDIR_SIZE)
+ pgd_clear(pgd_offset_k(start));
+}
+
+static void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+ int i;
+ unsigned long start = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+
+ for (i = pgd_index(start); start < end; i++) {
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+ | _KERNPG_TABLE);
+ start += PGDIR_SIZE;
+ }
+}
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (val == DIE_GPF) {
+ pr_emerg("CONFIG_KASAN_INLINE enabled");
+ pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+ .notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ kasan_map_early_shadow(early_level4_pgt);
+ kasan_map_early_shadow(init_level4_pgt);
+}
+
+void __init kasan_init(void)
+{
+ int i;
+
+#ifdef CONFIG_KASAN_INLINE
+ register_die_notifier(&kasan_die_notifier);
+#endif
+
+ memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+ load_cr3(early_level4_pgt);
+ __flush_tlb_all();
+
+ clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+ for (i = 0; i < E820_X_MAX; i++) {
+ if (pfn_mapped[i].end == 0)
+ break;
+
+ if (map_range(&pfn_mapped[i]))
+ panic("kasan: unable to allocate shadow!");
+ }
+ kasan_populate_zero_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ NUMA_NO_NODE);
+
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
+
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+
+ load_cr3(init_level4_pgt);
+ __flush_tlb_all();
+ init_task.kasan_depth = 0;
+
+ pr_info("KernelAddressSanitizer initialized\n");
+}
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 0000000..520b3bc
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 0000000..dab4187
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,227 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+ KMEMCHECK_ERROR_INVALID_ACCESS,
+ KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+ enum kmemcheck_error_type type;
+
+ union {
+ /* KMEMCHECK_ERROR_INVALID_ACCESS */
+ struct {
+ /* Kind of access that caused the error */
+ enum kmemcheck_shadow state;
+ /* Address and size of the erroneous read */
+ unsigned long address;
+ unsigned int size;
+ };
+ };
+
+ struct pt_regs regs;
+ struct stack_trace trace;
+ unsigned long trace_entries[32];
+
+ /* We compress it to a char. */
+ unsigned char shadow_copy[SHADOW_COPY_SIZE];
+ unsigned char memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == ARRAY_SIZE(error_fifo)) {
+ ++error_missed_count;
+ return NULL;
+ }
+
+ e = &error_fifo[error_wr];
+ if (++error_wr == ARRAY_SIZE(error_fifo))
+ error_wr = 0;
+ ++error_count;
+ return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == 0)
+ return NULL;
+
+ e = &error_fifo[error_rd];
+ if (++error_rd == ARRAY_SIZE(error_fifo))
+ error_rd = 0;
+ --error_count;
+ return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+ static const char *desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
+ [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
+ [KMEMCHECK_SHADOW_FREED] = "freed",
+ };
+
+ static const char short_desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
+ [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
+ [KMEMCHECK_SHADOW_FREED] = 'f',
+ };
+
+ struct kmemcheck_error *e;
+ unsigned int i;
+
+ e = error_next_rd();
+ if (!e)
+ return;
+
+ switch (e->type) {
+ case KMEMCHECK_ERROR_INVALID_ACCESS:
+ printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
+ 8 * e->size, e->state < ARRAY_SIZE(desc) ?
+ desc[e->state] : "(invalid shadow state)",
+ (void *) e->address);
+
+ printk(KERN_WARNING);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+ printk(KERN_CONT "%02x", e->memory_copy[i]);
+ printk(KERN_CONT "\n");
+
+ printk(KERN_WARNING);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+ if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+ printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
+ else
+ printk(KERN_CONT " ?");
+ }
+ printk(KERN_CONT "\n");
+ printk(KERN_WARNING "%*c\n", 2 + 2
+ * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+ break;
+ case KMEMCHECK_ERROR_BUG:
+ printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+ break;
+ }
+
+ __show_regs(&e->regs, 1);
+ print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+ while (error_count > 0)
+ kmemcheck_error_recall();
+
+ if (error_missed_count > 0) {
+ printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+ "the queue was too small\n", error_missed_count);
+ error_missed_count = 0;
+ }
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+ static unsigned long prev_ip;
+
+ struct kmemcheck_error *e;
+ void *shadow_copy;
+ void *memory_copy;
+
+ /* Don't report several adjacent errors from the same EIP. */
+ if (regs->ip == prev_ip)
+ return;
+ prev_ip = regs->ip;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+ e->state = state;
+ e->address = address;
+ e->size = size;
+
+ /* Save regs */
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ /* Save stack trace */
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 0;
+ save_stack_trace_regs(regs, &e->trace);
+
+ /* Round address down to nearest 16 bytes */
+ shadow_copy = kmemcheck_shadow_lookup(address
+ & ~(SHADOW_COPY_SIZE - 1));
+ BUG_ON(!shadow_copy);
+
+ memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+ kmemcheck_show_addr(address);
+ memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+ memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+ kmemcheck_hide_addr(address);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+ struct kmemcheck_error *e;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_BUG;
+
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 1;
+ save_stack_trace(&e->trace);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 0000000..0efc2e8
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 0000000..b4f2e7e
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,659 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+# define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+int __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Limit SMP to use a single CPU. We rely on the fact that this code
+ * runs before SMP is set up.
+ */
+ if (setup_max_cpus > 1) {
+ printk(KERN_INFO
+ "kmemcheck: Limiting number of CPUs to 1.\n");
+ setup_max_cpus = 1;
+ }
+#endif
+
+ if (!kmemcheck_selftest()) {
+ printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+ kmemcheck_enabled = 0;
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "kmemcheck: Initialized\n");
+ return 0;
+}
+
+early_initcall(kmemcheck_init);
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+ int val;
+ int ret;
+
+ if (!str)
+ return -EINVAL;
+
+ ret = kstrtoint(str, 0, &val);
+ if (ret)
+ return ret;
+ kmemcheck_enabled = val;
+ return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+struct kmemcheck_context {
+ bool busy;
+ int balance;
+
+ /*
+ * There can be at most two memory operands to an instruction, but
+ * each address can cross a page boundary -- so we may need up to
+ * four addresses that must be hidden/revealed for each fault.
+ */
+ unsigned long addr[4];
+ unsigned long n_addrs;
+ unsigned long flags;
+
+ /* Data size of the instruction that caused a fault. */
+ unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+ return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+ BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+ data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_show_addr(data->addr[i]);
+
+ return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_hide_addr(data->addr[i]);
+
+ return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely(data->balance != 0)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->balance = 0;
+ return;
+ }
+
+ /*
+ * None of the addresses actually belonged to kmemcheck. Note that
+ * this is not an error.
+ */
+ if (kmemcheck_show_all() == 0)
+ return;
+
+ ++data->balance;
+
+ /*
+ * The IF needs to be cleared as well, so that the faulting
+ * instruction can run "uninterrupted". Otherwise, we might take
+ * an interrupt and start executing that before we've had a chance
+ * to hide the page again.
+ *
+ * NOTE: In the rare case of multiple faults, we must not override
+ * the original flags:
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ data->flags = regs->flags;
+
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+ int n;
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely(data->balance != 1)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->n_addrs = 0;
+ data->balance = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+ return;
+ }
+
+ if (kmemcheck_enabled)
+ n = kmemcheck_hide_all();
+ else
+ n = kmemcheck_show_all();
+
+ if (n == 0)
+ return;
+
+ --data->balance;
+
+ data->n_addrs = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+ /* This will also check the "hidden" flag of the PTE. */
+ return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+ enum kmemcheck_shadow status;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+
+ /* Don't warn about it again. */
+ kmemcheck_shadow_set(shadow, size);
+}
+
+bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
+{
+ enum kmemcheck_shadow status;
+ void *shadow;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return true;
+
+ status = kmemcheck_shadow_test_all(shadow, size);
+
+ return status == KMEMCHECK_SHADOW_INITIALIZED;
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_read_strict(regs, addr, size);
+ return;
+ }
+
+ /*
+ * What we do is basically to split the access across the
+ * two pages and handle each part separately. Yes, this means
+ * that we may now see reads that are 3 + 5 bytes, for
+ * example (and if both are uninitialized, there will be two
+ * reports), but it makes the code a lot simpler.
+ */
+ kmemcheck_read_strict(regs, addr, next_page - addr);
+ kmemcheck_read_strict(regs, next_page, next_addr - next_page);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_write_strict(regs, addr, size);
+ return;
+ }
+
+ /* See comment in kmemcheck_read(). */
+ kmemcheck_write_strict(regs, addr, next_page - addr);
+ kmemcheck_write_strict(regs, next_page, next_addr - next_page);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+ unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+ uint8_t shadow[8];
+ enum kmemcheck_shadow status;
+
+ unsigned long page;
+ unsigned long next_addr;
+ unsigned long next_page;
+
+ uint8_t *x;
+ unsigned int i;
+ unsigned int n;
+
+ BUG_ON(size > sizeof(shadow));
+
+ page = src_addr & PAGE_MASK;
+ next_addr = src_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < size; ++i)
+ shadow[i] = x[i];
+ } else {
+ for (i = 0; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ } else {
+ n = next_page - src_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < n; ++i)
+ shadow[i] = x[i];
+ } else {
+ /* Not tracked */
+ for (i = 0; i < n; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i)
+ shadow[i] = x[i - n];
+ } else {
+ /* Not tracked */
+ for (i = n; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ page = dst_addr & PAGE_MASK;
+ next_addr = dst_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < size; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ } else {
+ n = next_page - dst_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < n; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i) {
+ x[i - n] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ }
+
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, src_addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+ KMEMCHECK_READ,
+ KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+ unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+ const uint8_t *insn;
+ const uint8_t *insn_primary;
+ unsigned int size;
+
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+ /* Recursive fault -- ouch. */
+ if (data->busy) {
+ kmemcheck_show_addr(fallback_address);
+ kmemcheck_error_save_bug(regs);
+ return;
+ }
+
+ data->busy = true;
+
+ insn = (const uint8_t *) regs->ip;
+ insn_primary = kmemcheck_opcode_get_primary(insn);
+
+ kmemcheck_opcode_decode(insn, &size);
+
+ switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+ /* AND, OR, XOR */
+ /*
+ * Unfortunately, these instructions have to be excluded from
+ * our regular checking since they access only some (and not
+ * all) bits. This clears out "bogus" bitfield-access warnings.
+ */
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ switch ((insn_primary[1] >> 3) & 7) {
+ /* OR */
+ case 1:
+ /* AND */
+ case 4:
+ /* XOR */
+ case 6:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+
+ /* ADD */
+ case 0:
+ /* ADC */
+ case 2:
+ /* SBB */
+ case 3:
+ /* SUB */
+ case 5:
+ /* CMP */
+ case 7:
+ break;
+ }
+ break;
+#endif
+
+ /* MOVS, MOVSB, MOVSW, MOVSD */
+ case 0xa4:
+ case 0xa5:
+ /*
+ * These instructions are special because they take two
+ * addresses, but we only get one page fault.
+ */
+ kmemcheck_copy(regs, regs->si, regs->di, size);
+ goto out;
+
+ /* CMPS, CMPSB, CMPSW, CMPSD */
+ case 0xa6:
+ case 0xa7:
+ kmemcheck_read(regs, regs->si, size);
+ kmemcheck_read(regs, regs->di, size);
+ goto out;
+ }
+
+ /*
+ * If the opcode isn't special in any way, we use the data from the
+ * page fault handler to determine the address and type of memory
+ * access.
+ */
+ switch (fallback_method) {
+ case KMEMCHECK_READ:
+ kmemcheck_read(regs, fallback_address, size);
+ goto out;
+ case KMEMCHECK_WRITE:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+ }
+
+out:
+ data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ pte_t *pte;
+
+ /*
+ * XXX: Is it safe to assume that memory accesses from virtual 86
+ * mode or non-kernel code segments will _never_ access kernel
+ * memory (e.g. tracked pages)? For now, we need this to avoid
+ * invoking kmemcheck for PnP BIOS calls.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return false;
+ if (regs->cs != __KERNEL_CS)
+ return false;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return false;
+
+ WARN_ON_ONCE(in_nmi());
+
+ if (error_code & 2)
+ kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+ else
+ kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+ kmemcheck_show(regs);
+ return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+ if (!kmemcheck_active(regs))
+ return false;
+
+ /* We're done. */
+ kmemcheck_hide(regs);
+ return true;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 0000000..324aa3f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+ return
+ /* Group 1 */
+ b == 0xf0 || b == 0xf2 || b == 0xf3
+ /* Group 2 */
+ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+ || b == 0x64 || b == 0x65
+ /* Group 3 */
+ || b == 0x66
+ /* Group 4 */
+ || b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+ /* Default operand size */
+ int operand_size_override = 4;
+
+ /* prefixes */
+ for (; opcode_is_prefix(*op); ++op) {
+ if (*op == 0x66)
+ operand_size_override = 2;
+ }
+
+ /* REX prefix */
+ if (opcode_is_rex_prefix(*op)) {
+ uint8_t rex = *op;
+
+ ++op;
+ if (rex & REX_W) {
+ switch (*op) {
+ case 0x63:
+ *size = 4;
+ return;
+ case 0x0f:
+ ++op;
+
+ switch (*op) {
+ case 0xb6:
+ case 0xbe:
+ *size = 1;
+ return;
+ case 0xb7:
+ case 0xbf:
+ *size = 2;
+ return;
+ }
+
+ break;
+ }
+
+ *size = 8;
+ return;
+ }
+ }
+
+ /* escape opcode */
+ if (*op == 0x0f) {
+ ++op;
+
+ /*
+ * This is move with zero-extend and sign-extend, respectively;
+ * we don't have to think about 0xb6/0xbe, because this is
+ * already handled in the conditional below.
+ */
+ if (*op == 0xb7 || *op == 0xbf)
+ operand_size_override = 2;
+ }
+
+ *size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+ /* skip prefixes */
+ while (opcode_is_prefix(*op))
+ ++op;
+ if (opcode_is_rex_prefix(*op))
+ ++op;
+ return op;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 0000000..6956aad
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 0000000..4ead26e
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (!pte)
+ return NULL;
+ if (level != PG_LEVEL_4K)
+ return NULL;
+ if (!pte_hidden(*pte))
+ return NULL;
+
+ return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 0000000..9f59664
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 0000000..aef7140
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,70 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+ unsigned int expected_size;
+ const uint8_t *insn;
+ const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+ /* REP MOVS */
+ {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
+ {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
+
+ /* MOVZX / MOVZXD */
+ {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
+ {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
+ {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+ /* MOVZX / MOVZXD */
+ {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
+ {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+ unsigned size;
+
+ kmemcheck_opcode_decode(op->insn, &size);
+
+ if (size == op->expected_size)
+ return true;
+
+ printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+ op->desc, op->expected_size, size);
+ return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+ bool pass = true;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+ pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+ return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+ bool pass = true;
+
+ pass = pass && selftest_opcodes_all();
+
+ return pass;
+}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 0000000..8fed4fe
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 0000000..aec1242
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,173 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+ pte_t *pte;
+ struct page *page;
+
+ if (!virt_addr_valid(address))
+ return NULL;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return NULL;
+
+ page = virt_to_page(address);
+ if (!page->shadow)
+ return NULL;
+ return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+ enum kmemcheck_shadow status)
+{
+ unsigned long addr = (unsigned long) address;
+ unsigned long last_addr = addr + n - 1;
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long last_page = last_addr & PAGE_MASK;
+ unsigned int first_n;
+ void *shadow;
+
+ /* If the memory range crosses a page boundary, stop there. */
+ if (page == last_page)
+ first_n = n;
+ else
+ first_n = page + PAGE_SIZE - addr;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, first_n);
+
+ addr += first_n;
+ n -= first_n;
+
+ /* Do full-page memset()s. */
+ while (n >= PAGE_SIZE) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, PAGE_SIZE);
+
+ addr += PAGE_SIZE;
+ n -= PAGE_SIZE;
+ }
+
+ /* Do the remaining page, if any. */
+ if (n > 0) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, n);
+ }
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
+ /*
+ * Make sure _some_ bytes are initialized. Gcc frequently generates
+ * code to access neighboring bytes.
+ */
+ for (i = 0; i < size; ++i) {
+ if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+
+ return x[0];
+#else
+ return kmemcheck_shadow_test_all(shadow, size);
+#endif
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
+ /* All bytes must be initialized. */
+ for (i = 0; i < size; ++i) {
+ if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+
+ return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+ for (i = 0; i < size; ++i)
+ x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 0000000..ff0b2f7
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,18 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+ KMEMCHECK_SHADOW_UNALLOCATED,
+ KMEMCHECK_SHADOW_UNINITIALIZED,
+ KMEMCHECK_SHADOW_INITIALIZED,
+ KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
+ unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 0000000..ddb2244
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,619 @@
+/* Support for MMIO probes.
+ * Benfit many code from kprobes
+ * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
+ * 2007 Alexander Eichner
+ * 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <linux/errno.h>
+#include <asm/debugreg.h>
+#include <linux/mmiotrace.h>
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+ struct list_head list;
+ struct kmmio_fault_page *release_next;
+ unsigned long addr; /* the requested address */
+ pteval_t old_presence; /* page presence prior to arming */
+ bool armed;
+
+ /*
+ * Number of times this page has been registered as a part
+ * of a probe. If zero, page is disarmed and this may be freed.
+ * Used only by writers (RCU) and post_kmmio_handler().
+ * Protected by kmmio_lock, when linked into kmmio_page_table.
+ */
+ int count;
+
+ bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+ struct rcu_head rcu;
+ struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+ struct kmmio_fault_page *fpage;
+ struct kmmio_probe *probe;
+ unsigned long saved_flags;
+ unsigned long addr;
+ int active;
+};
+
+static DEFINE_SPINLOCK(kmmio_lock);
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long addr)
+{
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+
+ return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+ struct kmmio_probe *p;
+ list_for_each_entry_rcu(p, &kmmio_probes, list) {
+ if (addr >= p->addr && addr < (p->addr + p->len))
+ return p;
+ }
+ return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
+{
+ struct list_head *head;
+ struct kmmio_fault_page *f;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+ head = kmmio_page_list(addr);
+ list_for_each_entry_rcu(f, head, list) {
+ if (f->addr == addr)
+ return f;
+ }
+ return NULL;
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+ pmdval_t v = pmd_val(*pmd);
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
+ set_pmd(pmd, __pmd(v));
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
+{
+ pteval_t v = pte_val(*pte);
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
+ set_pte_atomic(pte, __pte(v));
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(f->addr, &level);
+
+ if (!pte) {
+ pr_err("no pte for addr 0x%08lx\n", f->addr);
+ return -1;
+ }
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+ break;
+ case PG_LEVEL_4K:
+ clear_pte_presence(pte, clear, &f->old_presence);
+ break;
+ default:
+ pr_err("unexpected page level 0x%x.\n", level);
+ return -1;
+ }
+
+ __flush_tlb_one(f->addr);
+ return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret;
+ WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
+ if (f->armed) {
+ pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
+ }
+ ret = clear_page_presence(f, true);
+ WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+ f->addr);
+ f->armed = true;
+ return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret = clear_page_presence(f, false);
+ WARN_ONCE(ret < 0,
+ KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
+ f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefecthing may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+ struct kmmio_context *ctx;
+ struct kmmio_fault_page *faultpage;
+ int ret = 0; /* default to fault not handled */
+ unsigned long page_base = addr;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+ if (!pte)
+ return -EINVAL;
+ page_base &= page_level_mask(l);
+
+ /*
+ * Preemption is now disabled to prevent process switch during
+ * single stepping. We can only handle one active kmmio trace
+ * per cpu, so ensure that we finish it before something else
+ * gets to run. We also hold the RCU read lock over single
+ * stepping to avoid looking up the probe and kmmio_fault_page
+ * again.
+ */
+ preempt_disable();
+ rcu_read_lock();
+
+ faultpage = get_kmmio_fault_page(page_base);
+ if (!faultpage) {
+ /*
+ * Either this page fault is not caused by kmmio, or
+ * another CPU just pulled the kmmio probe from under
+ * our feet. The latter case should not be possible.
+ */
+ goto no_kmmio;
+ }
+
+ ctx = &get_cpu_var(kmmio_ctx);
+ if (ctx->active) {
+ if (page_base == ctx->addr) {
+ /*
+ * A second fault on the same page means some other
+ * condition needs handling by do_page_fault(), the
+ * page really not being present is the most common.
+ */
+ pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
+
+ if (!faultpage->old_presence)
+ pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+ addr, smp_processor_id());
+ } else {
+ /*
+ * Prevent overwriting already in-flight context.
+ * This should not happen, let's hope disarming at
+ * least prevents a panic.
+ */
+ pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+ smp_processor_id(), addr);
+ pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+ disarm_kmmio_fault_page(faultpage);
+ }
+ goto no_kmmio_ctx;
+ }
+ ctx->active++;
+
+ ctx->fpage = faultpage;
+ ctx->probe = get_kmmio_probe(page_base);
+ ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+ ctx->addr = page_base;
+
+ if (ctx->probe && ctx->probe->pre_handler)
+ ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+ /*
+ * Enable single-stepping and disable interrupts for the faulting
+ * context. Local interrupts must not get enabled during stepping.
+ */
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+
+ /* Now we set present bit in PTE and single step. */
+ disarm_kmmio_fault_page(ctx->fpage);
+
+ /*
+ * If another cpu accesses the same page while we are stepping,
+ * the access will not be caught. It will simply succeed and the
+ * only downside is we lose the event. If this becomes a problem,
+ * the user should drop to single cpu before tracing.
+ */
+
+ put_cpu_var(kmmio_ctx);
+ return 1; /* fault handled */
+
+no_kmmio_ctx:
+ put_cpu_var(kmmio_ctx);
+no_kmmio:
+ rcu_read_unlock();
+ preempt_enable_no_resched();
+ return ret;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+ int ret = 0;
+ struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+
+ if (!ctx->active) {
+ /*
+ * debug traps without an active context are due to either
+ * something external causing them (f.e. using a debugger while
+ * mmio tracing enabled), or erroneous behaviour
+ */
+ pr_warning("unexpected debug trap on CPU %d.\n",
+ smp_processor_id());
+ goto out;
+ }
+
+ if (ctx->probe && ctx->probe->post_handler)
+ ctx->probe->post_handler(ctx->probe, condition, regs);
+
+ /* Prevent racing against release_kmmio_fault_page(). */
+ spin_lock(&kmmio_lock);
+ if (ctx->fpage->count)
+ arm_kmmio_fault_page(ctx->fpage);
+ spin_unlock(&kmmio_lock);
+
+ regs->flags &= ~X86_EFLAGS_TF;
+ regs->flags |= ctx->saved_flags;
+
+ /* These were acquired in kmmio_handler(). */
+ ctx->active--;
+ BUG_ON(ctx->active);
+ rcu_read_unlock();
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, flags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ ret = 1;
+out:
+ put_cpu_var(kmmio_ctx);
+ return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long addr)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (f) {
+ if (!f->count)
+ arm_kmmio_fault_page(f);
+ f->count++;
+ return 0;
+ }
+
+ f = kzalloc(sizeof(*f), GFP_ATOMIC);
+ if (!f)
+ return -1;
+
+ f->count = 1;
+ f->addr = addr;
+
+ if (arm_kmmio_fault_page(f)) {
+ kfree(f);
+ return -1;
+ }
+
+ list_add_rcu(&f->list, kmmio_page_list(f->addr));
+
+ return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long addr,
+ struct kmmio_fault_page **release_list)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (!f)
+ return;
+
+ f->count--;
+ BUG_ON(f->count < 0);
+ if (!f->count) {
+ disarm_kmmio_fault_page(f);
+ if (!f->scheduled_for_release) {
+ f->release_next = *release_list;
+ *release_list = f;
+ f->scheduled_for_release = true;
+ }
+ }
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ int ret = 0;
+ unsigned long size = 0;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ unsigned int l;
+ pte_t *pte;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ if (get_kmmio_probe(p->addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ kmmio_count++;
+ list_add_rcu(&p->list, &kmmio_probes);
+ while (size < size_lim) {
+ if (add_kmmio_fault_page(p->addr + size))
+ pr_err("Unable to set page fault.\n");
+ size += page_level_size(l);
+ }
+out:
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+ /*
+ * XXX: What should I do here?
+ * Here was a call to global_flush_tlb(), but it does not exist
+ * anymore. It seems it's not needed after all.
+ */
+ return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr = container_of(
+ head,
+ struct kmmio_delayed_release,
+ rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+ while (f) {
+ struct kmmio_fault_page *next = f->release_next;
+ BUG_ON(f->count);
+ kfree(f);
+ f = next;
+ }
+ kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr =
+ container_of(head, struct kmmio_delayed_release, rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+ struct kmmio_fault_page **prevp = &dr->release_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ while (f) {
+ if (!f->count) {
+ list_del_rcu(&f->list);
+ prevp = &f->release_next;
+ } else {
+ *prevp = f->release_next;
+ f->release_next = NULL;
+ f->scheduled_for_release = false;
+ }
+ f = *prevp;
+ }
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+
+ /* This is the real RCU destroy call. */
+ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ * Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ * Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ * Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ unsigned long size = 0;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ struct kmmio_fault_page *release_list = NULL;
+ struct kmmio_delayed_release *drelease;
+ unsigned int l;
+ pte_t *pte;
+
+ pte = lookup_address(p->addr, &l);
+ if (!pte)
+ return;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ while (size < size_lim) {
+ release_kmmio_fault_page(p->addr + size, &release_list);
+ size += page_level_size(l);
+ }
+ list_del_rcu(&p->list);
+ kmmio_count--;
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+
+ if (!release_list)
+ return;
+
+ drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+ if (!drelease) {
+ pr_crit("leaking kmmio_fault_page objects.\n");
+ return;
+ }
+ drelease->release_list = release_list;
+
+ /*
+ * This is not really RCU here. We have just disarmed a set of
+ * pages so that they cannot trigger page faults anymore. However,
+ * we cannot remove the pages from kmmio_page_table,
+ * because a probe hit might be in flight on another CPU. The
+ * pages are collected into a list, and they will be removed from
+ * kmmio_page_table when it is certain that no probe hit related to
+ * these pages can be in flight. RCU grace period sounds like a
+ * good choice.
+ *
+ * If we removed the pages too early, kmmio page fault handler might
+ * not find the respective kmmio_fault_page and determine it's not
+ * a kmmio fault, when it actually is. This would lead to madness.
+ */
+ call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+ struct die_args *arg = args;
+ unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
+
+ if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
+ if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ *dr6_p &= ~DR_STEP;
+ return NOTIFY_STOP;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+ .notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+ int i;
+
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+ return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+ int i;
+
+ unregister_die_notifier(&nb_die);
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+ WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+ KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+ }
+}
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 0000000..62474ba
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,21 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
new file mode 100644
index 0000000..307f60e
--- /dev/null
+++ b/arch/x86/mm/mmap.c
@@ -0,0 +1,123 @@
+/*
+ * Flexible mmap layout support
+ *
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
+ *
+ * Copyright 2003-2009 Red Hat Inc.
+ * All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/limits.h>
+#include <linux/sched.h>
+#include <asm/elf.h>
+
+struct va_alignment __read_mostly va_align = {
+ .flags = -1,
+};
+
+static unsigned long stack_maxrandom_size(void)
+{
+ unsigned long max = 0;
+ if ((current->flags & PF_RANDOMIZE) &&
+ !(current->personality & ADDR_NO_RANDOMIZE)) {
+ max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+ }
+
+ return max;
+}
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static int mmap_is_legacy(void)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+ /*
+ * 8 bits of randomness in 32bit mmaps, 20 address space bits
+ * 28 bits of randomness in 64bit mmaps, 40 address space bits
+ */
+ if (mmap_is_ia32())
+ rnd = (unsigned long)get_random_int() % (1<<8);
+ else
+ rnd = (unsigned long)get_random_int() % (1<<28);
+
+ return rnd << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base(unsigned long rnd)
+{
+ unsigned long gap = rlimit(RLIMIT_STACK);
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
+
+ if (mmap_is_legacy()) {
+ mm->mmap_base = mm->mmap_legacy_base;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MPX)
+ return "[mpx]";
+ return NULL;
+}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 0000000..0057a7a
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,478 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ * Jeff Muizelaar, 2006, 2007
+ * Pekka Paalanen, 2008 <pq@iki.fi>
+ *
+ * Derived from the read-mod example from relay-examples by Tom Zanussi.
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#define DEBUG 1
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kallsyms.h>
+#include <asm/pgtable.h>
+#include <linux/mmiotrace.h>
+#include <asm/e820.h> /* for ISA_START_ADDRESS */
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+
+struct trap_reason {
+ unsigned long addr;
+ unsigned long ip;
+ enum reason_type type;
+ int active_traces;
+};
+
+struct remap_trace {
+ struct list_head list;
+ struct kmmio_probe probe;
+ resource_size_t phys;
+ unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list); /* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ * and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long filter_offset;
+static bool nommiotrace;
+static bool trace_pc;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+module_param(trace_pc, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
+
+static bool is_enabled(void)
+{
+ return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(address, &level);
+
+ if (!pte) {
+ pr_err("Error in %s: no pte for page 0x%08lx\n",
+ __func__, address);
+ return;
+ }
+
+ if (level == PG_LEVEL_2M) {
+ pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+ address);
+ BUG();
+ }
+ pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+ address,
+ (unsigned long long)pte_val(*pte),
+ (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+ const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+ addr, my_reason->addr);
+ print_pte(addr);
+ print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
+ print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
+#ifdef __i386__
+ pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
+ regs->ax, regs->bx, regs->cx, regs->dx);
+ pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+ regs->si, regs->di, regs->bp, regs->sp);
+#else
+ pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
+ regs->ax, regs->cx, regs->dx);
+ pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
+ regs->si, regs->di, regs->bp, regs->sp);
+#endif
+ put_cpu_var(pf_reason);
+ BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+ unsigned long addr)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+ const unsigned long instptr = instruction_pointer(regs);
+ const enum reason_type type = get_ins_type(instptr);
+ struct remap_trace *trace = p->private;
+
+ /* it doesn't make sense to have more than one active trace per cpu */
+ if (my_reason->active_traces)
+ die_kmmio_nesting_error(regs, addr);
+ else
+ my_reason->active_traces++;
+
+ my_reason->type = type;
+ my_reason->addr = addr;
+ my_reason->ip = instptr;
+
+ my_trace->phys = addr - trace->probe.addr + trace->phys;
+ my_trace->map_id = trace->id;
+
+ /*
+ * Only record the program counter when requested.
+ * It may taint clean-room reverse engineering.
+ */
+ if (trace_pc)
+ my_trace->pc = instptr;
+ else
+ my_trace->pc = 0;
+
+ /*
+ * XXX: the timestamp recorded will be *after* the tracing has been
+ * done, not at the time we hit the instruction. SMP implications
+ * on event ordering?
+ */
+
+ switch (type) {
+ case REG_READ:
+ my_trace->opcode = MMIO_READ;
+ my_trace->width = get_ins_mem_width(instptr);
+ break;
+ case REG_WRITE:
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_reg_val(instptr, regs);
+ break;
+ case IMM_WRITE:
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_imm_val(instptr);
+ break;
+ default:
+ {
+ unsigned char *ip = (unsigned char *)instptr;
+ my_trace->opcode = MMIO_UNKNOWN_OP;
+ my_trace->width = 0;
+ my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
+ *(ip + 2);
+ }
+ }
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+ struct pt_regs *regs)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+
+ /* this should always return the active_trace count to 0 */
+ my_reason->active_traces--;
+ if (my_reason->active_traces) {
+ pr_emerg("unexpected post handler");
+ BUG();
+ }
+
+ switch (my_reason->type) {
+ case REG_READ:
+ my_trace->value = get_ins_reg_val(my_reason->ip, regs);
+ break;
+ default:
+ break;
+ }
+
+ mmio_trace_rw(my_trace);
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ static atomic_t next_id;
+ struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+ /* These are page-unaligned. */
+ struct mmiotrace_map map = {
+ .phys = offset,
+ .virt = (unsigned long)addr,
+ .len = size,
+ .opcode = MMIO_PROBE
+ };
+
+ if (!trace) {
+ pr_err("kmalloc failed in ioremap\n");
+ return;
+ }
+
+ *trace = (struct remap_trace) {
+ .probe = {
+ .addr = (unsigned long)addr,
+ .len = size,
+ .pre_handler = pre,
+ .post_handler = post,
+ .private = trace
+ },
+ .phys = offset,
+ .id = atomic_inc_return(&next_id)
+ };
+ map.map_id = trace->id;
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled()) {
+ kfree(trace);
+ goto not_enabled;
+ }
+
+ mmio_trace_mapping(&map);
+ list_add_tail(&trace->list, &trace_list);
+ if (!nommiotrace)
+ register_kmmio_probe(&trace->probe);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ if (!is_enabled()) /* recheck and proper locking in *_core() */
+ return;
+
+ pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+ (unsigned long long)offset, size, addr);
+ if ((filter_offset) && (offset != filter_offset))
+ return;
+ ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+ struct mmiotrace_map map = {
+ .phys = 0,
+ .virt = (unsigned long)addr,
+ .len = 0,
+ .opcode = MMIO_UNPROBE
+ };
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+ struct remap_trace *found_trace = NULL;
+
+ pr_debug("Unmapping %p.\n", addr);
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled())
+ goto not_enabled;
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ if ((unsigned long)addr == trace->probe.addr) {
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ list_del(&trace->list);
+ found_trace = trace;
+ break;
+ }
+ }
+ map.map_id = (found_trace) ? found_trace->id : -1;
+ mmio_trace_mapping(&map);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+ if (found_trace) {
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+ kfree(found_trace);
+ }
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+ might_sleep();
+ if (is_enabled()) /* recheck and proper locking in *_core() */
+ iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+
+ /*
+ * No locking required, because the caller ensures we are in a
+ * critical section via mutex, and is_enabled() is false,
+ * i.e. nothing can traverse or modify this list.
+ * Caller also ensures is_enabled() cannot change.
+ */
+ list_for_each_entry(trace, &trace_list, list) {
+ pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+ trace->probe.addr, trace->probe.len);
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ }
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ list_del(&trace->list);
+ kfree(trace);
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (downed_cpus == NULL &&
+ !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+ pr_notice("Failed to allocate mask\n");
+ goto out;
+ }
+
+ get_online_cpus();
+ cpumask_copy(downed_cpus, cpu_online_mask);
+ cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+ if (num_online_cpus() > 1)
+ pr_notice("Disabling non-boot CPUs...\n");
+ put_online_cpus();
+
+ for_each_cpu(cpu, downed_cpus) {
+ err = cpu_down(cpu);
+ if (!err)
+ pr_info("CPU%d is down.\n", cpu);
+ else
+ pr_err("Error taking CPU%d down: %d\n", cpu, err);
+ }
+out:
+ if (num_online_cpus() > 1)
+ pr_warning("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
+ return;
+ pr_notice("Re-enabling CPUs...\n");
+ for_each_cpu(cpu, downed_cpus) {
+ err = cpu_up(cpu);
+ if (!err)
+ pr_info("enabled CPU%d.\n", cpu);
+ else
+ pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+ }
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+ if (num_online_cpus() > 1)
+ pr_warning("multiple CPUs are online, may miss events. "
+ "Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (is_enabled())
+ goto out;
+
+ if (nommiotrace)
+ pr_info("MMIO tracing disabled.\n");
+ kmmio_init();
+ enter_uniprocessor();
+ spin_lock_irq(&trace_lock);
+ atomic_inc(&mmiotrace_enabled);
+ spin_unlock_irq(&trace_lock);
+ pr_info("enabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (!is_enabled())
+ goto out;
+
+ spin_lock_irq(&trace_lock);
+ atomic_dec(&mmiotrace_enabled);
+ BUG_ON(is_enabled());
+ spin_unlock_irq(&trace_lock);
+
+ clear_trace_list(); /* guarantees: no more kmmio callbacks */
+ leave_uniprocessor();
+ kmmio_cleanup();
+ pr_info("disabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
new file mode 100644
index 0000000..7ed47b1
--- /dev/null
+++ b/arch/x86/mm/mpx.c
@@ -0,0 +1,1032 @@
+/*
+ * mpx.c - Memory Protection eXtensions
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Qiaowei Ren <qiaowei.ren@intel.com>
+ * Dave Hansen <dave.hansen@intel.com>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+
+#include <asm/insn.h>
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mpx.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/mpx.h>
+
+static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BD_SIZE_BYTES_64;
+ else
+ return MPX_BD_SIZE_BYTES_32;
+}
+
+static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BT_SIZE_BYTES_64;
+ else
+ return MPX_BT_SIZE_BYTES_32;
+}
+
+/*
+ * This is really a simplified "vm_mmap". it only handles MPX
+ * bounds tables (the bounds directory is user-allocated).
+ */
+static unsigned long mpx_mmap(unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, populate;
+
+ /* Only bounds table can be allocated here */
+ if (len != mpx_bt_size_bytes(mm))
+ return -EINVAL;
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
+}
+
+enum reg_type {
+ REG_TYPE_RM = 0,
+ REG_TYPE_INDEX,
+ REG_TYPE_BASE,
+};
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = 0;
+
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+ };
+ int nr_registers = ARRAY_SIZE(regoff);
+ /*
+ * Don't possibly decode a 32-bit instructions as
+ * reading a 64-bit-only register.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+ nr_registers -= 8;
+
+ switch (type) {
+ case REG_TYPE_RM:
+ regno = X86_MODRM_RM(insn->modrm.value);
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_INDEX:
+ regno = X86_SIB_INDEX(insn->sib.value);
+ if (X86_REX_X(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_BASE:
+ regno = X86_SIB_BASE(insn->sib.value);
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ default:
+ pr_err("invalid register type");
+ BUG();
+ break;
+ }
+
+ if (regno >= nr_registers) {
+ WARN_ONCE(1, "decoded an instruction with an invalid register");
+ return -EINVAL;
+ }
+ return regoff[regno];
+}
+
+/*
+ * return the address being referenced be instruction
+ * for rm=3 returning the content of the rm reg
+ * for rm!=3 calculates the address using SIB and Disp
+ */
+static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long addr, base, indx;
+ int addr_offset, base_offset, indx_offset;
+ insn_byte_t sib;
+
+ insn_get_modrm(insn);
+ insn_get_sib(insn);
+ sib = insn->sib.value;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (addr_offset < 0)
+ goto out_err;
+ addr = regs_get_register(regs, addr_offset);
+ } else {
+ if (insn->sib.nbytes) {
+ base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+ if (base_offset < 0)
+ goto out_err;
+
+ indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+ if (indx_offset < 0)
+ goto out_err;
+
+ base = regs_get_register(regs, base_offset);
+ indx = regs_get_register(regs, indx_offset);
+ addr = base + indx * (1 << X86_SIB_SCALE(sib));
+ } else {
+ addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (addr_offset < 0)
+ goto out_err;
+ addr = regs_get_register(regs, addr_offset);
+ }
+ addr += insn->displacement.value;
+ }
+ return (void __user *)addr;
+out_err:
+ return (void __user *)-1;
+}
+
+static int mpx_insn_decode(struct insn *insn,
+ struct pt_regs *regs)
+{
+ unsigned char buf[MAX_INSN_SIZE];
+ int x86_64 = !test_thread_flag(TIF_IA32);
+ int not_copied;
+ int nr_copied;
+
+ not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+ /*
+ * The decoder _should_ fail nicely if we pass it a short buffer.
+ * But, let's not depend on that implementation detail. If we
+ * did not get anything, just error out now.
+ */
+ if (!nr_copied)
+ return -EFAULT;
+ insn_init(insn, buf, nr_copied, x86_64);
+ insn_get_length(insn);
+ /*
+ * copy_from_user() tries to get as many bytes as we could see in
+ * the largest possible instruction. If the instruction we are
+ * after is shorter than that _and_ we attempt to copy from
+ * something unreadable, we might get a short read. This is OK
+ * as long as the read did not stop in the middle of the
+ * instruction. Check to see if we got a partial instruction.
+ */
+ if (nr_copied < insn->length)
+ return -EFAULT;
+
+ insn_get_opcode(insn);
+ /*
+ * We only _really_ need to decode bndcl/bndcn/bndcu
+ * Error out on anything else.
+ */
+ if (insn->opcode.bytes[0] != 0x0f)
+ goto bad_opcode;
+ if ((insn->opcode.bytes[1] != 0x1a) &&
+ (insn->opcode.bytes[1] != 0x1b))
+ goto bad_opcode;
+
+ return 0;
+bad_opcode:
+ return -EINVAL;
+}
+
+/*
+ * If a bounds overflow occurs then a #BR is generated. This
+ * function decodes MPX instructions to get violation address
+ * and set this address into extended struct siginfo.
+ *
+ * Note that this is not a super precise way of doing this.
+ * Userspace could have, by the time we get here, written
+ * anything it wants in to the instructions. We can not
+ * trust anything about it. They might not be valid
+ * instructions or might encode invalid registers, etc...
+ *
+ * The caller is expected to kfree() the returned siginfo_t.
+ */
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+{
+ const struct mpx_bndreg_state *bndregs;
+ const struct mpx_bndreg *bndreg;
+ siginfo_t *info = NULL;
+ struct insn insn;
+ uint8_t bndregno;
+ int err;
+
+ err = mpx_insn_decode(&insn, regs);
+ if (err)
+ goto err_out;
+
+ /*
+ * We know at this point that we are only dealing with
+ * MPX instructions.
+ */
+ insn_get_modrm(&insn);
+ bndregno = X86_MODRM_REG(insn.modrm.value);
+ if (bndregno > 3) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* get bndregs field from current task's xsave area */
+ bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
+ if (!bndregs) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* now go select the individual register in the set of 4 */
+ bndreg = &bndregs->bndreg[bndregno];
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /*
+ * The registers are always 64-bit, but the upper 32
+ * bits are ignored in 32-bit mode. Also, note that the
+ * upper bounds are architecturally represented in 1's
+ * complement form.
+ *
+ * The 'unsigned long' cast is because the compiler
+ * complains when casting from integers to different-size
+ * pointers.
+ */
+ info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
+ info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+ info->si_addr_lsb = 0;
+ info->si_signo = SIGSEGV;
+ info->si_errno = 0;
+ info->si_code = SEGV_BNDERR;
+ info->si_addr = mpx_get_addr_ref(&insn, regs);
+ /*
+ * We were not able to extract an address from the instruction,
+ * probably because there was something invalid in it.
+ */
+ if (info->si_addr == (void __user *)-1) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ trace_mpx_bounds_register_exception(info->si_addr, bndreg);
+ return info;
+err_out:
+ /* info might be NULL, but kfree() handles that */
+ kfree(info);
+ return ERR_PTR(err);
+}
+
+static __user void *mpx_get_bounds_dir(void)
+{
+ const struct mpx_bndcsr *bndcsr;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * The bounds directory pointer is stored in a register
+ * only accessible if we first do an xsave.
+ */
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Make sure the register looks valid by checking the
+ * enable bit.
+ */
+ if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Lastly, mask off the low bits used for configuration
+ * flags, and return the address of the bounds table.
+ */
+ return (void __user *)(unsigned long)
+ (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
+}
+
+int mpx_enable_management(void)
+{
+ void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
+ struct mm_struct *mm = current->mm;
+ int ret = 0;
+
+ /*
+ * runtime in the userspace will be responsible for allocation of
+ * the bounds directory. Then, it will save the base of the bounds
+ * directory into XSAVE/XRSTOR Save Area and enable MPX through
+ * XRSTOR instruction.
+ *
+ * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
+ * expected to be relatively expensive. Storing the bounds
+ * directory here means that we do not have to do xsave in the
+ * unmap path; we can just use mm->bd_addr instead.
+ */
+ bd_base = mpx_get_bounds_dir();
+ down_write(&mm->mmap_sem);
+ mm->bd_addr = bd_base;
+ if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
+ ret = -ENXIO;
+
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+int mpx_disable_management(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return -ENXIO;
+
+ down_write(&mm->mmap_sem);
+ mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
+static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
+ unsigned long *curval,
+ unsigned long __user *addr,
+ unsigned long old_val, unsigned long new_val)
+{
+ int ret;
+ /*
+ * user_atomic_cmpxchg_inatomic() actually uses sizeof()
+ * the pointer that we pass to it to figure out how much
+ * data to cmpxchg. We have to be careful here not to
+ * pass a pointer to a 64-bit data type when we only want
+ * a 32-bit copy.
+ */
+ if (is_64bit_mm(mm)) {
+ ret = user_atomic_cmpxchg_inatomic(curval,
+ addr, old_val, new_val);
+ } else {
+ u32 uninitialized_var(curval_32);
+ u32 old_val_32 = old_val;
+ u32 new_val_32 = new_val;
+ u32 __user *addr_32 = (u32 __user *)addr;
+
+ ret = user_atomic_cmpxchg_inatomic(&curval_32,
+ addr_32, old_val_32, new_val_32);
+ *curval = curval_32;
+ }
+ return ret;
+}
+
+/*
+ * With 32-bit mode, a bounds directory is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
+ * and the size of each bounds table is 4MB.
+ */
+static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
+{
+ unsigned long expected_old_val = 0;
+ unsigned long actual_old_val = 0;
+ unsigned long bt_addr;
+ unsigned long bd_new_entry;
+ int ret = 0;
+
+ /*
+ * Carve the virtual space out of userspace for the new
+ * bounds table:
+ */
+ bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
+ if (IS_ERR((void *)bt_addr))
+ return PTR_ERR((void *)bt_addr);
+ /*
+ * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
+ */
+ bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+
+ /*
+ * Go poke the address of the new bounds table in to the
+ * bounds directory entry out in userspace memory. Note:
+ * we may race with another CPU instantiating the same table.
+ * In that case the cmpxchg will see an unexpected
+ * 'actual_old_val'.
+ *
+ * This can fault, but that's OK because we do not hold
+ * mmap_sem at this point, unlike some of the other part
+ * of the MPX code that have to pagefault_disable().
+ */
+ ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
+ expected_old_val, bd_new_entry);
+ if (ret)
+ goto out_unmap;
+
+ /*
+ * The user_atomic_cmpxchg_inatomic() will only return nonzero
+ * for faults, *not* if the cmpxchg itself fails. Now we must
+ * verify that the cmpxchg itself completed successfully.
+ */
+ /*
+ * We expected an empty 'expected_old_val', but instead found
+ * an apparently valid entry. Assume we raced with another
+ * thread to instantiate this table and desclare succecss.
+ */
+ if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
+ ret = 0;
+ goto out_unmap;
+ }
+ /*
+ * We found a non-empty bd_entry but it did not have the
+ * VALID_FLAG set. Return an error which will result in
+ * a SEGV since this probably means that somebody scribbled
+ * some invalid data in to a bounds table.
+ */
+ if (expected_old_val != actual_old_val) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+ trace_mpx_new_bounds_table(bt_addr);
+ return 0;
+out_unmap:
+ vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
+ return ret;
+}
+
+/*
+ * When a BNDSTX instruction attempts to save bounds to a bounds
+ * table, it will first attempt to look up the table in the
+ * first-level bounds directory. If it does not find a table in
+ * the directory, a #BR is generated and we get here in order to
+ * allocate a new table.
+ *
+ * With 32-bit mode, the size of BD is 4MB, and the size of each
+ * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
+ * and the size of each bound table is 4MB.
+ */
+static int do_mpx_bt_fault(void)
+{
+ unsigned long bd_entry, bd_base;
+ const struct mpx_bndcsr *bndcsr;
+ struct mm_struct *mm = current->mm;
+
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ return -EINVAL;
+ /*
+ * Mask off the preserve and enable bits
+ */
+ bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
+ /*
+ * The hardware provides the address of the missing or invalid
+ * entry via BNDSTATUS, so we don't have to go look it up.
+ */
+ bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
+ /*
+ * Make sure the directory entry is within where we think
+ * the directory is.
+ */
+ if ((bd_entry < bd_base) ||
+ (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
+ return -EINVAL;
+
+ return allocate_bt(mm, (long __user *)bd_entry);
+}
+
+int mpx_handle_bd_fault(void)
+{
+ /*
+ * Userspace never asked us to manage the bounds tables,
+ * so refuse to help.
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return -EINVAL;
+
+ return do_mpx_bt_fault();
+}
+
+/*
+ * A thin wrapper around get_user_pages(). Returns 0 if the
+ * fault was resolved or -errno if not.
+ */
+static int mpx_resolve_fault(long __user *addr, int write)
+{
+ long gup_ret;
+ int nr_pages = 1;
+ int force = 0;
+
+ gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
+ nr_pages, write, force, NULL, NULL);
+ /*
+ * get_user_pages() returns number of pages gotten.
+ * 0 means we failed to fault in and get anything,
+ * probably because 'addr' is bad.
+ */
+ if (!gup_ret)
+ return -EFAULT;
+ /* Other error, return it */
+ if (gup_ret < 0)
+ return gup_ret;
+ /* must have gup'd a page and gup_ret>0, success */
+ return 0;
+}
+
+static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
+ unsigned long bd_entry)
+{
+ unsigned long bt_addr = bd_entry;
+ int align_to_bytes;
+ /*
+ * Bit 0 in a bt_entry is always the valid bit.
+ */
+ bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
+ /*
+ * Tables are naturally aligned at 8-byte boundaries
+ * on 64-bit and 4-byte boundaries on 32-bit. The
+ * documentation makes it appear that the low bits
+ * are ignored by the hardware, so we do the same.
+ */
+ if (is_64bit_mm(mm))
+ align_to_bytes = 8;
+ else
+ align_to_bytes = 4;
+ bt_addr &= ~(align_to_bytes-1);
+ return bt_addr;
+}
+
+/*
+ * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
+ * we might run off the end of the bounds table if we are on
+ * a 64-bit kernel and try to get 8 bytes.
+ */
+int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
+ long __user *bd_entry_ptr)
+{
+ u32 bd_entry_32;
+ int ret;
+
+ if (is_64bit_mm(mm))
+ return get_user(*bd_entry_ret, bd_entry_ptr);
+
+ /*
+ * Note that get_user() uses the type of the *pointer* to
+ * establish the size of the get, not the destination.
+ */
+ ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
+ *bd_entry_ret = bd_entry_32;
+ return ret;
+}
+
+/*
+ * Get the base of bounds tables pointed by specific bounds
+ * directory entry.
+ */
+static int get_bt_addr(struct mm_struct *mm,
+ long __user *bd_entry_ptr,
+ unsigned long *bt_addr_result)
+{
+ int ret;
+ int valid_bit;
+ unsigned long bd_entry;
+ unsigned long bt_addr;
+
+ if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
+ return -EFAULT;
+
+ while (1) {
+ int need_write = 0;
+
+ pagefault_disable();
+ ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry_ptr, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+
+ valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
+ bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
+
+ /*
+ * When the kernel is managing bounds tables, a bounds directory
+ * entry will either have a valid address (plus the valid bit)
+ * *OR* be completely empty. If we see a !valid entry *and* some
+ * data in the address field, we know something is wrong. This
+ * -EINVAL return will cause a SIGSEGV.
+ */
+ if (!valid_bit && bt_addr)
+ return -EINVAL;
+ /*
+ * Do we have an completely zeroed bt entry? That is OK. It
+ * just means there was no bounds table for this memory. Make
+ * sure to distinguish this from -EINVAL, which will cause
+ * a SEGV.
+ */
+ if (!valid_bit)
+ return -ENOENT;
+
+ *bt_addr_result = bt_addr;
+ return 0;
+}
+
+static inline int bt_entry_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BT_ENTRY_BYTES_64;
+ else
+ return MPX_BT_ENTRY_BYTES_32;
+}
+
+/*
+ * Take a virtual address and turns it in to the offset in bytes
+ * inside of the bounds table where the bounds table entry
+ * controlling 'addr' can be found.
+ */
+static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
+ unsigned long addr)
+{
+ unsigned long bt_table_nr_entries;
+ unsigned long offset = addr;
+
+ if (is_64bit_mm(mm)) {
+ /* Bottom 3 bits are ignored on 64-bit */
+ offset >>= 3;
+ bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
+ } else {
+ /* Bottom 2 bits are ignored on 32-bit */
+ offset >>= 2;
+ bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
+ }
+ /*
+ * We know the size of the table in to which we are
+ * indexing, and we have eliminated all the low bits
+ * which are ignored for indexing.
+ *
+ * Mask out all the high bits which we do not need
+ * to index in to the table. Note that the tables
+ * are always powers of two so this gives us a proper
+ * mask.
+ */
+ offset &= (bt_table_nr_entries-1);
+ /*
+ * We now have an entry offset in terms of *entries* in
+ * the table. We need to scale it back up to bytes.
+ */
+ offset *= bt_entry_size_bytes(mm);
+ return offset;
+}
+
+/*
+ * How much virtual address space does a single bounds
+ * directory entry cover?
+ *
+ * Note, we need a long long because 4GB doesn't fit in
+ * to a long on 32-bit.
+ */
+static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
+{
+ unsigned long long virt_space;
+ unsigned long long GB = (1ULL << 30);
+
+ /*
+ * This covers 32-bit emulation as well as 32-bit kernels
+ * running on 64-bit harware.
+ */
+ if (!is_64bit_mm(mm))
+ return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
+
+ /*
+ * 'x86_virt_bits' returns what the hardware is capable
+ * of, and returns the full >32-bit adddress space when
+ * running 32-bit kernels on 64-bit hardware.
+ */
+ virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
+ return virt_space / MPX_BD_NR_ENTRIES_64;
+}
+
+/*
+ * Free the backing physical pages of bounds table 'bt_addr'.
+ * Assume start...end is within that bounds table.
+ */
+static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
+ unsigned long bt_addr,
+ unsigned long start_mapping, unsigned long end_mapping)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, len;
+ unsigned long start;
+ unsigned long end;
+
+ /*
+ * if we 'end' on a boundary, the offset will be 0 which
+ * is not what we want. Back it up a byte to get the
+ * last bt entry. Then once we have the entry itself,
+ * move 'end' back up by the table entry size.
+ */
+ start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
+ end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
+ /*
+ * Move end back up by one entry. Among other things
+ * this ensures that it remains page-aligned and does
+ * not screw up zap_page_range()
+ */
+ end += bt_entry_size_bytes(mm);
+
+ /*
+ * Find the first overlapping vma. If vma->vm_start > start, there
+ * will be a hole in the bounds table. This -EINVAL return will
+ * cause a SIGSEGV.
+ */
+ vma = find_vma(mm, start);
+ if (!vma || vma->vm_start > start)
+ return -EINVAL;
+
+ /*
+ * A NUMA policy on a VM_MPX VMA could cause this bounds table to
+ * be split. So we need to look across the entire 'start -> end'
+ * range of this bounds table, find all of the VM_MPX VMAs, and
+ * zap only those.
+ */
+ addr = start;
+ while (vma && vma->vm_start < end) {
+ /*
+ * We followed a bounds directory entry down
+ * here. If we find a non-MPX VMA, that's bad,
+ * so stop immediately and return an error. This
+ * probably results in a SIGSEGV.
+ */
+ if (!(vma->vm_flags & VM_MPX))
+ return -EINVAL;
+
+ len = min(vma->vm_end, end) - addr;
+ zap_page_range(vma, addr, len, NULL);
+ trace_mpx_unmap_zap(addr, addr+len);
+
+ vma = vma->vm_next;
+ addr = vma->vm_start;
+ }
+ return 0;
+}
+
+static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
+ unsigned long addr)
+{
+ /*
+ * There are several ways to derive the bd offsets. We
+ * use the following approach here:
+ * 1. We know the size of the virtual address space
+ * 2. We know the number of entries in a bounds table
+ * 3. We know that each entry covers a fixed amount of
+ * virtual address space.
+ * So, we can just divide the virtual address by the
+ * virtual space used by one entry to determine which
+ * entry "controls" the given virtual address.
+ */
+ if (is_64bit_mm(mm)) {
+ int bd_entry_size = 8; /* 64-bit pointer */
+ /*
+ * Take the 64-bit addressing hole in to account.
+ */
+ addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
+ return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+ } else {
+ int bd_entry_size = 4; /* 32-bit pointer */
+ /*
+ * 32-bit has no hole so this case needs no mask
+ */
+ return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+ }
+ /*
+ * The two return calls above are exact copies. If we
+ * pull out a single copy and put it in here, gcc won't
+ * realize that we're doing a power-of-2 divide and use
+ * shifts. It uses a real divide. If we put them up
+ * there, it manages to figure it out (gcc 4.8.3).
+ */
+}
+
+static int unmap_entire_bt(struct mm_struct *mm,
+ long __user *bd_entry, unsigned long bt_addr)
+{
+ unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+ unsigned long uninitialized_var(actual_old_val);
+ int ret;
+
+ while (1) {
+ int need_write = 1;
+ unsigned long cleared_bd_entry = 0;
+
+ pagefault_disable();
+ ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
+ bd_entry, expected_old_val, cleared_bd_entry);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+ /*
+ * The cmpxchg was performed, check the results.
+ */
+ if (actual_old_val != expected_old_val) {
+ /*
+ * Someone else raced with us to unmap the table.
+ * That is OK, since we were both trying to do
+ * the same thing. Declare success.
+ */
+ if (!actual_old_val)
+ return 0;
+ /*
+ * Something messed with the bounds directory
+ * entry. We hold mmap_sem for read or write
+ * here, so it could not be a _new_ bounds table
+ * that someone just allocated. Something is
+ * wrong, so pass up the error and SIGSEGV.
+ */
+ return -EINVAL;
+ }
+ /*
+ * Note, we are likely being called under do_munmap() already. To
+ * avoid recursion, do_munmap() will check whether it comes
+ * from one bounds table through VM_MPX flag.
+ */
+ return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
+}
+
+static int try_unmap_single_bt(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *next;
+ struct vm_area_struct *prev;
+ /*
+ * "bta" == Bounds Table Area: the area controlled by the
+ * bounds table that we are unmapping.
+ */
+ unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
+ unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
+ unsigned long uninitialized_var(bt_addr);
+ void __user *bde_vaddr;
+ int ret;
+ /*
+ * We already unlinked the VMAs from the mm's rbtree so 'start'
+ * is guaranteed to be in a hole. This gets us the first VMA
+ * before the hole in to 'prev' and the next VMA after the hole
+ * in to 'next'.
+ */
+ next = find_vma_prev(mm, start, &prev);
+ /*
+ * Do not count other MPX bounds table VMAs as neighbors.
+ * Although theoretically possible, we do not allow bounds
+ * tables for bounds tables so our heads do not explode.
+ * If we count them as neighbors here, we may end up with
+ * lots of tables even though we have no actual table
+ * entries in use.
+ */
+ while (next && (next->vm_flags & VM_MPX))
+ next = next->vm_next;
+ while (prev && (prev->vm_flags & VM_MPX))
+ prev = prev->vm_prev;
+ /*
+ * We know 'start' and 'end' lie within an area controlled
+ * by a single bounds table. See if there are any other
+ * VMAs controlled by that bounds table. If there are not
+ * then we can "expand" the are we are unmapping to possibly
+ * cover the entire table.
+ */
+ next = find_vma_prev(mm, start, &prev);
+ if ((!prev || prev->vm_end <= bta_start_vaddr) &&
+ (!next || next->vm_start >= bta_end_vaddr)) {
+ /*
+ * No neighbor VMAs controlled by same bounds
+ * table. Try to unmap the whole thing
+ */
+ start = bta_start_vaddr;
+ end = bta_end_vaddr;
+ }
+
+ bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
+ ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
+ /*
+ * No bounds table there, so nothing to unmap.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ return 0;
+ }
+ if (ret)
+ return ret;
+ /*
+ * We are unmapping an entire table. Either because the
+ * unmap that started this whole process was large enough
+ * to cover an entire table, or that the unmap was small
+ * but was the area covered by a bounds table.
+ */
+ if ((start == bta_start_vaddr) &&
+ (end == bta_end_vaddr))
+ return unmap_entire_bt(mm, bde_vaddr, bt_addr);
+ return zap_bt_entries_mapping(mm, bt_addr, start, end);
+}
+
+static int mpx_unmap_tables(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long one_unmap_start;
+ trace_mpx_unmap_search(start, end);
+
+ one_unmap_start = start;
+ while (one_unmap_start < end) {
+ int ret;
+ unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
+ bd_entry_virt_space(mm));
+ unsigned long one_unmap_end = end;
+ /*
+ * if the end is beyond the current bounds table,
+ * move it back so we only deal with a single one
+ * at a time
+ */
+ if (one_unmap_end > next_unmap_start)
+ one_unmap_end = next_unmap_start;
+ ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
+ if (ret)
+ return ret;
+
+ one_unmap_start = next_unmap_start;
+ }
+ return 0;
+}
+
+/*
+ * Free unused bounds tables covered in a virtual address region being
+ * munmap()ed. Assume end > start.
+ *
+ * This function will be called by do_munmap(), and the VMAs covering
+ * the virtual address region start...end have already been split if
+ * necessary, and the 'vma' is the first vma in this range (start -> end).
+ */
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * Refuse to do anything unless userspace has asked
+ * the kernel to help manage the bounds tables,
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return;
+ /*
+ * This will look across the entire 'start -> end' range,
+ * and find all of the non-VM_MPX VMAs.
+ *
+ * To avoid recursion, if a VM_MPX vma is found in the range
+ * (start->end), we will not continue follow-up work. This
+ * recursion represents having bounds tables for bounds tables,
+ * which should not occur normally. Being strict about it here
+ * helps ensure that we do not have an exploitable stack overflow.
+ */
+ do {
+ if (vma->vm_flags & VM_MPX)
+ return;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+
+ ret = mpx_unmap_tables(mm, start, end);
+ if (ret)
+ force_sig(SIGSEGV, current);
+}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
new file mode 100644
index 0000000..c3b3f65
--- /dev/null
+++ b/arch/x86/mm/numa.c
@@ -0,0 +1,884 @@
+/* Common code for 32 and 64-bit NUMA */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/acpi.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
+
+int __initdata numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
+
+/*
+ * apicid, cpu, node mappings
+ */
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int numa_cpu_node(int cpu)
+{
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+ if (apicid != BAD_APICID)
+ return __apicid_to_node[apicid];
+ return NUMA_NO_NODE;
+}
+
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void numa_set_node(int cpu, int node)
+{
+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
+ cpu_to_node_map[cpu] = node;
+ return;
+ }
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+ set_cpu_numa_node(cpu, node);
+}
+
+void numa_clear_node(int cpu)
+{
+ numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+ unsigned int node;
+
+ /* setup nr_node_ids if not done yet */
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
+
+ /* allocate the map */
+ for (node = 0; node < nr_node_ids; node++)
+ alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+ /* cpumask_of_node() will now work */
+ pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /*
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
+ */
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+ MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
+ }
+ }
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+ node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = 0;
+ const u64 high = PFN_PHYS(max_pfn);
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* make sure all blocks are inside the limits */
+ bi->start = max(bi->start, low);
+ bi->end = min(bi->end, high);
+
+ /* and there's no empty or non-exist block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->nid, bj->start, bj->end - 1);
+ return -EINVAL;
+ }
+ pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1, bj->start,
+ bj->end - 1, start, end - 1);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_free(__pa(numa_distance), size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+ u64 phys;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ size, PAGE_SIZE);
+ if (!phys) {
+ pr_warning("NUMA: Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+ memblock_reserve(phys, size);
+
+ numa_distance = __va(phys);
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+ from < 0 || to < 0) {
+ pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common). Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+ u64 numaram, e820ram;
+ int i;
+
+ numaram = 0;
+ for (i = 0; i < mi->nr_blks; i++) {
+ u64 s = mi->blk[i].start >> PAGE_SHIFT;
+ u64 e = mi->blk[i].end >> PAGE_SHIFT;
+ numaram += e - s;
+ numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+ if ((s64)numaram < 0)
+ numaram = 0;
+ }
+
+ e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+ printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+ (numaram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return false;
+ }
+ return true;
+}
+
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ int i, nid;
+ nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+ unsigned long start, end;
+ struct memblock_region *r;
+
+ /*
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel. Set the nid in memblock.reserved will
+ * mark out all the nodes the kernel resides in.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = &numa_meminfo.blk[i];
+
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ }
+
+ /*
+ * Mark all kernel nodes.
+ *
+ * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
+ * may not include all the memblock.reserved memory ranges because
+ * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+ */
+ for_each_memblock(reserved, r)
+ if (r->nid != MAX_NUMNODES)
+ node_set(r->nid, numa_kernel_nodes);
+
+ /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ nid = numa_meminfo.blk[i].nid;
+ if (!node_isset(nid, numa_kernel_nodes))
+ continue;
+
+ start = numa_meminfo.blk[i].start;
+ end = numa_meminfo.blk[i].end;
+
+ memblock_clear_hotplug(start, end - start);
+ }
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+ unsigned long uninitialized_var(pfn_align);
+ int i, nid;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *mb = &mi->blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
+ }
+
+ /*
+ * At very early time, the kernel have to use some memory such as
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, alloc node data won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ /*
+ * If sections array is gonna be used for pfn -> nid mapping, check
+ * whether its granularity is fine enough.
+ */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
+#endif
+ if (!numa_meminfo_cover_memory(mi))
+ return -EINVAL;
+
+ /* Finally register nodes. */
+ for_each_node_mask(nid, node_possible_map) {
+ u64 start = PFN_PHYS(max_pfn);
+ u64 end = 0;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ if (nid != mi->blk[i].nid)
+ continue;
+ start = min(mi->blk[i].start, start);
+ end = max(mi->blk[i].end, end);
+ }
+
+ if (start >= end)
+ continue;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ continue;
+
+ alloc_node_data(nid);
+ }
+
+ /* Dump memblock with node info and return. */
+ memblock_dump_all();
+ return 0;
+}
+
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+static void __init numa_init_array(void)
+{
+ int rr, i;
+
+ rr = first_node(node_online_map);
+ for (i = 0; i < nr_cpu_ids; i++) {
+ if (early_cpu_to_node(i) != NUMA_NO_NODE)
+ continue;
+ numa_set_node(i, rr);
+ rr = next_node(rr, node_online_map);
+ if (rr == MAX_NUMNODES)
+ rr = first_node(node_online_map);
+ }
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ set_apicid_to_node(i, NUMA_NO_NODE);
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+ * parsed SRAT in init_func(). It is ok to have the
+ * reset here even if we did't configure ACPI_NUMA
+ * or acpi numa init fails and fallbacks to dummy
+ * numa init.
+ */
+ memblock_set_bottom_up(false);
+
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ ret = numa_register_memblks(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ int nid = early_cpu_to_node(i);
+
+ if (nid == NUMA_NO_NODE)
+ continue;
+ if (!node_online(nid))
+ numa_clear_node(i);
+ }
+ numa_init_array();
+
+ return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+ printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+ 0LLU, PFN_PHYS(max_pfn) - 1);
+
+ node_set(0, numa_nodes_parsed);
+ numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+ return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+ if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_init(x86_acpi_numa_init))
+ return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_init(amd_numa_init))
+ return;
+#endif
+ }
+
+ numa_init(dummy_numa_init);
+}
+
+static __init int find_near_online_node(int node)
+{
+ int n, val;
+ int min_val = INT_MAX;
+ int best_node = -1;
+
+ for_each_online_node(n) {
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+ int cpu;
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+ BUG_ON(cpu_to_apicid == NULL);
+
+ for_each_possible_cpu(cpu) {
+ int node = numa_cpu_node(cpu);
+
+ if (node == NUMA_NO_NODE)
+ continue;
+ if (!node_online(node))
+ node = find_near_online_node(node);
+ numa_set_node(cpu, node);
+ }
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void numa_add_cpu(int cpu)
+{
+ cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+ printk(KERN_WARNING
+ "cpu_to_node(%d): usage too early!\n", cpu);
+ dump_stack();
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ if (!cpu_possible(cpu)) {
+ printk(KERN_WARNING
+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+ dump_stack();
+ return NUMA_NO_NODE;
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+{
+ struct cpumask *mask;
+
+ if (node == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return;
+ }
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, cpumask_pr_args(mask));
+ return;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const struct cpumask *cpumask_of_node(int node)
+{
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return cpu_none_mask;
+ }
+ if (node_to_cpumask_map[node] == NULL) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): no node_to_cpumask_map!\n",
+ node);
+ dump_stack();
+ return cpu_online_mask;
+ }
+ return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+ struct numa_meminfo *mi = &numa_meminfo;
+ int nid = mi->blk[0].nid;
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ nid = mi->blk[i].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
new file mode 100644
index 0000000..3686a1d
--- /dev/null
+++ b/arch/x86/mm/numa_32.c
@@ -0,0 +1,105 @@
+/*
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+
+#include "numa_internal.h"
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * 4) physnode_map - the mapping between a pfn and owning node
+ * physnode_map keeps track of the physical memory layout of a generic
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id. so,
+ * if the first gig is on node 0, and the second gig is on node 1
+ * physnode_map will contain:
+ *
+ * physnode_map[0-15] = 0;
+ * physnode_map[16-31] = 1;
+ * physnode_map[32- ] = -1;
+ */
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
+EXPORT_SYMBOL(physnode_map);
+
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+ printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
+ nid, start, end);
+ printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
+ printk(KERN_DEBUG " ");
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
+ printk(KERN_CONT "%lx ", pfn);
+ }
+ printk(KERN_CONT "\n");
+}
+
+unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long nr_pages = end_pfn - start_pfn;
+
+ if (!nr_pages)
+ return 0;
+
+ return (nr_pages + 1) * sizeof(struct page);
+}
+#endif
+
+extern unsigned long highend_pfn, highstart_pfn;
+
+void __init initmem_init(void)
+{
+ x86_numa_init();
+
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+ printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+ max_low_pfn, highstart_pfn);
+
+ printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(max_low_pfn));
+
+ printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(highstart_pfn));
+
+ __vmalloc_start_set = true;
+ setup_bootmem_allocator();
+}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 0000000..9405ffc
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,12 @@
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/bootmem.h>
+
+#include "numa_internal.h"
+
+void __init initmem_init(void)
+{
+ x86_numa_init();
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 0000000..a8f90ce
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,502 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES];
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+ emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].nid == nid)
+ return i;
+ return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (start_pfn < end_pfn)
+ return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+ return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end. The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ int nid, int phys_blk, u64 size)
+{
+ struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+ struct numa_memblk *pb = &pi->blk[phys_blk];
+
+ if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+ return -EINVAL;
+ }
+
+ ei->nr_blks++;
+ eb->start = pb->start;
+ eb->end = pb->start + size;
+ eb->nid = nid;
+
+ if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+ emu_nid_to_phys[nid] = nid;
+
+ pb->start += size;
+ if (pb->start >= pb->end) {
+ WARN_ON_ONCE(pb->start > pb->end);
+ numa_remove_memblk_from(phys_blk, pi);
+ }
+
+ printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, int nr_nodes)
+{
+ nodemask_t physnode_mask = NODE_MASK_NONE;
+ u64 size;
+ int big;
+ int nid = 0;
+ int i, ret;
+
+ if (nr_nodes <= 0)
+ return -1;
+ if (nr_nodes > MAX_NUMNODES) {
+ pr_info("numa=fake=%d too large, reducing to %d\n",
+ nr_nodes, MAX_NUMNODES);
+ nr_nodes = MAX_NUMNODES;
+ }
+
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - mem_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+ /*
+ * Calculate the number of big nodes that can be allocated as a result
+ * of consolidating the remainder.
+ */
+ big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+ FAKE_NODE_MIN_SIZE;
+
+ size &= FAKE_NODE_MIN_HASH_MASK;
+ if (!size) {
+ pr_err("Not enough memory for each node. "
+ "NUMA emulation disabled.\n");
+ return -1;
+ }
+
+ for (i = 0; i < pi->nr_blks; i++)
+ node_set(pi->blk[i].nid, physnode_mask);
+
+ /*
+ * Continue to fill physical nodes with fake nodes until there is no
+ * memory left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+ end = start + size;
+
+ if (nid < big)
+ end += FAKE_NODE_MIN_SIZE;
+
+ /*
+ * Continue to add memory to this fake node if its
+ * non-reserved memory is less than the per-node size.
+ */
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > limit) {
+ end = limit;
+ break;
+ }
+ }
+
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+ u64 end = start + size;
+
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > max_addr) {
+ end = max_addr;
+ break;
+ }
+ }
+ return end;
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ nodemask_t physnode_mask = NODE_MASK_NONE;
+ u64 min_size;
+ int nid = 0;
+ int i, ret;
+
+ if (!size)
+ return -1;
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+ * increased accordingly if the requested size is too small. This
+ * creates a uniform distribution of node sizes across the entire
+ * machine (but not necessarily over physical nodes).
+ */
+ min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
+ min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+ if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+ min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+ FAKE_NODE_MIN_HASH_MASK;
+ if (size < min_size) {
+ pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+ size >> 20, min_size >> 20);
+ size = min_size;
+ }
+ size &= FAKE_NODE_MIN_HASH_MASK;
+
+ for (i = 0; i < pi->nr_blks; i++)
+ node_set(pi->blk[i].nid, physnode_mask);
+
+ /*
+ * Fill physical nodes with fake nodes of size until there is no memory
+ * left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+
+ end = find_end_of_node(start, limit, size);
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success. @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ * emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ * nodes. The distances are determined considering how emulated nodes
+ * are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+ static struct numa_meminfo ei __initdata;
+ static struct numa_meminfo pi __initdata;
+ const u64 max_addr = PFN_PHYS(max_pfn);
+ u8 *phys_dist = NULL;
+ size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+ int max_emu_nid, dfl_phys_nid;
+ int i, j, ret;
+
+ if (!emu_cmdline)
+ goto no_emu;
+
+ memset(&ei, 0, sizeof(ei));
+ pi = *numa_meminfo;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+ /*
+ * If the numa=fake command-line contains a 'M' or 'G', it represents
+ * the fixed node size. Otherwise, if it is just a single number N,
+ * split the system RAM into N fake nodes.
+ */
+ if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ u64 size;
+
+ size = memparse(emu_cmdline, &emu_cmdline);
+ ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+ } else {
+ unsigned long n;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+ }
+ if (*emu_cmdline == ':')
+ emu_cmdline++;
+
+ if (ret < 0)
+ goto no_emu;
+
+ if (numa_cleanup_meminfo(&ei) < 0) {
+ pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* copy the physical distance table */
+ if (numa_dist_cnt) {
+ u64 phys;
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ phys_size, PAGE_SIZE);
+ if (!phys) {
+ pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+ goto no_emu;
+ }
+ memblock_reserve(phys, phys_size);
+ phys_dist = __va(phys);
+
+ for (i = 0; i < numa_dist_cnt; i++)
+ for (j = 0; j < numa_dist_cnt; j++)
+ phys_dist[i * numa_dist_cnt + j] =
+ node_distance(i, j);
+ }
+
+ /*
+ * Determine the max emulated nid and the default phys nid to use
+ * for unmapped nodes.
+ */
+ max_emu_nid = 0;
+ dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (dfl_phys_nid == NUMA_NO_NODE)
+ dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+ if (dfl_phys_nid == NUMA_NO_NODE) {
+ pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ /*
+ * Transform __apicid_to_node table to use emulated nids by
+ * reverse-mapping phys_nid. The maps should always exist but fall
+ * back to zero just in case.
+ */
+ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+ if (__apicid_to_node[i] == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+ if (__apicid_to_node[i] == emu_nid_to_phys[j])
+ break;
+ __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+ }
+
+ /* make sure all emulated nodes are mapped to a physical node */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+ emu_nid_to_phys[i] = dfl_phys_nid;
+
+ /* transform distance table */
+ numa_reset_distance();
+ for (i = 0; i < max_emu_nid + 1; i++) {
+ for (j = 0; j < max_emu_nid + 1; j++) {
+ int physi = emu_nid_to_phys[i];
+ int physj = emu_nid_to_phys[j];
+ int dist;
+
+ if (get_option(&emu_cmdline, &dist) == 2)
+ ;
+ else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+ dist = physi == physj ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ else
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+
+ numa_set_distance(i, j, dist);
+ }
+ }
+
+ /* free the copied physical distance table */
+ if (phys_dist)
+ memblock_free(__pa(phys_dist), phys_size);
+ return;
+
+no_emu:
+ /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void numa_add_cpu(int cpu)
+{
+ int physnid, nid;
+
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ physnid = emu_nid_to_phys[nid];
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid)
+ if (emu_nid_to_phys[nid] == physnid)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ int nid, physnid;
+
+ nid = early_cpu_to_node(cpu);
+ if (nid == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+
+ physnid = emu_nid_to_phys[nid];
+
+ for_each_online_node(nid) {
+ if (emu_nid_to_phys[nid] != physnid)
+ continue;
+
+ debug_cpumask_set_cpu(cpu, nid, enable);
+ }
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 0000000..ad86ec9
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,33 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+ u64 start;
+ u64 end;
+ int nid;
+};
+
+struct numa_meminfo {
+ int nr_blks;
+ struct numa_memblk blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt)
+{ }
+#endif
+
+#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 0000000..5f169d5
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,260 @@
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the a test pte bit on random pages in the direct mapping,
+ * then reverts and compares page tables forwards and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+/*
+ * Only print the results of the first pass:
+ */
+static __read_mostly int print = 1;
+
+enum {
+ NTEST = 400,
+#ifdef CONFIG_X86_64
+ LPS = (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+ LPS = (1 << PMD_SHIFT),
+#else
+ LPS = (1 << 22),
+#endif
+ GPS = (1<<30)
+};
+
+#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
+
+static int pte_testbit(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFTW1;
+}
+
+struct split_state {
+ long lpg, gpg, spg, exec;
+ long min_exec, max_exec;
+};
+
+static int print_split(struct split_state *s)
+{
+ long i, expected, missed = 0;
+ int err = 0;
+
+ s->lpg = s->gpg = s->spg = s->exec = 0;
+ s->min_exec = ~0UL;
+ s->max_exec = 0;
+ for (i = 0; i < max_pfn_mapped; ) {
+ unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ if (!pte) {
+ missed++;
+ i++;
+ continue;
+ }
+
+ if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+ s->gpg++;
+ i += GPS/PAGE_SIZE;
+ } else if (level == PG_LEVEL_2M) {
+ if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
+ printk(KERN_ERR
+ "%lx level %d but not PSE %Lx\n",
+ addr, level, (u64)pte_val(*pte));
+ err = 1;
+ }
+ s->lpg++;
+ i += LPS/PAGE_SIZE;
+ } else {
+ s->spg++;
+ i++;
+ }
+ if (!(pte_val(*pte) & _PAGE_NX)) {
+ s->exec++;
+ if (addr < s->min_exec)
+ s->min_exec = addr;
+ if (addr > s->max_exec)
+ s->max_exec = addr;
+ }
+ }
+ if (print) {
+ printk(KERN_INFO
+ " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+ s->spg, s->lpg, s->gpg, s->exec,
+ s->min_exec != ~0UL ? s->min_exec : 0,
+ s->max_exec, missed);
+ }
+
+ expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+ if (expected != i) {
+ printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+ max_pfn_mapped, expected);
+ return 1;
+ }
+ return err;
+}
+
+static unsigned long addr[NTEST];
+static unsigned int len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static int pageattr_test(void)
+{
+ struct split_state sa, sb, sc;
+ unsigned long *bm;
+ pte_t *pte, pte0;
+ int failed = 0;
+ unsigned int level;
+ int i, k;
+ int err;
+ unsigned long test_addr;
+
+ if (print)
+ printk(KERN_INFO "CPA self-test:\n");
+
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
+ if (!bm) {
+ printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+ return -ENOMEM;
+ }
+
+ failed += print_split(&sa);
+
+ for (i = 0; i < NTEST; i++) {
+ unsigned long pfn = prandom_u32() % max_pfn_mapped;
+
+ addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+ len[i] = prandom_u32() % 100;
+ len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+ if (len[i] == 0)
+ len[i] = 1;
+
+ pte = NULL;
+ pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+ for (k = 0; k < len[i]; k++) {
+ pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+ if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+ !(pte_val(*pte) & _PAGE_PRESENT)) {
+ addr[i] = 0;
+ break;
+ }
+ if (k == 0) {
+ pte0 = *pte;
+ } else {
+ if (pgprot_val(pte_pgprot(*pte)) !=
+ pgprot_val(pte_pgprot(pte0))) {
+ len[i] = k;
+ break;
+ }
+ }
+ if (test_bit(pfn + k, bm)) {
+ len[i] = k;
+ break;
+ }
+ __set_bit(pfn + k, bm);
+ }
+ if (!addr[i] || !pte || !k) {
+ addr[i] = 0;
+ continue;
+ }
+
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ if (err < 0) {
+ printk(KERN_ERR "CPA %d failed %d\n", i, err);
+ failed++;
+ }
+
+ pte = lookup_address(addr[i], &level);
+ if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+ pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+ if (level != PG_LEVEL_4K) {
+ printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+ addr[i], level);
+ failed++;
+ }
+
+ }
+ vfree(bm);
+
+ failed += print_split(&sb);
+
+ for (i = 0; i < NTEST; i++) {
+ if (!addr[i])
+ continue;
+ pte = lookup_address(addr[i], &level);
+ if (!pte) {
+ printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+ failed++;
+ continue;
+ }
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ if (err < 0) {
+ printk(KERN_ERR "CPA reverting failed: %d\n", err);
+ failed++;
+ }
+ pte = lookup_address(addr[i], &level);
+ if (!pte || pte_testbit(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+ addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+
+ }
+
+ failed += print_split(&sc);
+
+ if (failed) {
+ WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
+ return -EINVAL;
+ } else {
+ if (print)
+ printk(KERN_INFO "ok.\n");
+ }
+
+ return 0;
+}
+
+static int do_pageattr_test(void *__unused)
+{
+ while (!kthread_should_stop()) {
+ schedule_timeout_interruptible(HZ*30);
+ if (pageattr_test() < 0)
+ break;
+ if (print)
+ print--;
+ }
+ return 0;
+}
+
+static int start_pageattr_test(void)
+{
+ struct task_struct *p;
+
+ p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
+ if (!IS_ERR(p))
+ wake_up_process(p);
+ else
+ WARN_ON(1);
+
+ return 0;
+}
+device_initcall(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 0000000..b599a78
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,1987 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pfn.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pat.h>
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+ unsigned long *vaddr;
+ pgd_t *pgd;
+ pgprot_t mask_set;
+ pgprot_t mask_clr;
+ unsigned long numpages;
+ int flags;
+ unsigned long pfn;
+ unsigned force_split : 1;
+ int curpage;
+ struct page **pages;
+};
+
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
+
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+ /* Protect against CPA */
+ spin_lock(&pgd_lock);
+ direct_pages_count[level] += pages;
+ spin_unlock(&pgd_lock);
+}
+
+static void split_page_count(int level)
+{
+ direct_pages_count[level]--;
+ direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+ seq_printf(m, "DirectMap4M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+ if (direct_gbpages)
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_1G] << 20);
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+ return __pa_symbol(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr: virtual start address
+ * @size: number of bytes to flush
+ *
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+ unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ void *vend = vaddr + size;
+ void *p;
+
+ mb();
+
+ for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ p < vend; p += boot_cpu_data.x86_clflush_size)
+ clflushopt(p);
+
+ mb();
+}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
+
+static void __cpa_flush_all(void *arg)
+{
+ unsigned long cache = (unsigned long)arg;
+
+ /*
+ * Flush all to work around Errata in early athlons regarding
+ * large page flushing.
+ */
+ __flush_tlb_all();
+
+ if (cache && boot_cpu_data.x86 >= 4)
+ wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+ /*
+ * We could optimize that further and do individual per page
+ * tlb invalidates for a low number of pages. Caveat: we must
+ * flush the high aliases on 64bit as well.
+ */
+ __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long addr;
+
+ BUG_ON(irqs_disabled());
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ on_each_cpu(__cpa_flush_range, NULL, 1);
+
+ if (!cache)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+ pte_t *pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) addr, PAGE_SIZE);
+ }
+}
+
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
+{
+ unsigned int i, level;
+ unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
+
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
+
+ if (!cache || do_wbinvd)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *)addr, PAGE_SIZE);
+ }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+ unsigned long pfn)
+{
+ pgprot_t forbidden = __pgprot(0);
+
+ /*
+ * The BIOS area between 640k and 1Mb needs to be executable for
+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+ */
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+ /*
+ * The kernel text needs to be executable for obvious reasons
+ * Does not cover __inittext since that is gone later on. On
+ * 64bit we do not enforce !NX on the low mapping
+ */
+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
+ /*
+ * The .rodata section needs to be read-only. Using the pfn
+ * catches all aliases.
+ */
+ if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_RW;
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+ /*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large page aligned text, rodata sections
+ * will be always read-only. For the kernel identity mappings covering
+ * the holes caused by this alignment can be anything that user asks.
+ *
+ * This will preserve the large page mappings for kernel text/data
+ * at no extra cost.
+ */
+ if (kernel_set_to_readonly &&
+ within(address, (unsigned long)_text,
+ (unsigned long)__end_rodata_hpage_align)) {
+ unsigned int level;
+
+ /*
+ * Don't enforce the !RW mapping for the kernel text mapping,
+ * if the current mapping is already using small page mapping.
+ * No need to work hard to preserve large page mappings in this
+ * case.
+ *
+ * This also fixes the Linux Xen paravirt guest boot failure
+ * (because of unexpected read-only mappings for kernel identity
+ * mappings). In this paravirt guest case, the kernel text
+ * mapping and the kernel identity mapping share the same
+ * page-table pages. Thus we can't really use different
+ * protections for the kernel text and identity mappings. Also,
+ * these shared mappings are made of small page mappings.
+ * Thus this don't enforce !RW mapping for small page kernel
+ * text mapping logic will help Linux Xen parvirt guest boot
+ * as well.
+ */
+ if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+ pgprot_val(forbidden) |= _PAGE_RW;
+ }
+#endif
+
+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+ return prot;
+}
+
+/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ *level = PG_LEVEL_NONE;
+
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ return NULL;
+
+ *level = PG_LEVEL_1G;
+ if (pud_large(*pud) || !pud_present(*pud))
+ return (pte_t *)pud;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ *level = PG_LEVEL_2M;
+ if (pmd_large(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ *level = PG_LEVEL_4K;
+
+ return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address);
+
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+ unsigned int *level)
+{
+ if (cpa->pgd)
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ address, level);
+
+ return lookup_address(address, level);
+}
+
+/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+
+ /*
+ * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
+ * before being left-shifted PAGE_SHIFT bits -- this trick is to
+ * make 32-PAE kernel work correctly.
+ */
+ switch (level) {
+ case PG_LEVEL_1G:
+ phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PUD_PAGE_MASK;
+ break;
+ case PG_LEVEL_2M:
+ phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PMD_PAGE_MASK;
+ break;
+ default:
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PAGE_MASK;
+ }
+
+ return (phys_addr_t)(phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+ /* change init_mm */
+ set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+ if (!SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pud = pud_offset(pgd, address);
+ pmd = pmd_offset(pud, address);
+ set_pte_atomic((pte_t *)pmd, pte);
+ }
+ }
+#endif
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
+ pte_t new_pte, old_pte, *tmp;
+ pgprot_t old_prot, new_prot, req_prot;
+ int i, do_split = 1;
+ enum pg_level level;
+
+ if (cpa->force_split)
+ return 1;
+
+ spin_lock(&pgd_lock);
+ /*
+ * Check for races, another CPU might have split this page
+ * up already:
+ */
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte)
+ goto out_unlock;
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ old_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ old_pfn = pud_pfn(*(pud_t *)kpte);
+ break;
+ default:
+ do_split = -EINVAL;
+ goto out_unlock;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Calculate the number of pages, which fit into this large
+ * page starting at address:
+ */
+ nextpage_addr = (address + psize) & pmask;
+ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+ if (numpages < cpa->numpages)
+ cpa->numpages = numpages;
+
+ /*
+ * We are safe now. Check whether the new pgprot is the same:
+ * Convert protection attributes to 4k-format, as cpa->mask* are set
+ * up accordingly.
+ */
+ old_pte = *kpte;
+ req_prot = pgprot_large_2_4k(old_prot);
+
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
+
+ /*
+ * req_prot is in format of 4k pages. It must be converted to large
+ * page format: the caching mode includes the PAT bit located at
+ * different bit positions in the two formats.
+ */
+ req_prot = pgprot_4k_2_large(req_prot);
+
+ /*
+ * Set the PSE and GLOBAL flags only if the PRESENT flag is
+ * set otherwise pmd_present/pmd_huge will return true even on
+ * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
+ * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(req_prot) & _PAGE_PRESENT)
+ pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
+ else
+ pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
+
+ req_prot = canon_pgprot(req_prot);
+
+ /*
+ * old_pfn points to the large page base pfn. So we need
+ * to add the offset of the virtual address:
+ */
+ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
+ cpa->pfn = pfn;
+
+ new_prot = static_protections(req_prot, address, pfn);
+
+ /*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ addr = address & pmask;
+ pfn = old_pfn;
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+
+ /*
+ * If there are no changes, return. maxpages has been updated
+ * above:
+ */
+ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+ do_split = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * We need to change the attributes. Check, whether we can
+ * change the large page in one go. We request a split, when
+ * the address is not aligned and the number of pages is
+ * smaller than the number of pages in the large page. Note
+ * that we limited the number of possible pages already to
+ * the number of pages in the large page.
+ */
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
+ /*
+ * The address is aligned and the number of pages
+ * covers the full page.
+ */
+ new_pte = pfn_pte(old_pfn, new_prot);
+ __set_pmd_pte(kpte, address, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ do_split = 0;
+ }
+
+out_unlock:
+ spin_unlock(&pgd_lock);
+
+ return do_split;
+}
+
+static int
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+ struct page *base)
+{
+ pte_t *pbase = (pte_t *)page_address(base);
+ unsigned long ref_pfn, pfn, pfninc = 1;
+ unsigned int i, level;
+ pte_t *tmp;
+ pgprot_t ref_prot;
+
+ spin_lock(&pgd_lock);
+ /*
+ * Check for races, another CPU might have split this page
+ * up for us already:
+ */
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
+
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ ref_prot = pmd_pgprot(*(pmd_t *)kpte);
+ /* clear PSE and promote PAT bit to correct position */
+ ref_prot = pgprot_large_2_4k(ref_prot);
+ ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
+
+ case PG_LEVEL_1G:
+ ref_prot = pud_pgprot(*(pud_t *)kpte);
+ ref_pfn = pud_pfn(*(pud_t *)kpte);
+ pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+
+ /*
+ * Clear the PSE flags if the PRESENT flag is not set
+ * otherwise pmd_present/pmd_huge will return true
+ * even on a non present pmd.
+ */
+ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
+ pgprot_val(ref_prot) &= ~_PAGE_PSE;
+ break;
+
+ default:
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
+
+ /*
+ * Set the GLOBAL flags only if the PRESENT flag is set
+ * otherwise pmd/pte_present will return true even on a non
+ * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
+ * for the ancient hardware that doesn't support it.
+ */
+ if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+ pgprot_val(ref_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
+
+ /*
+ * Get the target pfn from the original entry:
+ */
+ pfn = ref_pfn;
+ for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+ set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
+
+ if (virt_addr_valid(address)) {
+ unsigned long pfn = PFN_DOWN(__pa(address));
+
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ split_page_count(level);
+ }
+
+ /*
+ * Install the new, split up pagetable.
+ *
+ * We use the standard kernel pagetable protections for the new
+ * pagetable protections, the actual ptes set above control the
+ * primary protection behavior:
+ */
+ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+ /*
+ * Intel Atom errata AAH41 workaround.
+ *
+ * The real fix should be in hw or in a microcode update, but
+ * we also probabilistically try to reduce the window of having
+ * a large TLB mixed with 4K TLBs while instruction fetches are
+ * going on.
+ */
+ __flush_tlb_all();
+ spin_unlock(&pgd_lock);
+
+ return 0;
+}
+
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ unsigned long address)
+{
+ struct page *base;
+
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ if (__split_large_page(cpa, kpte, address, base))
+ __free_page(base);
+
+ return 0;
+}
+
+static bool try_to_free_pte_page(pte_t *pte)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+
+ free_page((unsigned long)pte);
+ return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+
+ free_page((unsigned long)pmd);
+ return true;
+}
+
+static bool try_to_free_pud_page(pud_t *pud)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud_none(pud[i]))
+ return false;
+
+ free_page((unsigned long)pud);
+ return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+ while (start < end) {
+ set_pte(pte, __pte(0));
+
+ start += PAGE_SIZE;
+ pte++;
+ }
+
+ if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ if (unmap_pte_range(pmd, start, end))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
+
+ /*
+ * Not on a 2MB page boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ __unmap_pmd_range(pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+ }
+
+ /*
+ * Try to unmap in 2M chunks.
+ */
+ while (end - start >= PMD_SIZE) {
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+ __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+ }
+
+ /*
+ * 4K leftovers?
+ */
+ if (start < end)
+ return __unmap_pmd_range(pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, start);
+
+ /*
+ * Not on a GB page boundary?
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ unmap_pmd_range(pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+ }
+
+ /*
+ * Try to unmap in 1G chunks?
+ */
+ while (end - start >= PUD_SIZE) {
+
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+ unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+ }
+
+ /*
+ * 2M leftovers?
+ */
+ if (start < end)
+ unmap_pmd_range(pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+ * populate_pgd's error path
+ */
+}
+
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd_entry = root + pgd_index(addr);
+
+ unmap_pud_range(pgd_entry, addr, end);
+
+ if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+ pgd_clear(pgd_entry);
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pte)
+ return -1;
+
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pmd)
+ return -1;
+
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, start);
+
+ while (num_pages-- && start < end) {
+
+ /* deal with the NX bit */
+ if (!(pgprot_val(pgprot) & _PAGE_NX))
+ cpa->pfn &= ~_PAGE_NX;
+
+ set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+ start += PAGE_SIZE;
+ cpa->pfn += PAGE_SIZE;
+ pte++;
+ }
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+ unsigned int cur_pages = 0;
+ pmd_t *pmd;
+ pgprot_t pmd_pgprot;
+
+ /*
+ * Not on a 2M boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+ pre_end = min_t(unsigned long, pre_end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+ /*
+ * Need a PTE page?
+ */
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+ start = pre_end;
+ }
+
+ /*
+ * We mapped them all?
+ */
+ if (num_pages == cur_pages)
+ return cur_pages;
+
+ pmd_pgprot = pgprot_4k_2_large(pgprot);
+
+ while (end - start >= PMD_SIZE) {
+
+ /*
+ * We cannot use a 1G page so allocate a PMD page if needed.
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ pmd = pmd_offset(pud, start);
+
+ set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+ massage_pgprot(pmd_pgprot)));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE;
+ cur_pages += PMD_SIZE >> PAGE_SHIFT;
+ }
+
+ /*
+ * Map trailing 4K pages.
+ */
+ if (start < end) {
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, end, num_pages - cur_pages,
+ pmd, pgprot);
+ }
+ return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+ pgprot_t pgprot)
+{
+ pud_t *pud;
+ unsigned long end;
+ int cur_pages = 0;
+ pgprot_t pud_pgprot;
+
+ end = start + (cpa->numpages << PAGE_SHIFT);
+
+ /*
+ * Not on a Gb page boundary? => map everything up to it with
+ * smaller pages.
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long pre_end;
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+ pre_end = min_t(unsigned long, end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+ pud = pud_offset(pgd, start);
+
+ /*
+ * Need a PMD page?
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+ pud, pgprot);
+ if (cur_pages < 0)
+ return cur_pages;
+
+ start = pre_end;
+ }
+
+ /* We mapped them all? */
+ if (cpa->numpages == cur_pages)
+ return cur_pages;
+
+ pud = pud_offset(pgd, start);
+ pud_pgprot = pgprot_4k_2_large(pgprot);
+
+ /*
+ * Map everything starting from the Gb boundary, possibly with 1G pages
+ */
+ while (end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+ massage_pgprot(pud_pgprot)));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE;
+ cur_pages += PUD_SIZE >> PAGE_SHIFT;
+ pud++;
+ }
+
+ /* Map trailing leftover */
+ if (start < end) {
+ int tmp;
+
+ pud = pud_offset(pgd, start);
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ pud, pgprot);
+ if (tmp < 0)
+ return cur_pages;
+
+ cur_pages += tmp;
+ }
+ return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+ pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+ pud_t *pud = NULL; /* shut up gcc */
+ pgd_t *pgd_entry;
+ int ret;
+
+ pgd_entry = cpa->pgd + pgd_index(addr);
+
+ /*
+ * Allocate a PUD page and hand it down for mapping.
+ */
+ if (pgd_none(*pgd_entry)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!pud)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
+
+ ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ if (ret < 0) {
+ unmap_pgd_range(cpa->pgd, addr,
+ addr + (cpa->numpages << PAGE_SHIFT));
+ return ret;
+ }
+
+ cpa->numpages = ret;
+ return 0;
+}
+
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+ int primary)
+{
+ if (cpa->pgd)
+ return populate_pgd(cpa, vaddr);
+
+ /*
+ * Ignore all non primary paths.
+ */
+ if (!primary)
+ return 0;
+
+ /*
+ * Ignore the NULL PTE for kernel identity mapping, as it is expected
+ * to have holes.
+ * Also set numpages to '1' indicating that we processed cpa req for
+ * one virtual address page and its pfn. TBD: numpages can be set based
+ * on the initial value and the level returned by lookup_address().
+ */
+ if (within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ cpa->numpages = 1;
+ cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+ return 0;
+ } else {
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
+ "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+ *cpa->vaddr);
+
+ return -EFAULT;
+ }
+}
+
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+ unsigned long address;
+ int do_split, err;
+ unsigned int level;
+ pte_t *kpte, old_pte;
+
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ address = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
+ address = cpa->vaddr[cpa->curpage];
+ else
+ address = *cpa->vaddr;
+repeat:
+ kpte = _lookup_address_cpa(cpa, address, &level);
+ if (!kpte)
+ return __cpa_process_fault(cpa, address, primary);
+
+ old_pte = *kpte;
+ if (!pte_val(old_pte))
+ return __cpa_process_fault(cpa, address, primary);
+
+ if (level == PG_LEVEL_4K) {
+ pte_t new_pte;
+ pgprot_t new_prot = pte_pgprot(old_pte);
+ unsigned long pfn = pte_pfn(old_pte);
+
+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+ new_prot = static_protections(new_prot, address, pfn);
+
+ /*
+ * Set the GLOBAL flags only if the PRESENT flag is
+ * set otherwise pte_present will return true even on
+ * a non present pte. The canon_pgprot will clear
+ * _PAGE_GLOBAL for the ancient hardware that doesn't
+ * support it.
+ */
+ if (pgprot_val(new_prot) & _PAGE_PRESENT)
+ pgprot_val(new_prot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
+
+ /*
+ * We need to keep the pfn from the existing PTE,
+ * after all we're only going to change it's attributes
+ * not the memory it points to
+ */
+ new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+ cpa->pfn = pfn;
+ /*
+ * Do we really change anything ?
+ */
+ if (pte_val(old_pte) != pte_val(new_pte)) {
+ set_pte_atomic(kpte, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ }
+ cpa->numpages = 1;
+ return 0;
+ }
+
+ /*
+ * Check, whether we can keep the large page intact
+ * and just change the pte:
+ */
+ do_split = try_preserve_large_page(kpte, address, cpa);
+ /*
+ * When the range fits into the existing large page,
+ * return. cp->numpages and cpa->tlbflush have been updated in
+ * try_large_page:
+ */
+ if (do_split <= 0)
+ return do_split;
+
+ /*
+ * We have to split the large page:
+ */
+ err = split_large_page(cpa, kpte, address);
+ if (!err) {
+ /*
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * With out this, we violate the TLB application note, that says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
+ */
+ flush_tlb_all();
+ goto repeat;
+ }
+
+ return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+ struct cpa_data alias_cpa;
+ unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+ unsigned long vaddr;
+ int ret;
+
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
+ return 0;
+
+ /*
+ * No need to redo, when the primary call touched the direct
+ * mapping already:
+ */
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ vaddr = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
+ vaddr = cpa->vaddr[cpa->curpage];
+ else
+ vaddr = *cpa->vaddr;
+
+ if (!(within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
+
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &laddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * If the primary call didn't touch the high mapping already
+ * and the physical address is inside the kernel map, we need
+ * to touch the high mapped kernel as well:
+ */
+ if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+ within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+ unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+ __START_KERNEL_map - phys_base;
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+ /*
+ * The high mapping range is imprecise, so ignore the
+ * return value.
+ */
+ __change_page_attr_set_clr(&alias_cpa, 0);
+ }
+#endif
+
+ return 0;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+ int ret, numpages = cpa->numpages;
+
+ while (numpages) {
+ /*
+ * Store the remaining nr of pages for the large page
+ * preservation check.
+ */
+ cpa->numpages = numpages;
+ /* for array changes, we can't use large page */
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa->numpages = 1;
+
+ if (!debug_pagealloc)
+ spin_lock(&cpa_lock);
+ ret = __change_page_attr(cpa, checkalias);
+ if (!debug_pagealloc)
+ spin_unlock(&cpa_lock);
+ if (ret)
+ return ret;
+
+ if (checkalias) {
+ ret = cpa_process_alias(cpa);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Adjust the number of pages with the result of the
+ * CPA operation. Either a large page has been
+ * preserved or a single page update happened.
+ */
+ BUG_ON(cpa->numpages > numpages || !cpa->numpages);
+ numpages -= cpa->numpages;
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
+ }
+ return 0;
+}
+
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split, int in_flag,
+ struct page **pages)
+{
+ struct cpa_data cpa;
+ int ret, cache, checkalias;
+ unsigned long baddr = 0;
+
+ memset(&cpa, 0, sizeof(cpa));
+
+ /*
+ * Check, if we are requested to change a not supported
+ * feature:
+ */
+ mask_set = canon_pgprot(mask_set);
+ mask_clr = canon_pgprot(mask_clr);
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+ return 0;
+
+ /* Ensure we are PAGE_SIZE aligned */
+ if (in_flag & CPA_ARRAY) {
+ int i;
+ for (i = 0; i < numpages; i++) {
+ if (addr[i] & ~PAGE_MASK) {
+ addr[i] &= PAGE_MASK;
+ WARN_ON_ONCE(1);
+ }
+ }
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+ * No need to cehck in that case
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = *addr;
+ }
+
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+
+ vm_unmap_aliases();
+
+ cpa.vaddr = addr;
+ cpa.pages = pages;
+ cpa.numpages = numpages;
+ cpa.mask_set = mask_set;
+ cpa.mask_clr = mask_clr;
+ cpa.flags = 0;
+ cpa.curpage = 0;
+ cpa.force_split = force_split;
+
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
+
+ /* No alias checking for _NX bit modifications */
+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+ ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+ /*
+ * Check whether we really changed something:
+ */
+ if (!(cpa.flags & CPA_FLUSHTLB))
+ goto out;
+
+ /*
+ * No need to flush, when we did not set any of the caching
+ * attributes:
+ */
+ cache = !!pgprot2cachemode(mask_set);
+
+ /*
+ * On success we use CLFLUSH, when the CPU supports it to
+ * avoid the WBINVD. If the CPU does not support it and in the
+ * error case we fall back to cpa_flush_all (which uses
+ * WBINVD):
+ */
+ if (!ret && cpu_has_clflush) {
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
+ cpa_flush_range(baddr, numpages, cache);
+ } else
+ cpa_flush_all(cache);
+
+out:
+ return ret;
+}
+
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
+{
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+int _set_memory_uc(unsigned long addr, int numpages)
+{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ * If you really need strong UC use ioremap_uc(), but note
+ * that you cannot override IO areas with set_memory_*() as
+ * these helpers cannot work with IO memory.
+ */
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ int ret;
+
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_UC_MINUS, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_uc(addr, numpages);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+static int _set_memory_array(unsigned long *addr, int addrinarray,
+ enum page_cache_mode new_type)
+{
+ enum page_cache_mode set_type;
+ int i, j;
+ int ret;
+
+ for (i = 0; i < addrinarray; i++) {
+ ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+ new_type, NULL);
+ if (ret)
+ goto out_free;
+ }
+
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
+ ret = change_page_attr_set(addr, addrinarray,
+ cachemode2pgprot(set_type), 1);
+
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+ ret = change_page_attr_set_clr(addr, addrinarray,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, CPA_ARRAY, NULL);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++)
+ free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+ return ret;
+}
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
+int set_memory_array_wc(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_memory_array_wc);
+
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ int ret;
+ unsigned long addr_copy = addr;
+
+ ret = change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
+ if (!ret) {
+ ret = change_page_attr_set_clr(&addr_copy, numpages,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, 0, NULL);
+ }
+ return ret;
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WC, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wc(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WT, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wt(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
+int _set_memory_wb(unsigned long addr, int numpages)
+{
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK), 0);
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = _set_memory_wb(addr, numpages);
+ if (ret)
+ return ret;
+
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ return 0;
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ int i;
+ int ret;
+
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ ret = change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK), 1);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < addrinarray; i++)
+ free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(0), 1, 0, NULL);
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+static int _set_pages_array(struct page **pages, int addrinarray,
+ enum page_cache_mode new_type)
+{
+ unsigned long start;
+ unsigned long end;
+ enum page_cache_mode set_type;
+ int i;
+ int free_idx;
+ int ret;
+
+ for (i = 0; i < addrinarray; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, new_type, NULL))
+ goto err_out;
+ }
+
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
+ ret = cpa_set_pages_array(pages, addrinarray,
+ cachemode2pgprot(set_type));
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+ ret = change_page_attr_set_clr(NULL, addrinarray,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, CPA_PAGES_ARRAY, pages);
+ if (ret)
+ goto err_out;
+ return 0; /* Success */
+err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+}
+
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
+int set_pages_array_wc(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_pages_array_wc);
+
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+ if (retval)
+ return retval;
+
+ for (i = 0; i < addrinarray; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_rw(addr, numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
+ .numpages = numpages,
+ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(0),
+ .flags = 0};
+
+ /*
+ * No alias checking needed for setting present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
+
+ /*
+ * No alias checking needed for setting not present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (PageHighMem(page))
+ return;
+ if (!enable) {
+ debug_check_no_locks_freed(page_address(page),
+ numpages * PAGE_SIZE);
+ }
+
+ /*
+ * The return value is ignored as the calls cannot fail.
+ * Large pages for identity mappings are not used at boot time
+ * and hence no memory allocations during large page split.
+ */
+ if (enable)
+ __set_pages_p(page, numpages);
+ else
+ __set_pages_np(page, numpages);
+
+ /*
+ * We should perform an IPI and flush all tlbs,
+ * but that can deadlock->flush only current cpu:
+ */
+ __flush_tlb_all();
+
+ arch_flush_lazy_mmu_mode();
+}
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+ unsigned int level;
+ pte_t *pte;
+
+ if (PageHighMem(page))
+ return false;
+
+ pte = lookup_address((unsigned long)page_address(page), &level);
+ return (pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
+{
+ int retval = -EINVAL;
+
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = pfn,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(0),
+ .flags = 0,
+ };
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ goto out;
+
+ if (!(page_flags & _PAGE_NX))
+ cpa.mask_clr = __pgprot(_PAGE_NX);
+
+ cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+out:
+ return retval;
+}
+
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+ unsigned numpages)
+{
+ unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
new file mode 100644
index 0000000..3f1bb4f
--- /dev/null
+++ b/arch/x86/mm/pat.c
@@ -0,0 +1,1114 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rbtree.h>
+
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+#include <asm/pgtable.h>
+#include <asm/fcntl.h>
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/io.h>
+
+#include "pat_internal.h"
+#include "mm_internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool __read_mostly boot_cpu_done;
+static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
+static bool __read_mostly pat_initialized;
+static bool __read_mostly init_cm_done;
+
+void pat_disable(const char *reason)
+{
+ if (pat_disabled)
+ return;
+
+ if (boot_cpu_done) {
+ WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
+ return;
+ }
+
+ pat_disabled = true;
+ pr_info("x86/PAT: %s\n", reason);
+}
+
+static int __init nopat(char *str)
+{
+ pat_disable("PAT support disabled.");
+ return 0;
+}
+early_param("nopat", nopat);
+
+bool pat_enabled(void)
+{
+ return pat_initialized;
+}
+EXPORT_SYMBOL_GPL(pat_enabled);
+
+int pat_debug_enable;
+
+static int __init pat_debug_setup(char *str)
+{
+ pat_debug_enable = 1;
+ return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
+ */
+
+#define _PGMT_WB 0
+#define _PGMT_WC (1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_uncached)
+#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
+
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+ if (pg_flags == _PGMT_WB)
+ return _PAGE_CACHE_MODE_WB;
+ else if (pg_flags == _PGMT_WC)
+ return _PAGE_CACHE_MODE_WC;
+ else if (pg_flags == _PGMT_UC_MINUS)
+ return _PAGE_CACHE_MODE_UC_MINUS;
+ else
+ return _PAGE_CACHE_MODE_WT;
+}
+
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+ unsigned long memtype_flags;
+ unsigned long old_flags;
+ unsigned long new_flags;
+
+ switch (memtype) {
+ case _PAGE_CACHE_MODE_WC:
+ memtype_flags = _PGMT_WC;
+ break;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ memtype_flags = _PGMT_UC_MINUS;
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ memtype_flags = _PGMT_WT;
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ default:
+ memtype_flags = _PGMT_WB;
+ break;
+ }
+
+ do {
+ old_flags = pg->flags;
+ new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+ } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+}
+#else
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ return -1;
+}
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+}
+#endif
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+};
+
+#define CM(c) (_PAGE_CACHE_MODE_ ## c)
+
+static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+{
+ enum page_cache_mode cache;
+ char *cache_mode;
+
+ switch (pat_val) {
+ case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
+ }
+
+ memcpy(msg, cache_mode, 4);
+
+ return cache;
+}
+
+#undef CM
+
+/*
+ * Update the cache mode to pgprot translation tables according to PAT
+ * configuration.
+ * Using lower indices is preferred, so we start with highest index.
+ */
+static void __init_cache_modes(u64 pat)
+{
+ enum page_cache_mode cache;
+ char pat_msg[33];
+ int i;
+
+ pat_msg[32] = 0;
+ for (i = 7; i >= 0; i--) {
+ cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
+ pat_msg + 4 * i);
+ update_cache_mode_entry(i, cache);
+ }
+ pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
+
+ init_cm_done = true;
+}
+
+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
+
+static void pat_bsp_init(u64 pat)
+{
+ u64 tmp_pat;
+
+ if (!boot_cpu_has(X86_FEATURE_PAT)) {
+ pat_disable("PAT not supported by CPU.");
+ return;
+ }
+
+ rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+ if (!tmp_pat) {
+ pat_disable("PAT MSR is 0, disabled.");
+ return;
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+ pat_initialized = true;
+
+ __init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+ if (!boot_cpu_has(X86_FEATURE_PAT)) {
+ /*
+ * If this happens we are on a secondary CPU, but switched to
+ * PAT on the boot CPU. We have no way to undo PAT.
+ */
+ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+}
+
+void init_cache_modes(void)
+{
+ u64 pat = 0;
+
+ if (init_cm_done)
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PAT)) {
+ /*
+ * CPU supports PAT. Set PAT table to be consistent with
+ * PAT MSR. This case supports "nopat" boot option, and
+ * virtual machine environments which support PAT without
+ * MTRRs. In specific, Xen has unique setup to PAT MSR.
+ *
+ * If PAT MSR returns 0, it is considered invalid and emulates
+ * as No PAT.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ }
+
+ if (!pat) {
+ /*
+ * No PAT. Emulate the PAT table that corresponds to the two
+ * cache bits, PWT (Write Through) and PCD (Cache Disable).
+ * This setup is also the same as the BIOS default setup.
+ *
+ * PTE encoding:
+ *
+ * PCD
+ * |PWT PAT
+ * || slot
+ * 00 0 WB : _PAGE_CACHE_MODE_WB
+ * 01 1 WT : _PAGE_CACHE_MODE_WT
+ * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 11 3 UC : _PAGE_CACHE_MODE_UC
+ *
+ * NOTE: When WC or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
+ }
+
+ __init_cache_modes(pat);
+}
+
+/**
+ * pat_init - Initialize PAT MSR and PAT table
+ *
+ * This function initializes PAT MSR and PAT table with an OS-defined value
+ * to enable additional cache attributes, WC and WT.
+ *
+ * This function must be called on all CPUs using the specific sequence of
+ * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
+ * procedure for PAT.
+ */
+void pat_init(void)
+{
+ u64 pat;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (pat_disabled)
+ return;
+
+ if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+ (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+ ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ /*
+ * PAT support with the lower four entries. Intel Pentium 2,
+ * 3, M, and 4 are affected by PAT errata, which makes the
+ * upper four entries unusable. To be on the safe side, we don't
+ * use those.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * PAT bit unused
+ *
+ * NOTE: When WT or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ } else {
+ /*
+ * Full PAT support. We put WT in slot 7 to improve
+ * robustness in the presence of errata that might cause
+ * the high PAT bit to be ignored. This way, a buggy slot 7
+ * access will hit slot 3, and slot 3 is UC, so at worst
+ * we lose performance without causing a correctness issue.
+ * Pentium 4 erratum N46 is an example for such an erratum,
+ * although we try not to use PAT at all on affected CPUs.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * 100 4 WB : Reserved
+ * 101 5 WC : Reserved
+ * 110 6 UC-: Reserved
+ * 111 7 WT : _PAGE_CACHE_MODE_WT
+ *
+ * The reserved slots are unused, but mapped to their
+ * corresponding types in the presence of PAT errata.
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+ }
+
+ if (!boot_cpu_done) {
+ pat_bsp_init(pat);
+ boot_cpu_done = true;
+ } else {
+ pat_ap_init(pat);
+ }
+}
+
+#undef PAT
+
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static unsigned long pat_x_mtrr_type(u64 start, u64 end,
+ enum page_cache_mode req_type)
+{
+ /*
+ * Look for MTRR hint to get the effective type in case where PAT
+ * request is for WB.
+ */
+ if (req_type == _PAGE_CACHE_MODE_WB) {
+ u8 mtrr_type, uniform;
+
+ mtrr_type = mtrr_type_lookup(start, end, &uniform);
+ if (mtrr_type != MTRR_TYPE_WRBACK)
+ return _PAGE_CACHE_MODE_UC_MINUS;
+
+ return _PAGE_CACHE_MODE_WB;
+ }
+
+ return req_type;
+}
+
+struct pagerange_state {
+ unsigned long cur_pfn;
+ int ram;
+ int not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+ struct pagerange_state *state = arg;
+
+ state->not_ram |= initial_pfn > state->cur_pfn;
+ state->ram |= total_nr_pages > 0;
+ state->cur_pfn = initial_pfn + total_nr_pages;
+
+ return state->ram && state->not_ram;
+}
+
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
+{
+ int ret = 0;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ struct pagerange_state state = {start_pfn, 0, 0};
+
+ /*
+ * For legacy reasons, physical address range in the legacy ISA
+ * region is tracked as non-RAM. This will allow users of
+ * /dev/mem to map portions of legacy ISA region, even when
+ * some of those portions are listed(or not even listed) with
+ * different e820 types(RAM/reserved/..)
+ */
+ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+ start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+ if (start_pfn < end_pfn) {
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &state, pagerange_is_ram_callback);
+ }
+
+ return (ret > 0) ? -1 : (state.ram ? 1 : 0);
+}
+
+/*
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end,
+ enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
+{
+ struct page *page;
+ u64 pfn;
+
+ if (req_type == _PAGE_CACHE_MODE_WP) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_UC_MINUS;
+ return -EINVAL;
+ }
+
+ if (req_type == _PAGE_CACHE_MODE_UC) {
+ /* We do not support strong UC */
+ WARN_ON_ONCE(1);
+ req_type = _PAGE_CACHE_MODE_UC_MINUS;
+ }
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ enum page_cache_mode type;
+
+ page = pfn_to_page(pfn);
+ type = get_page_memtype(page);
+ if (type != _PAGE_CACHE_MODE_WB) {
+ pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+ start, end - 1, type, req_type);
+ if (new_type)
+ *new_type = type;
+
+ return -EBUSY;
+ }
+ }
+
+ if (new_type)
+ *new_type = req_type;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ set_page_memtype(page, req_type);
+ }
+ return 0;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+ struct page *page;
+ u64 pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ set_page_memtype(page, _PAGE_CACHE_MODE_WB);
+ }
+ return 0;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
+{
+ struct memtype *new;
+ enum page_cache_mode actual_type;
+ int is_range_ram;
+ int err = 0;
+
+ BUG_ON(start >= end); /* end is exclusive */
+
+ if (!pat_enabled()) {
+ /* This is identical to page table setting without PAT */
+ if (new_type)
+ *new_type = req_type;
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (x86_platform.is_untracked_pat_range(start, end)) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_WB;
+ return 0;
+ }
+
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ actual_type = pat_x_mtrr_type(start, end, req_type);
+
+ if (new_type)
+ *new_type = actual_type;
+
+ is_range_ram = pat_pagerange_is_ram(start, end);
+ if (is_range_ram == 1) {
+
+ err = reserve_ram_pages_type(start, end, req_type, new_type);
+
+ return err;
+ } else if (is_range_ram < 0) {
+ return -EINVAL;
+ }
+
+ new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ err = rbt_memtype_check_insert(new, new_type);
+ if (err) {
+ pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
+ kfree(new);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ }
+
+ spin_unlock(&memtype_lock);
+
+ dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(new->type), cattr_name(req_type),
+ new_type ? cattr_name(*new_type) : "-");
+
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ int err = -EINVAL;
+ int is_range_ram;
+ struct memtype *entry;
+
+ if (!pat_enabled())
+ return 0;
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (x86_platform.is_untracked_pat_range(start, end))
+ return 0;
+
+ is_range_ram = pat_pagerange_is_ram(start, end);
+ if (is_range_ram == 1) {
+
+ err = free_ram_pages_type(start, end);
+
+ return err;
+ } else if (is_range_ram < 0) {
+ return -EINVAL;
+ }
+
+ spin_lock(&memtype_lock);
+ entry = rbt_memtype_erase(start, end);
+ spin_unlock(&memtype_lock);
+
+ if (!entry) {
+ pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
+ return -EINVAL;
+ }
+
+ kfree(entry);
+
+ dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
+
+ return 0;
+}
+
+
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
+ * or _PAGE_CACHE_MODE_WT.
+ */
+static enum page_cache_mode lookup_memtype(u64 paddr)
+{
+ enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
+ struct memtype *entry;
+
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
+ return rettype;
+
+ if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+ struct page *page;
+
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ return get_page_memtype(page);
+ }
+
+ spin_lock(&memtype_lock);
+
+ entry = rbt_memtype_lookup(paddr);
+ if (entry != NULL)
+ rettype = entry->type;
+ else
+ rettype = _PAGE_CACHE_MODE_UC_MINUS;
+
+ spin_unlock(&memtype_lock);
+ return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ enum page_cache_mode *type)
+{
+ resource_size_t size = end - start;
+ enum page_cache_mode req_type = *type;
+ enum page_cache_mode new_type;
+ int ret;
+
+ WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+ ret = reserve_memtype(start, end, req_type, &new_type);
+ if (ret)
+ goto out_err;
+
+ if (!is_new_memtype_allowed(start, size, req_type, new_type))
+ goto out_free;
+
+ if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ goto out_free;
+
+ *type = new_type;
+ return 0;
+
+out_free:
+ free_memtype(start, end);
+ ret = -EBUSY;
+out_err:
+ return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+ free_memtype(start, end);
+}
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ return vma_prot;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ return 1;
+}
+#else
+/* This check is needed to avoid cache aliasing when PAT is enabled */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ if (!pat_enabled())
+ return 1;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+ current->comm, from, to - 1);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_STRICT_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
+
+ if (!range_is_allowed(pfn, size))
+ return 0;
+
+ if (file->f_flags & O_DSYNC)
+ pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ cachemode2protval(pcm));
+ return 1;
+}
+
+/*
+ * Change the memory type for the physial address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ */
+int kernel_map_sync_memtype(u64 base, unsigned long size,
+ enum page_cache_mode pcm)
+{
+ unsigned long id_sz;
+
+ if (base > __pa(high_memory-1))
+ return 0;
+
+ /*
+ * some areas in the middle of the kernel identity range
+ * are not mapped, like the PCI space.
+ */
+ if (!page_is_ram(base >> PAGE_SHIFT))
+ return 0;
+
+ id_sz = (__pa(high_memory-1) <= base + size) ?
+ __pa(high_memory) - base :
+ size;
+
+ if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
+ pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid,
+ cattr_name(pcm),
+ base, (unsigned long long)(base + size-1));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserved non RAM regions only and after successful reserve_memtype,
+ * this func also keeps identity mapping (if any) in sync with this new prot.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+ int strict_prot)
+{
+ int is_ram = 0;
+ int ret;
+ enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
+ enum page_cache_mode pcm = want_pcm;
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+
+ /*
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep
+ * track of number of mappings of RAM pages. We can assert that
+ * the type requested matches the type of first page in the range.
+ */
+ if (is_ram) {
+ if (!pat_enabled())
+ return 0;
+
+ pcm = lookup_memtype(paddr);
+ if (want_pcm != pcm) {
+ pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+ }
+ return 0;
+ }
+
+ ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
+ if (ret)
+ return ret;
+
+ if (pcm != want_pcm) {
+ if (strict_prot ||
+ !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+ free_memtype(paddr, paddr + size);
+ pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
+ return -EINVAL;
+ }
+ /*
+ * We allow returning different type than the one requested in
+ * non strict case.
+ */
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+ }
+
+ if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
+ free_memtype(paddr, paddr + size);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ */
+int track_pfn_copy(struct vm_area_struct *vma)
+{
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ pgprot_t pgprot;
+
+ if (vma->vm_flags & VM_PAT) {
+ /*
+ * reserve the whole chunk covered by vma. We need the
+ * starting address and protection from pte.
+ */
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ pgprot = __pgprot(prot);
+ return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ }
+
+ return 0;
+}
+
+/*
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ */
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long addr, unsigned long size)
+{
+ resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ enum page_cache_mode pcm;
+
+ /* reserve the whole chunk starting from paddr */
+ if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+ int ret;
+
+ ret = reserve_pfn_range(paddr, size, prot, 0);
+ if (!ret)
+ vma->vm_flags |= VM_PAT;
+ return ret;
+ }
+
+ if (!pat_enabled())
+ return 0;
+
+ /*
+ * For anything smaller than the vma size we set prot based on the
+ * lookup.
+ */
+ pcm = lookup_memtype(paddr);
+
+ /* Check memtype for the remaining pages */
+ while (size > PAGE_SIZE) {
+ size -= PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ if (pcm != lookup_memtype(paddr))
+ return -EINVAL;
+ }
+
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+
+ return 0;
+}
+
+int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn)
+{
+ enum page_cache_mode pcm;
+
+ if (!pat_enabled())
+ return 0;
+
+ /* Set prot based on lookup */
+ pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+ *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+
+ return 0;
+}
+
+/*
+ * untrack_pfn is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case pfn, size are zero).
+ */
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
+{
+ resource_size_t paddr;
+ unsigned long prot;
+
+ if (!(vma->vm_flags & VM_PAT))
+ return;
+
+ /* free the chunk starting from pfn or the whole chunk */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ if (!paddr && !size) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ size = vma->vm_end - vma->vm_start;
+ }
+ free_pfn_range(paddr, size);
+ vma->vm_flags &= ~VM_PAT;
+}
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+ struct memtype *print_entry;
+ int ret;
+
+ print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!print_entry)
+ return NULL;
+
+ spin_lock(&memtype_lock);
+ ret = rbt_memtype_copy_nth_element(print_entry, pos);
+ spin_unlock(&memtype_lock);
+
+ if (!ret) {
+ return print_entry;
+ } else {
+ kfree(print_entry);
+ return NULL;
+ }
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) {
+ ++*pos;
+ seq_puts(seq, "PAT memtype list:\n");
+ }
+
+ return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+ struct memtype *print_entry = (struct memtype *)v;
+
+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+ print_entry->start, print_entry->end);
+ kfree(print_entry);
+
+ return 0;
+}
+
+static const struct seq_operations memtype_seq_ops = {
+ .start = memtype_seq_start,
+ .next = memtype_seq_next,
+ .stop = memtype_seq_stop,
+ .show = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+ .open = memtype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+ if (pat_enabled()) {
+ debugfs_create_file("pat_memtype_list", S_IRUSR,
+ arch_debugfs_dir, NULL, &memtype_fops);
+ }
+ return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 0000000..a739bfc
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,48 @@
+#ifndef __PAT_INTERNAL_H_
+#define __PAT_INTERNAL_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+ do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
+
+struct memtype {
+ u64 start;
+ u64 end;
+ u64 subtree_max_end;
+ enum page_cache_mode type;
+ struct rb_node rb;
+};
+
+static inline char *cattr_name(enum page_cache_mode pcm)
+{
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC: return "uncached";
+ case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_MODE_WB: return "write-back";
+ case _PAGE_CACHE_MODE_WC: return "write-combining";
+ case _PAGE_CACHE_MODE_WT: return "write-through";
+ case _PAGE_CACHE_MODE_WP: return "write-protected";
+ default: return "broken";
+ }
+}
+
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *new_type);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *new_type)
+{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
+#endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 0000000..6393108
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,249 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static struct rb_root memtype_rbroot = RB_ROOT;
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+ if (node->start >= end || node->end <= start)
+ return 0;
+
+ return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+ u64 ret = 0;
+ if (node) {
+ struct memtype *data = container_of(node, struct memtype, rb);
+ ret = data->subtree_max_end;
+ }
+ return ret;
+}
+
+static u64 compute_subtree_max_end(struct memtype *data)
+{
+ u64 max_end = data->end, child_max_end;
+
+ child_max_end = get_subtree_max_end(data->rb.rb_right);
+ if (child_max_end > max_end)
+ max_end = child_max_end;
+
+ child_max_end = get_subtree_max_end(data->rb.rb_left);
+ if (child_max_end > max_end)
+ max_end = child_max_end;
+
+ return max_end;
+}
+
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+ u64, subtree_max_end, compute_subtree_max_end)
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+ u64 start, u64 end)
+{
+ struct rb_node *node = root->rb_node;
+ struct memtype *last_lower = NULL;
+
+ while (node) {
+ struct memtype *data = container_of(node, struct memtype, rb);
+
+ if (get_subtree_max_end(node->rb_left) > start) {
+ /* Lowest overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (is_node_overlap(data, start, end)) {
+ last_lower = data;
+ break;
+ } else if (start >= data->start) {
+ /* Lowest overlap if any must be on right side */
+ node = node->rb_right;
+ } else {
+ break;
+ }
+ }
+ return last_lower; /* Returns NULL if there is no overlap */
+}
+
+static struct memtype *memtype_rb_exact_match(struct rb_root *root,
+ u64 start, u64 end)
+{
+ struct memtype *match;
+
+ match = memtype_rb_lowest_match(root, start, end);
+ while (match != NULL && match->start < end) {
+ struct rb_node *node;
+
+ if (match->start == start && match->end == end)
+ return match;
+
+ node = rb_next(&match->rb);
+ if (node)
+ match = container_of(node, struct memtype, rb);
+ else
+ match = NULL;
+ }
+
+ return NULL; /* Returns NULL if there is no exact match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+ u64 start, u64 end,
+ enum page_cache_mode reqtype,
+ enum page_cache_mode *newtype)
+{
+ struct rb_node *node;
+ struct memtype *match;
+ enum page_cache_mode found_type = reqtype;
+
+ match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+ if (match == NULL)
+ goto success;
+
+ if (match->type != found_type && newtype == NULL)
+ goto failure;
+
+ dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+ found_type = match->type;
+
+ node = rb_next(&match->rb);
+ while (node) {
+ match = container_of(node, struct memtype, rb);
+
+ if (match->start >= end) /* Checked all possible matches */
+ goto success;
+
+ if (is_node_overlap(match, start, end) &&
+ match->type != found_type) {
+ goto failure;
+ }
+
+ node = rb_next(&match->rb);
+ }
+success:
+ if (newtype)
+ *newtype = found_type;
+
+ return 0;
+
+failure:
+ pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid, start, end,
+ cattr_name(found_type), cattr_name(match->type));
+ return -EBUSY;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+ struct rb_node **node = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*node) {
+ struct memtype *data = container_of(*node, struct memtype, rb);
+
+ parent = *node;
+ if (data->subtree_max_end < newdata->end)
+ data->subtree_max_end = newdata->end;
+ if (newdata->start <= data->start)
+ node = &((*node)->rb_left);
+ else if (newdata->start > data->start)
+ node = &((*node)->rb_right);
+ }
+
+ newdata->subtree_max_end = newdata->end;
+ rb_link_node(&newdata->rb, parent, node);
+ rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
+}
+
+int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *ret_type)
+{
+ int err = 0;
+
+ err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+ new->type, ret_type);
+
+ if (!err) {
+ if (ret_type)
+ new->type = *ret_type;
+
+ new->subtree_max_end = new->end;
+ memtype_rb_insert(&memtype_rbroot, new);
+ }
+ return err;
+}
+
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{
+ struct memtype *data;
+
+ data = memtype_rb_exact_match(&memtype_rbroot, start, end);
+ if (!data)
+ goto out;
+
+ rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
+out:
+ return data;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+ struct memtype *data;
+ data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+ return data;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+ struct rb_node *node;
+ int i = 1;
+
+ node = rb_first(&memtype_rbroot);
+ while (node && pos != i) {
+ node = rb_next(node);
+ i++;
+ }
+
+ if (node) { /* pos == i */
+ struct memtype *this = container_of(node, struct memtype, rb);
+ *out = *this;
+ return 0;
+ } else {
+ return 1;
+ }
+}
+#endif
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 0000000..9f0614d
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,532 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ *
+ */
+
+/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
+ * Copyright by Intel Crop., 2002
+ * Louis Zhuang (louis.zhuang@intel.com)
+ *
+ * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
+ */
+
+#include <linux/module.h>
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+
+#ifdef __i386__
+/* IA32 Manual 3, 2-1 */
+static unsigned char prefix_codes[] = {
+ 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
+ 0x65, 0x66, 0x67
+};
+/* IA32 Manual 3, 3-432*/
+static unsigned int reg_rop[] = {
+ 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+/* IA32 Manual 3, 3-432*/
+static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
+static unsigned int rw32[] = {
+ 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
+static unsigned int mw64[] = {};
+#else /* not __i386__ */
+static unsigned char prefix_codes[] = {
+ 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
+ 0xF0, 0xF3, 0xF2,
+ /* REX Prefixes */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
+};
+/* AMD64 Manual 3, Appendix A*/
+static unsigned int reg_rop[] = {
+ 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
+static unsigned int rw32[] = {
+ 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+/* 8 bit only */
+static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
+/* 16 bit only */
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+/* 16 or 32 bit */
+static unsigned int mw32[] = { 0xC7 };
+/* 16, 32 or 64 bit */
+static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
+#endif /* not __i386__ */
+
+struct prefix_bits {
+ unsigned shorted:1;
+ unsigned enlarged:1;
+ unsigned rexr:1;
+ unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
+{
+ int i;
+ unsigned char *p = addr;
+ prf->shorted = 0;
+ prf->enlarged = 0;
+ prf->rexr = 0;
+ prf->rex = 0;
+
+restart:
+ for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
+ if (*p == prefix_codes[i]) {
+ if (*p == 0x66)
+ prf->shorted = 1;
+#ifdef __amd64__
+ if ((*p & 0xf8) == 0x48)
+ prf->enlarged = 1;
+ if ((*p & 0xf4) == 0x44)
+ prf->rexr = 1;
+ if ((*p & 0xf0) == 0x40)
+ prf->rex = 1;
+#endif
+ p++;
+ goto restart;
+ }
+ }
+
+ return (p - addr);
+}
+
+static int get_opcode(unsigned char *addr, unsigned int *opcode)
+{
+ int len;
+
+ if (*addr == 0x0F) {
+ /* 0x0F is extension instruction */
+ *opcode = *(unsigned short *)addr;
+ len = 2;
+ } else {
+ *opcode = *addr;
+ len = 1;
+ }
+
+ return len;
+}
+
+#define CHECK_OP_TYPE(opcode, array, type) \
+ for (i = 0; i < ARRAY_SIZE(array); i++) { \
+ if (array[i] == opcode) { \
+ rv = type; \
+ goto exit; \
+ } \
+ }
+
+enum reason_type get_ins_type(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+ enum reason_type rv = OTHERS;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
+ CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
+ CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
+
+exit:
+ return rv;
+}
+#undef CHECK_OP_TYPE
+
+static unsigned int get_ins_reg_width(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ for (i = 0; i < ARRAY_SIZE(rw8); i++)
+ if (rw8[i] == opcode)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(rw32); i++)
+ if (rw32[i] == opcode)
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+ printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+ return 0;
+}
+
+unsigned int get_ins_mem_width(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ for (i = 0; i < ARRAY_SIZE(mw8); i++)
+ if (mw8[i] == opcode)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(mw16); i++)
+ if (mw16[i] == opcode)
+ return 2;
+
+ for (i = 0; i < ARRAY_SIZE(mw32); i++)
+ if (mw32[i] == opcode)
+ return prf.shorted ? 2 : 4;
+
+ for (i = 0; i < ARRAY_SIZE(mw64); i++)
+ if (mw64[i] == opcode)
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+ printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+ return 0;
+}
+
+/*
+ * Define register ident in mod/rm byte.
+ * Note: these are NOT the same as in ptrace-abi.h.
+ */
+enum {
+ arg_AL = 0,
+ arg_CL = 1,
+ arg_DL = 2,
+ arg_BL = 3,
+ arg_AH = 4,
+ arg_CH = 5,
+ arg_DH = 6,
+ arg_BH = 7,
+
+ arg_AX = 0,
+ arg_CX = 1,
+ arg_DX = 2,
+ arg_BX = 3,
+ arg_SP = 4,
+ arg_BP = 5,
+ arg_SI = 6,
+ arg_DI = 7,
+#ifdef __amd64__
+ arg_R8 = 8,
+ arg_R9 = 9,
+ arg_R10 = 10,
+ arg_R11 = 11,
+ arg_R12 = 12,
+ arg_R13 = 13,
+ arg_R14 = 14,
+ arg_R15 = 15
+#endif
+};
+
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
+{
+ unsigned char *rv = NULL;
+
+ switch (no) {
+ case arg_AL:
+ rv = (unsigned char *)®s->ax;
+ break;
+ case arg_BL:
+ rv = (unsigned char *)®s->bx;
+ break;
+ case arg_CL:
+ rv = (unsigned char *)®s->cx;
+ break;
+ case arg_DL:
+ rv = (unsigned char *)®s->dx;
+ break;
+#ifdef __amd64__
+ case arg_R8:
+ rv = (unsigned char *)®s->r8;
+ break;
+ case arg_R9:
+ rv = (unsigned char *)®s->r9;
+ break;
+ case arg_R10:
+ rv = (unsigned char *)®s->r10;
+ break;
+ case arg_R11:
+ rv = (unsigned char *)®s->r11;
+ break;
+ case arg_R12:
+ rv = (unsigned char *)®s->r12;
+ break;
+ case arg_R13:
+ rv = (unsigned char *)®s->r13;
+ break;
+ case arg_R14:
+ rv = (unsigned char *)®s->r14;
+ break;
+ case arg_R15:
+ rv = (unsigned char *)®s->r15;
+ break;
+#endif
+ default:
+ break;
+ }
+
+ if (rv)
+ return rv;
+
+ if (rex) {
+ /*
+ * If REX prefix exists, access low bytes of SI etc.
+ * instead of AH etc.
+ */
+ switch (no) {
+ case arg_SI:
+ rv = (unsigned char *)®s->si;
+ break;
+ case arg_DI:
+ rv = (unsigned char *)®s->di;
+ break;
+ case arg_BP:
+ rv = (unsigned char *)®s->bp;
+ break;
+ case arg_SP:
+ rv = (unsigned char *)®s->sp;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (no) {
+ case arg_AH:
+ rv = 1 + (unsigned char *)®s->ax;
+ break;
+ case arg_BH:
+ rv = 1 + (unsigned char *)®s->bx;
+ break;
+ case arg_CH:
+ rv = 1 + (unsigned char *)®s->cx;
+ break;
+ case arg_DH:
+ rv = 1 + (unsigned char *)®s->dx;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!rv)
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
+ return rv;
+}
+
+static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
+{
+ unsigned long *rv = NULL;
+
+ switch (no) {
+ case arg_AX:
+ rv = ®s->ax;
+ break;
+ case arg_BX:
+ rv = ®s->bx;
+ break;
+ case arg_CX:
+ rv = ®s->cx;
+ break;
+ case arg_DX:
+ rv = ®s->dx;
+ break;
+ case arg_SP:
+ rv = ®s->sp;
+ break;
+ case arg_BP:
+ rv = ®s->bp;
+ break;
+ case arg_SI:
+ rv = ®s->si;
+ break;
+ case arg_DI:
+ rv = ®s->di;
+ break;
+#ifdef __amd64__
+ case arg_R8:
+ rv = ®s->r8;
+ break;
+ case arg_R9:
+ rv = ®s->r9;
+ break;
+ case arg_R10:
+ rv = ®s->r10;
+ break;
+ case arg_R11:
+ rv = ®s->r11;
+ break;
+ case arg_R12:
+ rv = ®s->r12;
+ break;
+ case arg_R13:
+ rv = ®s->r13;
+ break;
+ case arg_R14:
+ rv = ®s->r14;
+ break;
+ case arg_R15:
+ rv = ®s->r15;
+ break;
+#endif
+ default:
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+ }
+
+ return rv;
+}
+
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
+{
+ unsigned int opcode;
+ int reg;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+ for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
+ if (reg_rop[i] == opcode)
+ goto do_work;
+
+ for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
+ if (reg_wop[i] == opcode)
+ goto do_work;
+
+ printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
+ "0x%02x\n", opcode);
+ goto err;
+
+do_work:
+ /* for STOS, source register is fixed */
+ if (opcode == 0xAA || opcode == 0xAB) {
+ reg = arg_AX;
+ } else {
+ unsigned char mod_rm = *p;
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ }
+ switch (get_ins_reg_width(ins_addr)) {
+ case 1:
+ return *get_reg_w8(reg, prf.rex, regs);
+
+ case 2:
+ return *(unsigned short *)get_reg_w32(reg, regs);
+
+ case 4:
+ return *(unsigned int *)get_reg_w32(reg, regs);
+
+#ifdef __amd64__
+ case 8:
+ return *(unsigned long *)get_reg_w32(reg, regs);
+#endif
+
+ default:
+ printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
+ }
+
+err:
+ return 0;
+}
+
+unsigned long get_ins_imm_val(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char mod_rm;
+ unsigned char mod;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+ for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
+ if (imm_wop[i] == opcode)
+ goto do_work;
+
+ printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
+ "0x%02x\n", opcode);
+ goto err;
+
+do_work:
+ mod_rm = *p;
+ mod = mod_rm >> 6;
+ p++;
+ switch (mod) {
+ case 0:
+ /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
+ /* AMD64: XXX Check for address size prefix? */
+ if ((mod_rm & 0x7) == 0x5)
+ p += 4;
+ break;
+
+ case 1:
+ p += 1;
+ break;
+
+ case 2:
+ p += 4;
+ break;
+
+ case 3:
+ default:
+ printk(KERN_ERR "mmiotrace: not a memory access instruction "
+ "at 0x%lx, rm_mod=0x%02x\n",
+ ins_addr, mod_rm);
+ }
+
+ switch (get_ins_reg_width(ins_addr)) {
+ case 1:
+ return *(unsigned char *)p;
+
+ case 2:
+ return *(unsigned short *)p;
+
+ case 4:
+ return *(unsigned int *)p;
+
+#ifdef __amd64__
+ case 8:
+ return *(unsigned long *)p;
+#endif
+
+ default:
+ printk(KERN_ERR "mmiotrace: Error: width.\n");
+ }
+
+err:
+ return 0;
+}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 0000000..e05341a
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ *
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum reason_type {
+ NOT_ME, /* page fault is not in regions */
+ NOTHING, /* access others point in regions */
+ REG_READ, /* read from addr to reg */
+ REG_WRITE, /* write from reg to addr */
+ IMM_WRITE, /* write from imm to addr */
+ OTHERS /* Other instructions can not intercept */
+};
+
+enum reason_type get_ins_type(unsigned long ins_addr);
+unsigned int get_ins_mem_width(unsigned long ins_addr);
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
+unsigned long get_ins_imm_val(unsigned long ins_addr);
+
+#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 0000000..fb0a9dd
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,659 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/fixmap.h>
+#include <asm/mtrr.h>
+
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+ pte = alloc_pages(__userpte_alloc_gfp, 0);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
+ return pte;
+}
+
+static int __init setup_userpte(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /*
+ * "userpte=nohigh" disables allocation of user pagetables in
+ * high memory.
+ */
+ if (strcmp(arg, "nohigh") == 0)
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("userpte", setup_userpte);
+
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 2
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ /*
+ * NOTE! For PAE, any changes to the top page-directory-pointer-table
+ * entries need a full cr3 reload to flush.
+ */
+#ifdef CONFIG_X86_PAE
+ tlb->need_flush_all = 1;
+#endif
+ pgtable_pmd_page_dtor(page);
+ tlb_remove_page(tlb, page);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 3
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+ virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+ return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+{
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (CONFIG_PGTABLE_LEVELS == 2 ||
+ (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ CONFIG_PGTABLE_LEVELS == 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD) {
+ pgd_set_mm(pgd, mm);
+ pgd_list_add(pgd);
+ }
+}
+
+static void pgd_dtor(pgd_t *pgd)
+{
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock(&pgd_lock);
+ pgd_list_del(pgd);
+ spin_unlock(&pgd_lock);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- nyc
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ flush_tlb_mm(mm);
+}
+#else /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS 0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
+{
+ int i;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++)
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
+ free_page((unsigned long)pmds[i]);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
+{
+ int i;
+ bool failed = false;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ if (!pmd)
+ failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
+ failed = true;
+ }
+ if (pmd)
+ mm_inc_nr_pmds(mm);
+ pmds[i] = pmd;
+ }
+
+ if (failed) {
+ free_pmds(mm, pmds);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < PREALLOCATED_PMDS; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = native_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
+ }
+ }
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+ pud_t *pud;
+ int i;
+
+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+ return;
+
+ pud = pud_offset(pgd, 0);
+
+ for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
+ pmd_t *pmd = pmds[i];
+
+ if (i >= KERNEL_PGD_BOUNDARY)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, pud, pmd);
+ }
+}
+
+/*
+ * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
+ * assumes that pgd should be in one page.
+ *
+ * But kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN 32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+ /*
+ * When PAE kernel is running as a Xen domain, it does not use
+ * shared kernel pmd. And this requires a whole page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return 0;
+
+ /*
+ * when PAE kernel is not running as a Xen domain, it uses
+ * shared kernel pmd. Shared kernel pmd does not require a whole
+ * page for pgd. We are able to just allocate a 32-byte for pgd.
+ * During boot time, we create a 32-byte slab for pgd table allocation.
+ */
+ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+ SLAB_PANIC, NULL);
+ if (!pgd_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ /*
+ * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
+ * We allocate one page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+ /*
+ * Now PAE kernel is not running as a Xen domain. We can allocate
+ * a 32-byte slab for pgd to save memory space.
+ */
+ return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ if (!SHARED_KERNEL_PMD)
+ free_page((unsigned long)pgd);
+ else
+ kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+ return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd;
+ pmd_t *pmds[PREALLOCATED_PMDS];
+
+ pgd = _pgd_alloc();
+
+ if (pgd == NULL)
+ goto out;
+
+ mm->pgd = pgd;
+
+ if (preallocate_pmds(mm, pmds) != 0)
+ goto out_free_pgd;
+
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_pmds;
+
+ /*
+ * Make sure that pre-populating the pmds is atomic with
+ * respect to anything walking the pgd_list, so that they
+ * never see a partially populated pgd.
+ */
+ spin_lock(&pgd_lock);
+
+ pgd_ctor(mm, pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ spin_unlock(&pgd_lock);
+
+ return pgd;
+
+out_free_pmds:
+ free_pmds(mm, pmds);
+out_free_pgd:
+ _pgd_free(pgd);
+out:
+ return NULL;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
+ _pgd_free(pgd);
+}
+
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ *ptep = entry;
+ pte_update_defer(vma->vm_mm, address, ptep);
+ }
+
+ return changed;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
+ }
+
+ return changed;
+}
+#endif
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *) &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+ BUG_ON(fixmaps_set > 0);
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
+#endif
+}
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ set_pte_vaddr(address, pte);
+ fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
+ pgprot_t flags)
+{
+ __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ * has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr, uniform;
+
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pud, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr, uniform;
+
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK)) {
+ pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+ __func__, addr, addr + PMD_SIZE);
+ return 0;
+ }
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pmd, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 0000000..75cc097
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,98 @@
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_val(pteval))
+ set_pte_at(&init_mm, vaddr, pte, pteval);
+ else
+ pte_clear(&init_mm, vaddr, pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
+ __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ early_ioremap_init();
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 0000000..e666cbb
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,98 @@
+#include <linux/bootmem.h>
+#include <linux/mmdebug.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+
+#include "physaddr.h"
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+ } else {
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
+ }
+
+ return x;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
+ return false;
+ } else {
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
+ return false;
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#else
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ unsigned long phys_addr = x - PAGE_OFFSET;
+ /* VMALLOC_* aren't constants */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 0000000..a3cd5a0
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,10 @@
+#include <asm/processor.h>
+
+static inline int phys_addr_valid(resource_size_t addr)
+{
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
+}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 0000000..90555bf
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,60 @@
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+
+static int disable_nx;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ }
+ x86_configure_nx();
+ return 0;
+}
+early_param("noexec", noexec_setup);
+
+void x86_configure_nx(void)
+{
+ if (cpu_has_nx && !disable_nx)
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+void __init x86_report_nx(void)
+{
+ if (!cpu_has_nx) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ if (disable_nx) {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "disabled by kernel command line option\n");
+ } else {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "active\n");
+ }
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
+#endif
+ }
+}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 0000000..c2aea63
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,223 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+int acpi_numa __initdata;
+
+static __init int setup_node(int pxm)
+{
+ return acpi_map_pxm_to_node(pxm);
+}
+
+static __init void bad_srat(void)
+{
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return acpi_numa < 0;
+}
+
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+ int i, j;
+
+ for (i = 0; i < slit->locality_count; i++) {
+ const int from_node = pxm_to_node(i);
+
+ if (from_node == NUMA_NO_NODE)
+ continue;
+
+ for (j = 0; j < slit->locality_count; j++) {
+ const int to_node = pxm_to_node(j);
+
+ if (to_node == NUMA_NO_NODE)
+ continue;
+
+ numa_set_distance(from_node, to_node,
+ slit->entry[slit->locality_count * i + j]);
+ }
+ }
+}
+
+/* Callback for Proximity Domain -> x2APIC mapping */
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain;
+ apic_id = pa->apic_id;
+ if (!apic->apic_id_valid(apic_id)) {
+ printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+ pxm, apic_id);
+ return;
+ }
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain_lo;
+ if (acpi_srat_revision >= 2)
+ pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (get_uv_system_type() >= UV_X2APIC)
+ apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+ else
+ apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ acpi_numa = 1;
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline int save_add_info(void) {return 1;}
+#else
+static inline int save_add_info(void) {return 0;}
+#endif
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+int __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
+{
+ u64 start, end;
+ u32 hotpluggable;
+ int node, pxm;
+
+ if (srat_disabled())
+ goto out_err;
+ if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
+ goto out_err_bad_srat;
+ if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+ goto out_err;
+ hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+ if (hotpluggable && !save_add_info())
+ goto out_err;
+
+ start = ma->base_address;
+ end = start + ma->length;
+ pxm = ma->proximity_domain;
+ if (acpi_srat_revision <= 1)
+ pxm &= 0xff;
+
+ node = setup_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+ goto out_err_bad_srat;
+ }
+
+ if (numa_add_memblk(node, start, end) < 0)
+ goto out_err_bad_srat;
+
+ node_set(node, numa_nodes_parsed);
+
+ pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
+ node, pxm,
+ (unsigned long long) start, (unsigned long long) end - 1,
+ hotpluggable ? " hotplug" : "",
+ ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
+
+ /* Mark hotplug range in memblock. */
+ if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+ pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+ (unsigned long long)start, (unsigned long long)end - 1);
+
+ return 0;
+out_err_bad_srat:
+ bad_srat();
+out_err:
+ return -1;
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+int __init x86_acpi_numa_init(void)
+{
+ int ret;
+
+ ret = acpi_numa_init();
+ if (ret < 0)
+ return ret;
+ return srat_disabled() ? -EINVAL : 0;
+}
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 0000000..38868ad
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,140 @@
+/*
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+
+static unsigned long mmio_address;
+module_param(mmio_address, ulong, 0);
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+ "(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+ "(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+ return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+ return i * 212371 + 13;
+}
+
+static void do_write_test(void __iomem *p)
+{
+ unsigned int i;
+ pr_info("write test.\n");
+ mmiotrace_printk("Write test.\n");
+
+ for (i = 0; i < 256; i++)
+ iowrite8(i, p + i);
+
+ for (i = 1024; i < (5 * 1024); i += 2)
+ iowrite16(v16(i), p + i);
+
+ for (i = (5 * 1024); i < (16 * 1024); i += 4)
+ iowrite32(v32(i), p + i);
+}
+
+static void do_read_test(void __iomem *p)
+{
+ unsigned int i;
+ unsigned errs[3] = { 0 };
+ pr_info("read test.\n");
+ mmiotrace_printk("Read test.\n");
+
+ for (i = 0; i < 256; i++)
+ if (ioread8(p + i) != i)
+ ++errs[0];
+
+ for (i = 1024; i < (5 * 1024); i += 2)
+ if (ioread16(p + i) != v16(i))
+ ++errs[1];
+
+ for (i = (5 * 1024); i < (16 * 1024); i += 4)
+ if (ioread32(p + i) != v32(i))
+ ++errs[2];
+
+ mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+ errs[0], errs[1], errs[2]);
+}
+
+static void do_read_far_test(void __iomem *p)
+{
+ pr_info("read far test.\n");
+ mmiotrace_printk("Read far test.\n");
+
+ ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+ void __iomem *p = ioremap_nocache(mmio_address, size);
+ if (!p) {
+ pr_err("could not ioremap, aborting.\n");
+ return;
+ }
+ mmiotrace_printk("ioremap returned %p.\n", p);
+ do_write_test(p);
+ do_read_test(p);
+ if (read_far && read_far < size - 4)
+ do_read_far_test(p);
+ iounmap(p);
+}
+
+/*
+ * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
+ * a short time. We had a bug in deferred freeing procedure which tried
+ * to free this region multiple times (ioremap can reuse the same address
+ * for many mappings).
+ */
+static void do_test_bulk_ioremapping(void)
+{
+ void __iomem *p;
+ int i;
+
+ for (i = 0; i < 10; ++i) {
+ p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ if (p)
+ iounmap(p);
+ }
+
+ /* Force freeing. If it will crash we will know why. */
+ synchronize_rcu();
+}
+
+static int __init init(void)
+{
+ unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
+ if (mmio_address == 0) {
+ pr_err("you have to use the module argument mmio_address.\n");
+ pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+ return -ENXIO;
+ }
+
+ pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
+ do_test(size);
+ do_test_bulk_ioremapping();
+ pr_info("All done.\n");
+ return 0;
+}
+
+static void __exit cleanup(void)
+{
+ pr_debug("unloaded.\n");
+}
+
+module_init(init);
+module_exit(cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 0000000..5a760fd
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,353 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/cache.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+#include <linux/debugfs.h>
+
+/*
+ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (Its not allowed anyway).
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ * More scalable flush, from Andi Kleen
+ *
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
+ */
+
+struct flush_tlb_info {
+ struct mm_struct *flush_mm;
+ unsigned long flush_start;
+ unsigned long flush_end;
+};
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+ struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+ BUG();
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+ load_cr3(swapper_pg_dir);
+ /*
+ * This gets called in the idle path where RCU
+ * functions differently. Tracing normally
+ * uses RCU, so we have to call the tracepoint
+ * specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ }
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
+ * if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
+ * Now cpu0 accepts tlb flushes for the new mm.
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
+ * mm, and in the worst case we perform a superfluous tlb flush.
+ * 1b) thread switch without mm change
+ * cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ * Atomically set the bit [other cpus will start sending flush ipis],
+ * and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ * runs in kernel space, the cpu could load tlb entries for user space
+ * pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush funcation:
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+static void flush_tlb_func(void *info)
+{
+ struct flush_tlb_info *f = info;
+
+ inc_irq_stat(irq_tlb_count);
+
+ if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+ return;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+ if (f->flush_end == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
+ unsigned long addr;
+ unsigned long nr_pages =
+ (f->flush_end - f->flush_start) / PAGE_SIZE;
+ addr = f->flush_start;
+ while (addr < f->flush_end) {
+ __flush_tlb_single(addr);
+ addr += PAGE_SIZE;
+ }
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
+ }
+ } else
+ leave_mm(smp_processor_id());
+
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ struct flush_tlb_info info;
+
+ info.flush_mm = mm;
+ info.flush_start = start;
+ info.flush_end = end;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ if (end == TLB_FLUSH_ALL)
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
+ else
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI,
+ (end - start) >> PAGE_SHIFT);
+
+ if (is_uv_system()) {
+ unsigned int cpu;
+
+ cpu = smp_processor_id();
+ cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+ if (cpumask)
+ smp_call_function_many(cpumask, flush_tlb_func,
+ &info, 1);
+ return;
+ }
+ smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+}
+
+void flush_tlb_current_task(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ preempt_disable();
+
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+ /* This is an implicit full barrier that synchronizes with switch_mm. */
+ local_flush_tlb();
+
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ preempt_enable();
+}
+
+/*
+ * See Documentation/x86/tlb.txt for details. We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead. Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag)
+{
+ unsigned long addr;
+ /* do a global flush by default */
+ unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
+
+ preempt_disable();
+ if (current->active_mm != mm) {
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
+ goto out;
+ }
+
+ if (!current->mm) {
+ leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
+ goto out;
+ }
+
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+
+ /*
+ * Both branches below are implicit full barriers (MOV to CR or
+ * INVLPG) that synchronize with switch_mm.
+ */
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+ base_pages_to_flush = TLB_FLUSH_ALL;
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ } else {
+ /* flush range by one by one 'invlpg' */
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+ __flush_tlb_single(addr);
+ }
+ }
+ trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
+out:
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
+ start = 0UL;
+ end = TLB_FLUSH_ALL;
+ }
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
+ preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ preempt_disable();
+
+ if (current->active_mm == mm) {
+ if (current->mm) {
+ /*
+ * Implicit full barrier (INVLPG) that synchronizes
+ * with switch_mm.
+ */
+ __flush_tlb_one(start);
+ } else {
+ leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+ }
+ }
+
+ if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
+
+ preempt_enable();
+}
+
+static void do_flush_tlb_all(void *info)
+{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ __flush_tlb_all();
+ if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ leave_mm(smp_processor_id());
+}
+
+void flush_tlb_all(void)
+{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void do_kernel_range_flush(void *info)
+{
+ struct flush_tlb_info *f = info;
+ unsigned long addr;
+
+ /* flush range by one by one 'invlpg' */
+ for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+ __flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+
+ /* Balance as user space task's flush, a bit conservative */
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ } else {
+ struct flush_tlb_info info;
+ info.flush_start = start;
+ info.flush_end = end;
+ on_each_cpu(do_kernel_range_flush, &info, 1);
+ }
+}
+
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ int ceiling;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ buf[len] = '\0';
+ if (kstrtoint(buf, 0, &ceiling))
+ return -EINVAL;
+
+ if (ceiling < 0)
+ return -EINVAL;
+
+ tlb_single_page_flush_ceiling = ceiling;
+ return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+ .read = tlbflush_read_file,
+ .write = tlbflush_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlbflush);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);