From: Per E. <pe...@pd...> - 2004-10-11 15:14:20
|
This is a patch that adds a per-process /proc entry that maps virtual
memory address ranges to node numbers. It may be useful to people
debugging NUMA performance anomalies. I thought I'd float it by this
list for comments.

# cat /proc/self/nodemem
0000000000504000-0000000000525fff node 0
0000002a95556000-0000002a95556fff node 0
...
0000002a9576a000-0000002a9576afff node 3
Process running on node 0
Process has 2392064 bytes (584 pages) on node 0
Process has 122880 bytes (30 pages) on node 1
Process has 122880 bytes (30 pages) on node 2
Process has 90112 bytes (22 pages) on node 3
Process has 0 bytes (0 pages) on node 4
Process has 0 bytes (0 pages) on node 5
Process has 0 bytes (0 pages) on node 6
Process has 0 bytes (0 pages) on node 7
Process has 0 bytes (0 pages) not in core

/Per

---
diff -uprN linux-2.6.8.1/fs/proc/Makefile linux-2.6.8.1-patched/fs/proc/Makefile
--- linux-2.6.8.1/fs/proc/Makefile	2004-08-14 12:54:48.000000000 +0200
+++ linux-2.6.8.1-patched/fs/proc/Makefile	2004-10-11 15:46:32.000000000 +0200
@@ -8,7 +8,8 @@ proc-y := task_nommu.o
 proc-$(CONFIG_MMU) := task_mmu.o
 
 proc-y += inode.o root.o base.o generic.o array.o \
 		kmsg.o proc_tty.o proc_misc.o
 
+proc-$(CONFIG_NUMA) += nodemem.o
 proc-$(CONFIG_PROC_KCORE) += kcore.o
 proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
diff -uprN linux-2.6.8.1/fs/proc/base.c linux-2.6.8.1-patched/fs/proc/base.c
--- linux-2.6.8.1/fs/proc/base.c	2004-08-14 12:55:35.000000000 +0200
+++ linux-2.6.8.1-patched/fs/proc/base.c	2004-10-11 15:46:46.000000000 +0200
@@ -48,6 +48,9 @@ enum pid_directory_inos {
 	PROC_TGID_TASK,
 	PROC_TGID_STATUS,
 	PROC_TGID_MEM,
+#ifdef CONFIG_NUMA
+	PROC_TGID_NODEMEM,
+#endif
 	PROC_TGID_CWD,
 	PROC_TGID_ROOT,
 	PROC_TGID_EXE,
@@ -71,6 +74,9 @@ enum pid_directory_inos {
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
+#ifdef CONFIG_NUMA
+	PROC_TID_NODEMEM,
+#endif
 	PROC_TID_CWD,
 	PROC_TID_ROOT,
 	PROC_TID_EXE,
@@ -113,6 +119,9 @@ static struct pid_entry tgid_base_stuff[
 	E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO),
 	E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO),
 	E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_NUMA
+	E(PROC_TGID_NODEMEM, "nodemem", S_IFREG|S_IRUGO),
+#endif
 	E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO),
@@ -135,6 +144,9 @@ static struct pid_entry tid_base_stuff[]
 	E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO),
 	E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO),
 	E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR),
+#ifdef CONFIG_NUMA
+	E(PROC_TID_NODEMEM, "nodemem", S_IFREG|S_IRUGO),
+#endif
 	E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO),
@@ -689,6 +701,28 @@ static struct inode_operations proc_mem_
 	.permission = proc_permission,
 };
 
+#ifdef CONFIG_NUMA
+
+extern int nodemem_show(struct seq_file *s, void *v);
+
+/*
+ * Open the nodemem entry through single_open().  The task is passed as
+ * the data argument, so nodemem_show() finds it in s->private.
+ */
+static int nodemem_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nodemem_show, proc_task(inode));
+}
+
+static struct file_operations proc_nodemem_operations = {
+	.open		= nodemem_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* CONFIG_NUMA */
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1342,6 +1376,12 @@ static struct dentry *proc_pident_lookup
 			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
+#ifdef CONFIG_NUMA
+		case PROC_TID_NODEMEM:
+		case PROC_TGID_NODEMEM:
+			inode->i_fop = &proc_nodemem_operations;
+			break;
+#endif
 		case PROC_TID_MOUNTS:
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
diff -uprN linux-2.6.8.1/fs/proc/nodemem.c linux-2.6.8.1-patched/fs/proc/nodemem.c
--- linux-2.6.8.1/fs/proc/nodemem.c	1970-01-01
01:00:00.000000000 +0100
+++ linux-2.6.8.1-patched/fs/proc/nodemem.c	2004-10-11 16:22:48.863258816 +0200
@@ -0,0 +1,291 @@
+/*
+ * linux/fs/proc/nodemem.c
+ *
+ * Copyright (C) 2004 Per Ekman (pe...@pd...)
+ * Development supported by AMD.
+ *
+ * This file is released under the GPL.
+ *
+ * Per-process proc device to map virtual memory to nodes.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/threads.h>
+#include <linux/topology.h>
+
+/*
+ * The bookkeeping arrays are indexed by NUMA node id, so they are sized
+ * by MAX_NUMNODES rather than NR_CPUS.  One extra slot, NOT_PRESENT,
+ * collects pages that have no valid page frame behind them.  If the
+ * arrays are extended, make sure NOT_PRESENT remains their last index.
+ */
+#define NOT_PRESENT	MAX_NUMNODES
+#define NR_SLOTS	(NOT_PRESENT + 1)
+
+#define RP_ENTRY_SIZE	(2 * sizeof(unsigned long))
+#define RP_MAX_ENTRIES	((PAGE_SIZE / RP_ENTRY_SIZE) - 1)
+
+#define RP_START(p, i)	((p)->v[(i) * 2])
+#define RP_END(p, i)	((p)->v[(i) * 2 + 1])
+
+/*
+ * One page worth of inclusive [start, end] address pairs.  All members
+ * are word sized, so no packing attribute is needed and pointers into
+ * v[] are ordinary aligned pointers.
+ */
+struct range_page {
+	unsigned long count;		/* pairs in use in v[] */
+	unsigned long v[2 * RP_MAX_ENTRIES];
+	struct range_page *next;
+};
+
+/* List of ranges for one node; cache is the pair touched last. */
+struct range_list {
+	struct range_page *rp;
+	unsigned long *cache;
+};
+
+/*
+ * Return 1 if the inclusive ranges [a, b] and [x, y] overlap or are
+ * directly adjacent, 0 otherwise.  Written so that neither x - 1 nor
+ * the successor of y can wrap around.
+ */
+static inline int mergeable_range(unsigned long a, unsigned long b,
+				  unsigned long x, unsigned long y)
+{
+	if (a < x)
+		return b >= x - 1;	/* x > a, hence x >= 1 */
+
+	return a == 0 || a - 1 <= y;
+}
+
+/* Widen the pair v[0]..v[1] so that it also covers [start, end]. */
+static inline void join_pair(unsigned long *v, unsigned long start,
+			     unsigned long end)
+{
+	if (start < v[0])
+		v[0] = start;
+	if (end > v[1])
+		v[1] = end;
+}
+
+/*
+ * Allocate a zeroed range_page and, if rp is non-NULL, link it in
+ * after rp.  Returns the new page, or NULL if the allocation failed.
+ */
+static struct range_page *grow_rp(struct range_page *rp)
+{
+	struct range_page *p;
+
+	p = (struct range_page *)get_zeroed_page(GFP_KERNEL);
+	if (p && rp)
+		rp->next = p;
+
+	return p;
+}
+
+/* Free every page of the list; the range_list itself is not freed. */
+static void free_rl(struct range_list *rlp)
+{
+	struct range_page *p, *rp = rlp->rp;
+
+	while (rp) {
+		p = rp;
+		rp = rp->next;
+		free_page((unsigned long)p);
+	}
+}
+
+/*
+ * Record the inclusive range [start, end] in the list, merging it into
+ * an existing range when the two overlap or touch.  Returns 0 on
+ * success or -ENOMEM if a new page was needed and could not be had.
+ */
+static int insert_rl(struct range_list *rlp, unsigned long start,
+		     unsigned long end)
+{
+	struct range_page *rp, *tail = NULL;
+	unsigned long i;
+
+	/* Fast path: try the range that was touched last. */
+	if (rlp->cache &&
+	    mergeable_range(start, end, rlp->cache[0], rlp->cache[1])) {
+		join_pair(rlp->cache, start, end);
+		return 0;
+	}
+
+	for (rp = rlp->rp; rp; rp = rp->next) {
+		tail = rp;
+		for (i = 0; i < rp->count; i++) {
+			if (!mergeable_range(start, end, RP_START(rp, i),
+					     RP_END(rp, i)))
+				continue;
+			join_pair(&RP_START(rp, i), start, end);
+			rlp->cache = &RP_START(rp, i);
+			return 0;
+		}
+	}
+
+	/* No hit: append, growing the list if the last page is full. */
+	if (!tail || tail->count == RP_MAX_ENTRIES) {
+		tail = grow_rp(tail);
+		if (!tail)
+			return -ENOMEM;
+		if (!rlp->rp)
+			rlp->rp = tail;
+	}
+	RP_START(tail, tail->count) = start;
+	RP_END(tail, tail->count) = end;
+	rlp->cache = &RP_START(tail, tail->count);
+	tail->count++;
+
+	return 0;
+}
+
+/* Return the id of the node that holds the given page. */
+static inline int page2node(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	return zone->zone_pgdat->node_id;
+}
+
+/*
+ * Return the node of the page frame mapped at addr in mm, or
+ * NOT_PRESENT if no valid page frame is mapped there.  The caller
+ * holds mm->mmap_sem; page_table_lock is taken here so that the
+ * entries are not modified while they are being read.
+ */
+static int addr2node(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	unsigned long pfn;
+	int node = NOT_PRESENT;
+
+	spin_lock(&mm->page_table_lock);
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd) || pgd_bad(*pgd))
+		goto out;
+	pmd = pmd_offset(pgd, addr);
+	if (pmd_none(*pmd) || pmd_bad(*pmd))
+		goto out;
+	ptep = pte_offset_map(pmd, addr);
+	if (!ptep)
+		goto out;
+	pte = *ptep;
+	pte_unmap(ptep);
+	/* Empty or swapped-out entries do not name a page frame. */
+	if (!pte_present(pte))
+		goto out;
+	pfn = pte_pfn(pte);
+	if (pfn_valid(pfn))
+		node = page2node(pfn_to_page(pfn));
+out:
+	spin_unlock(&mm->page_table_lock);
+
+	return node;
+}
+
+/*
+ * seq_file show routine for the per-process nodemem entry.  s->private
+ * is the task to report on.  Prints one line per address range and
+ * node, then the per-node page totals.  Returns 0 on success or
+ * -ENOMEM.
+ */
+int nodemem_show(struct seq_file *s, void *v)
+{
+	struct task_struct *task = s->private;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct range_list *rlp;
+	struct range_page *rp;
+	unsigned long *node_pgc;
+	unsigned long pga, i;
+	int n, ret = 0;
+
+	/* Both arrays grow with the node count; keep them off the stack. */
+	rlp = kmalloc(NR_SLOTS * sizeof(*rlp), GFP_KERNEL);
+	node_pgc = kmalloc(NR_SLOTS * sizeof(*node_pgc), GFP_KERNEL);
+	if (!rlp || !node_pgc) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	memset(rlp, 0, NR_SLOTS * sizeof(*rlp));
+	memset(node_pgc, 0, NR_SLOTS * sizeof(*node_pgc));
+
+	/*
+	 * Pin the task's own mm instead of using active_mm.  A task
+	 * without an mm gets the totals only, all of them zero.
+	 */
+	mm = get_task_mm(task);
+	if (mm) {
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+			for (pga = PAGE_ALIGN(vma->vm_start);
+			     pga < vma->vm_end; pga += PAGE_SIZE) {
+				n = addr2node(mm, pga);
+				node_pgc[n]++;
+				ret = insert_rl(&rlp[n], pga,
+						pga + PAGE_SIZE - 1);
+				if (ret)
+					break;
+			}
+		}
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+	if (ret)
+		goto out_free_ranges;
+
+	for (n = 0; n < NR_SLOTS; n++) {
+		for (rp = rlp[n].rp; rp; rp = rp->next) {
+			for (i = 0; i < rp->count; i++) {
+				if (n == NOT_PRESENT)
+					seq_printf(s,
+						   "%016lx-%016lx not "
+						   "present\n",
+						   RP_START(rp, i),
+						   RP_END(rp, i));
+				else
+					seq_printf(s,
+						   "%016lx-%016lx node %d\n",
+						   RP_START(rp, i),
+						   RP_END(rp, i), n);
+			}
+		}
+	}
+	/* The task records a CPU number; report that CPU's node. */
+	seq_printf(s, "Process running on node %d\n",
+		   cpu_to_node(task_cpu(task)));
+	for (n = 0; n < NR_SLOTS; n++) {
+		if (n == NOT_PRESENT)
+			seq_printf(s, "Process has %lu bytes (%lu pages) "
+				   "not in core\n",
+				   node_pgc[n] * PAGE_SIZE, node_pgc[n]);
+		else
+			seq_printf(s, "Process has %lu bytes (%lu pages)"
+				   " on node %d\n",
+				   node_pgc[n] * PAGE_SIZE, node_pgc[n], n);
+	}
+
+out_free_ranges:
+	for (n = 0; n < NR_SLOTS; n++)
+		free_rl(&rlp[n]);
+out_free:
+	kfree(node_pgc);
+	kfree(rlp);
+
+	return ret;
+}
|