From: Shailabh N. <na...@wa...> - 2006-01-03 23:29:07
|
Changes since 12/7/05 - removed __attribute((unused)) qualifiers from timespec vars (dave hansen) - use existing timestamping function do_posix_clock_monotonic_gettime() (jay lan) Changes since 11/14/05 - use nanosecond resolution, adjusted wall clock time for timestamps instead of sched_clock (akpm, andi, marcelo) - collect stats only if delay accounting enabled (parag) - stats collected for delays in all userspace-initiated block I/O including fsync/fdatasync but not counting waits for async block io events. 11/14/05: First post delayacct-blkio.patch Record time spent by a task waiting for completion of userspace initiated synchronous block I/O. This can help determine the right I/O priority for the task. Signed-off-by: Shailabh Nagar <na...@wa...> fs/buffer.c | 6 ++++++ fs/read_write.c | 10 +++++++++- include/linux/delayacct.h | 4 ++++ include/linux/sched.h | 2 ++ kernel/delayacct.c | 16 ++++++++++++++++ mm/filemap.c | 10 +++++++++- mm/memory.c | 17 +++++++++++++++-- 7 files changed, 61 insertions(+), 4 deletions(-) Index: linux-2.6.15-rc7/include/linux/sched.h =================================================================== --- linux-2.6.15-rc7.orig/include/linux/sched.h +++ linux-2.6.15-rc7/include/linux/sched.h @@ -546,6 +546,8 @@ struct task_delay_info { spinlock_t lock; /* Add stats in pairs: uint64_t delay, uint32_t count */ + uint64_t blkio_delay; /* wait for sync block io completion */ + uint32_t blkio_count; }; #endif Index: linux-2.6.15-rc7/fs/read_write.c =================================================================== --- linux-2.6.15-rc7.orig/fs/read_write.c +++ linux-2.6.15-rc7/fs/read_write.c @@ -14,6 +14,8 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/delayacct.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -224,8 +226,14 @@ ssize_t do_sync_read(struct file *filp, (ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos))) 
wait_on_retry_sync_kiocb(&kiocb); - if (-EIOCBQUEUED == ret) + if (-EIOCBQUEUED == ret) { + struct timespec start, end; + + do_posix_clock_monotonic_gettime(&start); ret = wait_on_sync_kiocb(&kiocb); + do_posix_clock_monotonic_gettime(&end); + delayacct_blkio(&start, &end); + } *ppos = kiocb.ki_pos; return ret; } Index: linux-2.6.15-rc7/mm/filemap.c =================================================================== --- linux-2.6.15-rc7.orig/mm/filemap.c +++ linux-2.6.15-rc7/mm/filemap.c @@ -28,6 +28,8 @@ #include <linux/blkdev.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/delayacct.h> #include "filemap.h" /* * FIXME: remove all knowledge of the buffer layer from the core VM @@ -1062,8 +1064,14 @@ generic_file_read(struct file *filp, cha init_sync_kiocb(&kiocb, filp); ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); - if (-EIOCBQUEUED == ret) + if (-EIOCBQUEUED == ret) { + struct timespec start, end; + + do_posix_clock_monotonic_gettime(&start); ret = wait_on_sync_kiocb(&kiocb); + do_posix_clock_monotonic_gettime(&end); + delayacct_blkio(&start, &end); + } return ret; } Index: linux-2.6.15-rc7/mm/memory.c =================================================================== --- linux-2.6.15-rc7.orig/mm/memory.c +++ linux-2.6.15-rc7/mm/memory.c @@ -48,6 +48,8 @@ #include <linux/rmap.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/time.h> +#include <linux/delayacct.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -2170,11 +2172,22 @@ static inline int handle_pte_fault(struc old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { + int ret; + struct timespec start, end; + if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, address, pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + + if (vma->vm_file) + do_posix_clock_monotonic_gettime(&start); + ret = do_no_page(mm, vma, address, + pte, pmd, 
write_access); + if (vma->vm_file) { + do_posix_clock_monotonic_gettime(&end); + delayacct_blkio(&start, &end); + } + return ret; } if (pte_file(entry)) return do_file_page(mm, vma, address, Index: linux-2.6.15-rc7/fs/buffer.c =================================================================== --- linux-2.6.15-rc7.orig/fs/buffer.c +++ linux-2.6.15-rc7/fs/buffer.c @@ -41,6 +41,8 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/time.h> +#include <linux/delayacct.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void invalidate_bh_lrus(void); @@ -337,6 +339,7 @@ static long do_fsync(unsigned int fd, in struct file * file; struct address_space *mapping; int ret, err; + struct timespec start, end; ret = -EBADF; file = fget(fd); @@ -349,6 +352,7 @@ static long do_fsync(unsigned int fd, in goto out_putf; } + do_posix_clock_monotonic_gettime(&start); mapping = file->f_mapping; current->flags |= PF_SYNCWRITE; @@ -371,6 +375,8 @@ static long do_fsync(unsigned int fd, in out_putf: fput(file); out: + do_posix_clock_monotonic_gettime(&end); + delayacct_blkio(&start, &end); return ret; } Index: linux-2.6.15-rc7/include/linux/delayacct.h =================================================================== --- linux-2.6.15-rc7.orig/include/linux/delayacct.h +++ linux-2.6.15-rc7/include/linux/delayacct.h @@ -19,8 +19,12 @@ #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ extern void delayacct_tsk_init(struct task_struct *tsk); +extern void delayacct_blkio(struct timespec *start, struct timespec *end); #else static inline void delayacct_tsk_init(struct task_struct *tsk) {} +static inline void delayacct_blkio(struct timespec *start, struct timespec *end) +{} + #endif /* CONFIG_TASK_DELAY_ACCT */ #endif /* _LINUX_TASKDELAYS_H */ Index: linux-2.6.15-rc7/kernel/delayacct.c =================================================================== --- 
linux-2.6.15-rc7.orig/kernel/delayacct.c +++ linux-2.6.15-rc7/kernel/delayacct.c @@ -12,6 +12,7 @@ */ #include <linux/sched.h> +#include <linux/time.h> int delayacct_on=1; /* Delay accounting turned on/off */ @@ -34,3 +35,18 @@ static int __init delayacct_init(void) return 0; } core_initcall(delayacct_init); + +inline void delayacct_blkio(struct timespec *start, struct timespec *end) +{ + unsigned long long delay; + + if (!delayacct_on) + return; + + delay = timespec_diff_ns(start, end); + + spin_lock(&current->delays.lock); + current->delays.blkio_delay += delay; + current->delays.blkio_count++; + spin_unlock(&current->delays.lock); +} |
From: Shailabh N. <na...@wa...> - 2006-01-03 23:30:40
|
Changes since 12/7/05 - removed __attribute((unused)) qualifiers from timespec vars (dave hansen) - use existing timestamping function do_posix_clock_monotonic_gettime() (jay lan) Changes since 11/14/05 - use nanosecond resolution, adjusted wall clock time for timestamps instead of sched_clock (akpm, andi, marcelo) - collect stats only if delay accounting enabled (parag) - collect delays for only swapin page faults instead of all page faults. 11/14/05: First post delayacct-swapin.patch Record time spent by a task waiting for its pages to be swapped in. This statistic can help in adjusting the rss limits of tasks (process), especially relative to each other, when the system is under memory pressure. Signed-off-by: Shailabh Nagar <na...@wa...> include/linux/delayacct.h | 3 +++ include/linux/sched.h | 2 ++ kernel/delayacct.c | 15 +++++++++++++++ mm/memory.c | 16 +++++++++------- 4 files changed, 29 insertions(+), 7 deletions(-) Index: linux-2.6.15-rc7/include/linux/delayacct.h =================================================================== --- linux-2.6.15-rc7.orig/include/linux/delayacct.h +++ linux-2.6.15-rc7/include/linux/delayacct.h @@ -20,11 +20,14 @@ extern int delayacct_on; /* Delay accounting turned on/off */ extern void delayacct_tsk_init(struct task_struct *tsk); extern void delayacct_blkio(struct timespec *start, struct timespec *end); +extern void delayacct_swapin(struct timespec *start, struct timespec *end); #else static inline void delayacct_tsk_init(struct task_struct *tsk) {} static inline void delayacct_blkio(struct timespec *start, struct timespec *end) {} +static inline void delayacct_swapin(struct timespec *start, struct timespec *end) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif /* _LINUX_TASKDELAYS_H */ Index: linux-2.6.15-rc7/include/linux/sched.h =================================================================== --- linux-2.6.15-rc7.orig/include/linux/sched.h +++ linux-2.6.15-rc7/include/linux/sched.h @@ -548,6 +548,8 @@ struct 
task_delay_info { /* Add stats in pairs: uint64_t delay, uint32_t count */ uint64_t blkio_delay; /* wait for sync block io completion */ uint32_t blkio_count; + uint64_t swapin_delay; /* wait for pages to be swapped in */ + uint32_t swapin_count; }; #endif Index: linux-2.6.15-rc7/mm/memory.c =================================================================== --- linux-2.6.15-rc7.orig/mm/memory.c +++ linux-2.6.15-rc7/mm/memory.c @@ -2171,16 +2171,15 @@ static inline int handle_pte_fault(struc old_entry = entry = *pte; if (!pte_present(entry)) { - if (pte_none(entry)) { - int ret; - struct timespec start, end; + int ret; + struct timespec start, end; + do_posix_clock_monotonic_gettime(&start); + if (pte_none(entry)) { if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, address, pte, pmd, write_access); - if (vma->vm_file) - do_posix_clock_monotonic_gettime(&start); ret = do_no_page(mm, vma, address, pte, pmd, write_access); if (vma->vm_file) { @@ -2192,8 +2191,11 @@ static inline int handle_pte_fault(struc if (pte_file(entry)) return do_file_page(mm, vma, address, pte, pmd, write_access, entry); - return do_swap_page(mm, vma, address, - pte, pmd, write_access, entry); + ret = do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); + do_posix_clock_monotonic_gettime(&end); + delayacct_swapin(&start, &end); + return ret; } ptl = pte_lockptr(mm, pmd); Index: linux-2.6.15-rc7/kernel/delayacct.c =================================================================== --- linux-2.6.15-rc7.orig/kernel/delayacct.c +++ linux-2.6.15-rc7/kernel/delayacct.c @@ -50,3 +50,18 @@ inline void delayacct_blkio(struct times current->delays.blkio_count++; spin_unlock(&current->delays.lock); } + +inline void delayacct_swapin(struct timespec *start, struct timespec *end) +{ + unsigned long long delay; + + if (!delayacct_on) + return; + + delay = timespec_diff_ns(start, end); + + spin_lock(&current->delays.lock); + current->delays.swapin_delay += delay; + 
current->delays.swapin_count++; + spin_unlock(&current->delays.lock); +} |
From: Shailabh N. <na...@wa...> - 2006-01-03 23:34:09
|
Changes since 11/14/05: - explicit versioning of statistics data returned - new command type for returning per-tgid stats - for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) First post 11/14/05 delayacct-connector.patch Creates a connector interface for getting delay and cpu statistics of tasks during their lifetime and when they exit. The cpu stats are available only if CONFIG_SCHEDSTATS is enabled. When a task is alive, userspace can get its stats by sending a command containing its pid. Sending a tgid returns the sum of stats of the tasks belonging to that tgid (where such a sum makes sense). Together, the command interface allows stats for a large number of tasks to be collected more efficiently than would be possible through /proc or any per-pid interface. The connector interface also sends the stats for each task to userspace when the task is exiting. This permits fine-grain accounting for short-lived tasks, which is important if userspace is doing its own aggregation of statistics based on some grouping of tasks (e.g. CSA jobs, ELSA banks or CKRM classes). While this patch is included as part of the delay stats collection patches, it is intended for general use by any kernel component that needs per-pid/tgid stats sent at task exit. This limits the increase in connectors called on task exit, atleast for stats collection purposes. 
Signed-off-by: Shailabh Nagar <na...@wa...> drivers/connector/Kconfig | 9 + drivers/connector/Makefile | 1 drivers/connector/cn_stats.c | 243 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cn_stats.h | 115 ++++++++++++++++++++ include/linux/connector.h | 4 kernel/exit.c | 2 6 files changed, 374 insertions(+) Index: linux-2.6.15-rc7/drivers/connector/Kconfig =================================================================== --- linux-2.6.15-rc7.orig/drivers/connector/Kconfig +++ linux-2.6.15-rc7/drivers/connector/Kconfig @@ -18,4 +18,13 @@ config PROC_EVENTS Provide a connector that reports process events to userspace. Send events such as fork, exec, id change (uid, gid, suid, etc), and exit. +config STATS_CONNECTOR + bool "Report per-task statistics to userspace" + depends on CONNECTOR=y && TASK_DELAY_ACCT + ---help--- + Provide a connector interface that reports per-task statistics to + userspace. While a task is running, userspace can get the stats by + sending a command to the connector. At task exit, the final value of + the stats is sent automatically. + endmenu Index: linux-2.6.15-rc7/drivers/connector/Makefile =================================================================== --- linux-2.6.15-rc7.orig/drivers/connector/Makefile +++ linux-2.6.15-rc7/drivers/connector/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_CONNECTOR) += cn.o +obj-$(CONFIG_STATS_CONNECTOR) += cn_stats.o obj-$(CONFIG_PROC_EVENTS) += cn_proc.o cn-y += cn_queue.o connector.o Index: linux-2.6.15-rc7/include/linux/cn_stats.h =================================================================== --- /dev/null +++ linux-2.6.15-rc7/include/linux/cn_stats.h @@ -0,0 +1,115 @@ +/* + * cn_stats.h - Task statistics connector + * + * Copyright (C) Shailabh Nagar, IBM Corp. 
2005 + * Based on work by Matt Helsley, Nguyen Anh Quynh and Guillaume Thouvenin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef CN_STATS_H +#define CN_STATS_H + +#include <linux/types.h> +#include <linux/connector.h> + + +/* Format for per-task data returned to userland when + * - a task exits + * - listener requests stats for all tasks + * + * The struct is versioned. Newer versions should only add fields to + * the bottom of the struct to maintain backward compatibility. + * + * To create the next version, bump up the version count macro + * and delineate the start of newly added fields with a comment indicating the + * version number. + */ +#define CNSTATS_DATA_VERSION 1 + +struct cnstats_data { + + /* Version 1 */ + pid_t pid; + pid_t tgid; + + /* *_delay_total is cumulative delay (in nanosecs) of a + * task waiting for cpu to be available, block io + * completion, page fault to be serviced etc. + * *_count is number of delay intervals recorded. + * cpu_run_total is cumulative cpu run time, measured by timestamp + * intervals and hence more accurate than sampling-based cpu times. + * All *_total values are in nanoseconds though cpu data may not be + * collected at that granularity. 
+ */ + + __u64 cpu_count; +#define CNSTATS_NOCPUSTATS 1 + __u64 cpu_run_total; + __u64 cpu_delay_total; + + __u64 blkio_count; + __u64 blkio_delay_total; + __u64 swapin_count; + __u64 swapin_delay_total; + +}; + + +/* + * Commands sent from userspace + */ + +struct cnstats_cmd { + enum intype { + CNSTATS_CMD_LISTEN = 1, /* Start listening on connector */ + CNSTATS_CMD_IGNORE, /* Stop listening */ + CNSTATS_CMD_STATS_PID, /* Stats for a pid */ + CNSTATS_CMD_STATS_TGID, /* Stats for a tgid */ + } intype; + + union { + pid_t pid; + pid_t tgid; + } param; +}; + +/* + * Response or data sent from kernel + * Versioned for backward compatibility + */ + +struct cnstats { + __u32 version; + enum outtype { + CNSTATS_DATA_NONE = 1, /* Control cmd response */ + CNSTATS_DATA_EXIT, /* Exiting task's stats */ + CNSTATS_DATA_PID, /* per-pid data cmd response*/ + CNSTATS_DATA_TGID, /* per-tgid data cmd response*/ + } outtype; + union { + struct cnstats_ack { + __u32 err; + } ack; + + struct cnstats_data stats; + } data; +}; + +#ifdef __KERNEL__ +#ifdef CONFIG_STATS_CONNECTOR +void cnstats_exit_connector(struct task_struct *tsk); +#else +static inline void cnstats_exit_connector(struct task_struct *tsk) +{} +#endif /* CONFIG_STATS_CONNECTOR */ +#endif /* __KERNEL__ */ +#endif /* CN_PROC_H */ Index: linux-2.6.15-rc7/include/linux/connector.h =================================================================== --- linux-2.6.15-rc7.orig/include/linux/connector.h +++ linux-2.6.15-rc7/include/linux/connector.h @@ -35,6 +35,10 @@ #define CN_IDX_CIFS 0x2 #define CN_VAL_CIFS 0x1 +/* Statistics connector ids */ +#define CN_IDX_STATS 0x2 +#define CN_VAL_STATS 0x2 + #define CN_NETLINK_USERS 1 /* Index: linux-2.6.15-rc7/drivers/connector/cn_stats.c =================================================================== --- /dev/null +++ linux-2.6.15-rc7/drivers/connector/cn_stats.c @@ -0,0 +1,243 @@ +/* + * cn_stats.c - Task statistics connector + * + * Copyright (C) Shailabh Nagar, IBM Corp. 
2005 + * Based on work by Matt Helsley, Nguyen Anh Quynh and Guillaume Thouvenin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/cn_stats.h> +#include <asm/atomic.h> + +#define CN_STATS_NOCPU (-1) +#define CN_STATS_NOACK 0 +#define CN_STATS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct cnstats)) + +static atomic_t cnstats_num_listeners = ATOMIC_INIT(0); +static struct cb_id cnstats_id = { CN_IDX_STATS, CN_VAL_STATS }; +/* cnstats_counts is used as the sequence number of the netlink message */ +static DEFINE_PER_CPU(__u32, cnstats_counts) = { 0 }; + +void cnstats_init_msg(struct cn_msg *msg, int seq, int ack) +{ + memset(msg, 0, CN_STATS_MSG_SIZE); + msg->seq = seq; + msg->ack = ack; + memcpy(&msg->id, &cnstats_id, sizeof(msg->id)); + msg->len = sizeof(struct cnstats); +} + +/* + * Accumalate one task's statistics + * + */ +static inline void cnstats_add_tsk_data(struct cnstats_data *d, struct task_struct *tsk) +{ + uint64_t tmp; + + d->pid = tsk->pid; + d->tgid = tsk->tgid; + + /* zero XXX_total,non-zero XXX_count implies XXX stat overflowed */ +#ifdef CONFIG_SCHEDSTATS + d->cpu_count += tsk->sched_info.pcnt; + tmp = d->cpu_run_total + jiffies_to_usecs(tsk->sched_info.cpu_time)*1000; + d->cpu_run_total = (tmp < d->cpu_run_total)? 0:tmp; + tmp = d->cpu_delay_total + jiffies_to_usecs(tsk->sched_info.run_delay)*1000; + d->cpu_delay_total = (tmp < d->cpu_delay_total)? 
0:tmp; +#else + /* Non-zero XXX_total,zero XXX_count implies XXX stat unavailable */ + d->cpu_count = 0; + d->cpu_run_total = d->cpu_delay_total = CNSTATS_NOCPUSTATS; +#endif + spin_lock(&tsk->delays.lock); + tmp = d->blkio_delay_total + tsk->delays.blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0:tmp; + tmp = d->swapin_delay_total + tsk->delays.swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0:tmp; + d->blkio_count += tsk->delays.blkio_count; + d->swapin_count += tsk->delays.swapin_count; + spin_unlock(&tsk->delays.lock); +} + +/* + * Send acknowledgement (error) + * + */ +static void cnstats_send_ack(int err, int rcvd_seq, int rcvd_ack) +{ + __u8 buffer[CN_STATS_MSG_SIZE]; + struct cn_msg *msg = (struct cn_msg *)buffer; + struct cnstats *c = (struct cnstats *)msg->data; + + cnstats_init_msg(msg, rcvd_seq, rcvd_ack+1); + c->version = CNSTATS_DATA_VERSION; + c->outtype = CNSTATS_DATA_NONE; + + /* Following allows other functions to continue returning -ve errors */ + c->data.ack.err = abs(err); + + cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); +} + +/* + * Send one pid's stats + * + */ +static int cnstats_send_pid(pid_t pid, int seq, int ack) +{ + __u8 buffer[CN_STATS_MSG_SIZE]; + struct cn_msg *msg = (struct cn_msg *)buffer; + struct cnstats *c = (struct cnstats *)msg->data; + struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; + struct task_struct *tsk; + + cnstats_init_msg(msg, seq, ack); + c->version = CNSTATS_DATA_VERSION; + c->outtype = CNSTATS_DATA_PID; + + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + cnstats_add_tsk_data(d, tsk); + put_task_struct(tsk); + + return cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); +} + +/* + * Send one tgid's stats (sum of appropriate per-pid stats) + * + */ +static int cnstats_send_tgid(pid_t tgid, int seq, int ack) +{ + __u8 
buffer[CN_STATS_MSG_SIZE]; + struct cn_msg *msg = (struct cn_msg *)buffer;; + struct cnstats *c = (struct cnstats *)msg->data; + struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; + struct task_struct *first, *tsk; + + cnstats_init_msg(msg, seq, ack); + c->version = CNSTATS_DATA_VERSION; + c->outtype = CNSTATS_DATA_TGID; + + read_lock(&tasklist_lock); + first = tsk = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + do + cnstats_add_tsk_data(d, tsk); + while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + + d->pid = -1; + d->tgid = tgid; + + return cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); +} + +/*** + * cnstats_ctl - handle command sent via CN_IDX_STATS connector + * @data: command + */ +static void cnstats_ctl(void *data) +{ + struct cn_msg *msg = data; + struct cnstats_cmd *cmd; + int err = 0; + + if (msg->len != sizeof(*cmd)) + return; + + cmd = (struct cnstats_cmd *)msg->data; + switch (cmd->intype) { + case CNSTATS_CMD_LISTEN: + atomic_inc(&cnstats_num_listeners); + break; + + case CNSTATS_CMD_IGNORE: + atomic_dec(&cnstats_num_listeners); + break; + + case CNSTATS_CMD_STATS_PID: + if (atomic_read(&cnstats_num_listeners) < 1) + return; + err = cnstats_send_pid(cmd->param.pid, msg->seq, msg->ack+1); + if (!err) + return; + break; + + case CNSTATS_CMD_STATS_TGID: + if (atomic_read(&cnstats_num_listeners) < 1) + return; + err = cnstats_send_tgid(cmd->param.pid, msg->seq, msg->ack+1); + if (!err) + return; + break; + + default: + err = -EINVAL; + break; + } + cnstats_send_ack(err, msg->seq, msg->ack); +} + +/*** + * cnstats_exit_connector - send task's statistics to userspace when it exits + * @tsk: exiting task + */ +void cnstats_exit_connector(struct task_struct *tsk) +{ + int seq; + __u8 buffer[CN_STATS_MSG_SIZE]; + struct cn_msg *msg = (struct cn_msg *)buffer; + struct cnstats *c = (struct cnstats *)msg->data; + struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; + + if 
(atomic_read(&cnstats_num_listeners) < 1) + return; + + seq = get_cpu_var(cnstats_counts)++; + put_cpu_var(cnstats_counts); + + cnstats_init_msg(msg, seq, CN_STATS_NOACK); + c->version = CNSTATS_DATA_VERSION; + c->outtype = CNSTATS_DATA_EXIT; + cnstats_add_tsk_data(d, tsk); + + cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); +} + +static int __init cnstats_init(void) +{ + int err; + + if ((err = cn_add_callback(&cnstats_id, "cn_stats", &cnstats_ctl))) { + printk(KERN_WARNING "cn_stats failed to register\n"); + return err; + } + return 0; +} + +module_init(cnstats_init); Index: linux-2.6.15-rc7/kernel/exit.c =================================================================== --- linux-2.6.15-rc7.orig/kernel/exit.c +++ linux-2.6.15-rc7/kernel/exit.c @@ -29,6 +29,7 @@ #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/cn_proc.h> +#include <linux/cn_stats.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -865,6 +866,7 @@ fastcall NORET_TYPE void do_exit(long co tsk->exit_code = code; proc_exit_connector(tsk); + cnstats_exit_connector(tsk); exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); |
From: Greg KH <gr...@kr...> - 2006-01-04 00:21:40
|
On Tue, Jan 03, 2006 at 11:33:40PM +0000, Shailabh Nagar wrote: > Changes since 11/14/05: > > - explicit versioning of statistics data returned > - new command type for returning per-tgid stats > - for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) > > First post 11/14/05 > > delayacct-connector.patch > > Creates a connector interface for getting delay and cpu statistics of tasks > during their lifetime and when they exit. The cpu stats are available only if > CONFIG_SCHEDSTATS is enabled. Why do you use this when we can send typesafe data through netlink messages now? Because of that, I think the whole connector code can be deleted :) Unless I missed something in the connector code that is not present in netlink... thanks, greg k-h |
From: Shailabh N. <na...@wa...> - 2006-01-04 00:43:06
|
Greg KH wrote: > On Tue, Jan 03, 2006 at 11:33:40PM +0000, Shailabh Nagar wrote: > >>Changes since 11/14/05: >> >>- explicit versioning of statistics data returned >>- new command type for returning per-tgid stats >>- for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) >> >>First post 11/14/05 >> >>delayacct-connector.patch >> >>Creates a connector interface for getting delay and cpu statistics of tasks >>during their lifetime and when they exit. The cpu stats are available only if >>CONFIG_SCHEDSTATS is enabled. > > > Why do you use this when we can send typesafe data through netlink > messages now? AFAIK, adding new netlink types was frowned upon which is one of the reasons why connectors were proposed (besides making it easier to use the netlink interface) ? > Because of that, I think the whole connector code can be > deleted :) > > Unless I missed something in the connector code that is not present in > netlink... > > thanks, > > greg k-h > |
From: Greg KH <gr...@kr...> - 2006-01-04 00:51:51
|
On Wed, Jan 04, 2006 at 12:42:36AM +0000, Shailabh Nagar wrote: > Greg KH wrote: > > On Tue, Jan 03, 2006 at 11:33:40PM +0000, Shailabh Nagar wrote: > > > >>Changes since 11/14/05: > >> > >>- explicit versioning of statistics data returned > >>- new command type for returning per-tgid stats > >>- for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) > >> > >>First post 11/14/05 > >> > >>delayacct-connector.patch > >> > >>Creates a connector interface for getting delay and cpu statistics of tasks > >>during their lifetime and when they exit. The cpu stats are available only if > >>CONFIG_SCHEDSTATS is enabled. > > > > > > Why do you use this when we can send typesafe data through netlink > > messages now? > > AFAIK, adding new netlink types was frowned upon which is one of the reasons why > connectors were proposed (besides making it easier to use the netlink interface) ? I don't know about the issue of creating new types (have you tried?), but there is a new netlink message format that pretty should make it just as easy as the connector stuff to send complex message types. thanks, greg k-h |
From: Shailabh N. <na...@wa...> - 2006-01-04 07:49:42
|
Greg KH wrote: >On Wed, Jan 04, 2006 at 12:42:36AM +0000, Shailabh Nagar wrote: > > >>Greg KH wrote: >> >> >>>On Tue, Jan 03, 2006 at 11:33:40PM +0000, Shailabh Nagar wrote: >>> >>> >>> >>>>Changes since 11/14/05: >>>> >>>>- explicit versioning of statistics data returned >>>>- new command type for returning per-tgid stats >>>>- for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) >>>> >>>>First post 11/14/05 >>>> >>>>delayacct-connector.patch >>>> >>>>Creates a connector interface for getting delay and cpu statistics of tasks >>>>during their lifetime and when they exit. The cpu stats are available only if >>>>CONFIG_SCHEDSTATS is enabled. >>>> >>>> >>>Why do you use this when we can send typesafe data through netlink >>>messages now? >>> >>> >>AFAIK, adding new netlink types was frowned upon which is one of the reasons why >>connectors were proposed (besides making it easier to use the netlink interface) ? >> >> > >I don't know about the issue of creating new types (have you tried?), >but there is a new netlink message format that pretty should make it >just as easy as the connector stuff to send complex message types. > > Thanks - just saw the genetlink patches/interface which seems to handle the problem of creating new netlink types as well. But why do I get the feeling that the delay accounting patches are becoming a pawn in deprecating connector usage ? :-) I'd rather not run off and implement over genetlink (which no one else seems to be using right now) unless there's some consensus that connectors are to be deprecated. Andrew - any opinions ? -- Shailabh >thanks, > >greg k-h > > >------------------------------------------------------- >This SF.net email is sponsored by: Splunk Inc. Do you grep through log files >for problems? Stop! Download the new AJAX search engine that makes >searching your log files as easy as surfing the web. DOWNLOAD SPLUNK! 
>http://ads.osdn.com/?ad_id=7637&alloc_id=16865&op=click >_______________________________________________ >Lse-tech mailing list >Lse...@li... >https://lists.sourceforge.net/lists/listinfo/lse-tech > > > |
From: Jay L. <jl...@en...> - 2006-01-04 19:05:02
|
Shailabh Nagar wrote: >Changes since 11/14/05: > >- explicit versioning of statistics data returned >- new command type for returning per-tgid stats >- for cpu running time, use tsk->sched_info.cpu_time (collected by schedstats) > >First post 11/14/05 > >delayacct-connector.patch > >Creates a connector interface for getting delay and cpu statistics of tasks >during their lifetime and when they exit. The cpu stats are available only if >CONFIG_SCHEDSTATS is enabled. > >When a task is alive, userspace can get its stats by sending a command >containing its pid. Sending a tgid returns the sum of stats of the tasks >belonging to that tgid (where such a sum makes sense). Together, the command >interface allows stats for a large number of tasks to be collected more >efficiently than would be possible through /proc or any per-pid interface. > >The connector interface also sends the stats for each task to userspace when >the task is exiting. This permits fine-grain accounting for short-lived tasks, >which is important if userspace is doing its own aggregation of statistics >based on some grouping of tasks (e.g. CSA jobs, ELSA banks or CKRM classes). > >While this patch is included as part of the delay stats collection patches, >it is intended for general use by any kernel component that needs per-pid/tgid >stats sent at task exit. This limits the increase in connectors called on task >exit, atleast for stats collection purposes. 
> >Signed-off-by: Shailabh Nagar <na...@wa...> > > drivers/connector/Kconfig | 9 + > drivers/connector/Makefile | 1 > drivers/connector/cn_stats.c | 243 +++++++++++++++++++++++++++++++++++++++++++ > include/linux/cn_stats.h | 115 ++++++++++++++++++++ > include/linux/connector.h | 4 > kernel/exit.c | 2 > 6 files changed, 374 insertions(+) > >Index: linux-2.6.15-rc7/drivers/connector/Kconfig >=================================================================== >--- linux-2.6.15-rc7.orig/drivers/connector/Kconfig >+++ linux-2.6.15-rc7/drivers/connector/Kconfig >@@ -18,4 +18,13 @@ config PROC_EVENTS > Provide a connector that reports process events to userspace. Send > events such as fork, exec, id change (uid, gid, suid, etc), and exit. > >+config STATS_CONNECTOR >+ bool "Report per-task statistics to userspace" >+ depends on CONNECTOR=y && TASK_DELAY_ACCT >+ ---help--- >+ Provide a connector interface that reports per-task statistics to >+ userspace. While a task is running, userspace can get the stats by >+ sending a command to the connector. At task exit, the final value of >+ the stats is sent automatically. >+ > endmenu >Index: linux-2.6.15-rc7/drivers/connector/Makefile >=================================================================== >--- linux-2.6.15-rc7.orig/drivers/connector/Makefile >+++ linux-2.6.15-rc7/drivers/connector/Makefile >@@ -1,4 +1,5 @@ > obj-$(CONFIG_CONNECTOR) += cn.o >+obj-$(CONFIG_STATS_CONNECTOR) += cn_stats.o > obj-$(CONFIG_PROC_EVENTS) += cn_proc.o > > cn-y += cn_queue.o connector.o >Index: linux-2.6.15-rc7/include/linux/cn_stats.h >=================================================================== >--- /dev/null >+++ linux-2.6.15-rc7/include/linux/cn_stats.h >@@ -0,0 +1,115 @@ >+/* >+ * cn_stats.h - Task statistics connector >+ * >+ * Copyright (C) Shailabh Nagar, IBM Corp. 
2005 >+ * Based on work by Matt Helsley, Nguyen Anh Quynh and Guillaume Thouvenin >+ * >+ * This program is free software; you can redistribute it and/or modify >+ * it under the terms of the GNU General Public License as published by >+ * the Free Software Foundation; either version 2 of the License, or >+ * (at your option) any later version. >+ * >+ * This program is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >+ * GNU General Public License for more details. >+ */ >+ >+#ifndef CN_STATS_H >+#define CN_STATS_H >+ >+#include <linux/types.h> >+#include <linux/connector.h> >+ >+ >+/* Format for per-task data returned to userland when >+ * - a task exits >+ * - listener requests stats for all tasks >+ * >+ * The struct is versioned. Newer versions should only add fields to >+ * the bottom of the struct to maintain backward compatibility. >+ * >+ * To create the next version, bump up the version count macro >+ * and delineate the start of newly added fields with a comment indicating the >+ * version number. >+ */ >+#define CNSTATS_DATA_VERSION 1 >+ >+struct cnstats_data { >+ >+ /* Version 1 */ >+ pid_t pid; >+ pid_t tgid; >+ >+ /* *_delay_total is cumulative delay (in nanosecs) of a >+ * task waiting for cpu to be available, block io >+ * completion, page fault to be serviced etc. >+ * *_count is number of delay intervals recorded. >+ * cpu_run_total is cumulative cpu run time, measured by timestamp >+ * intervals and hence more accurate than sampling-based cpu times. >+ * All *_total values are in nanoseconds though cpu data may not be >+ * collected at that granularity. 
>+ */ >+ >+ __u64 cpu_count; >+#define CNSTATS_NOCPUSTATS 1 >+ __u64 cpu_run_total; >+ __u64 cpu_delay_total; >+ >+ __u64 blkio_count; >+ __u64 blkio_delay_total; >+ __u64 swapin_count; >+ __u64 swapin_delay_total; >+ >+}; >+ >+ >+/* >+ * Commands sent from userspace >+ */ >+ >+struct cnstats_cmd { >+ enum intype { >+ CNSTATS_CMD_LISTEN = 1, /* Start listening on connector */ >+ CNSTATS_CMD_IGNORE, /* Stop listening */ >+ CNSTATS_CMD_STATS_PID, /* Stats for a pid */ >+ CNSTATS_CMD_STATS_TGID, /* Stats for a tgid */ >+ } intype; >+ >+ union { >+ pid_t pid; >+ pid_t tgid; >+ } param; >+}; >+ >+/* >+ * Response or data sent from kernel >+ * Versioned for backward compatibility >+ */ >+ >+struct cnstats { >+ __u32 version; >+ enum outtype { >+ CNSTATS_DATA_NONE = 1, /* Control cmd response */ >+ CNSTATS_DATA_EXIT, /* Exiting task's stats */ >+ CNSTATS_DATA_PID, /* per-pid data cmd response*/ >+ CNSTATS_DATA_TGID, /* per-tgid data cmd response*/ >+ } outtype; >+ union { >+ struct cnstats_ack { >+ __u32 err; >+ } ack; >+ >+ struct cnstats_data stats; >+ } data; >+}; >+ >+#ifdef __KERNEL__ >+#ifdef CONFIG_STATS_CONNECTOR >+void cnstats_exit_connector(struct task_struct *tsk); >+#else >+static inline void cnstats_exit_connector(struct task_struct *tsk) >+{} >+#endif /* CONFIG_STATS_CONNECTOR */ >+#endif /* __KERNEL__ */ >+#endif /* CN_PROC_H */ >Index: linux-2.6.15-rc7/include/linux/connector.h >=================================================================== >--- linux-2.6.15-rc7.orig/include/linux/connector.h >+++ linux-2.6.15-rc7/include/linux/connector.h >@@ -35,6 +35,10 @@ > #define CN_IDX_CIFS 0x2 > #define CN_VAL_CIFS 0x1 > >+/* Statistics connector ids */ >+#define CN_IDX_STATS 0x2 >+#define CN_VAL_STATS 0x2 >+ > #define CN_NETLINK_USERS 1 > > /* >Index: linux-2.6.15-rc7/drivers/connector/cn_stats.c >=================================================================== >--- /dev/null >+++ linux-2.6.15-rc7/drivers/connector/cn_stats.c >@@ -0,0 +1,243 @@ >+/* >+ * 
cn_stats.c - Task statistics connector > It looks like you create this file to report task stats through connector and you include stats for delay accounting you need as a starter. This sounds like a good approach to me. However, we need to move where cnstats_exit_connector(tsk) is invoked. See below. >+ * >+ * Copyright (C) Shailabh Nagar, IBM Corp. 2005 >+ * Based on work by Matt Helsley, Nguyen Anh Quynh and Guillaume Thouvenin >+ * >+ * This program is free software; you can redistribute it and/or modify >+ * it under the terms of the GNU General Public License as published by >+ * the Free Software Foundation; either version 2 of the License, or >+ * (at your option) any later version. >+ * >+ * This program is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >+ * GNU General Public License for more details. >+ * >+ */ >+ >+#include <linux/module.h> >+#include <linux/kernel.h> >+#include <linux/init.h> >+#include <linux/cn_stats.h> >+#include <asm/atomic.h> >+ >+#define CN_STATS_NOCPU (-1) >+#define CN_STATS_NOACK 0 >+#define CN_STATS_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct cnstats)) >+ >+static atomic_t cnstats_num_listeners = ATOMIC_INIT(0); >+static struct cb_id cnstats_id = { CN_IDX_STATS, CN_VAL_STATS }; >+/* cnstats_counts is used as the sequence number of the netlink message */ >+static DEFINE_PER_CPU(__u32, cnstats_counts) = { 0 }; >+ >+void cnstats_init_msg(struct cn_msg *msg, int seq, int ack) >+{ >+ memset(msg, 0, CN_STATS_MSG_SIZE); >+ msg->seq = seq; >+ msg->ack = ack; >+ memcpy(&msg->id, &cnstats_id, sizeof(msg->id)); >+ msg->len = sizeof(struct cnstats); >+} >+ >+/* >+ * Accumulate one task's statistics >+ * >+ */ >+static inline void cnstats_add_tsk_data(struct cnstats_data *d, struct task_struct *tsk) >+{ >+ uint64_t tmp; >+ >+ d->pid = tsk->pid; >+ d->tgid = tsk->tgid; >+ >+ /* zero XXX_total,non-zero 
XXX_count implies XXX stat overflowed */ >+#ifdef CONFIG_SCHEDSTATS >+ d->cpu_count += tsk->sched_info.pcnt; >+ tmp = d->cpu_run_total + jiffies_to_usecs(tsk->sched_info.cpu_time)*1000; >+ d->cpu_run_total = (tmp < d->cpu_run_total)? 0:tmp; >+ tmp = d->cpu_delay_total + jiffies_to_usecs(tsk->sched_info.run_delay)*1000; >+ d->cpu_delay_total = (tmp < d->cpu_delay_total)? 0:tmp; >+#else >+ /* Non-zero XXX_total,zero XXX_count implies XXX stat unavailable */ >+ d->cpu_count = 0; >+ d->cpu_run_total = d->cpu_delay_total = CNSTATS_NOCPUSTATS; >+#endif >+ spin_lock(&tsk->delays.lock); >+ tmp = d->blkio_delay_total + tsk->delays.blkio_delay; >+ d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0:tmp; >+ tmp = d->swapin_delay_total + tsk->delays.swapin_delay; >+ d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0:tmp; >+ d->blkio_count += tsk->delays.blkio_count; >+ d->swapin_count += tsk->delays.swapin_count; >+ spin_unlock(&tsk->delays.lock); >+} >+ >+/* >+ * Send acknowledgement (error) >+ * >+ */ >+static void cnstats_send_ack(int err, int rcvd_seq, int rcvd_ack) >+{ >+ __u8 buffer[CN_STATS_MSG_SIZE]; >+ struct cn_msg *msg = (struct cn_msg *)buffer; >+ struct cnstats *c = (struct cnstats *)msg->data; >+ >+ cnstats_init_msg(msg, rcvd_seq, rcvd_ack+1); >+ c->version = CNSTATS_DATA_VERSION; >+ c->outtype = CNSTATS_DATA_NONE; >+ >+ /* Following allows other functions to continue returning -ve errors */ >+ c->data.ack.err = abs(err); >+ >+ cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); >+} >+ >+/* >+ * Send one pid's stats >+ * >+ */ >+static int cnstats_send_pid(pid_t pid, int seq, int ack) >+{ >+ __u8 buffer[CN_STATS_MSG_SIZE]; >+ struct cn_msg *msg = (struct cn_msg *)buffer; >+ struct cnstats *c = (struct cnstats *)msg->data; >+ struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; >+ struct task_struct *tsk; >+ >+ cnstats_init_msg(msg, seq, ack); >+ c->version = CNSTATS_DATA_VERSION; >+ c->outtype = CNSTATS_DATA_PID; >+ >+ read_lock(&tasklist_lock); 
>+ tsk = find_task_by_pid(pid); >+ if (!tsk) { >+ read_unlock(&tasklist_lock); >+ return -ESRCH; >+ } >+ get_task_struct(tsk); >+ read_unlock(&tasklist_lock); >+ >+ cnstats_add_tsk_data(d, tsk); >+ put_task_struct(tsk); >+ >+ return cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); >+} >+ >+/* >+ * Send one tgid's stats (sum of appropriate per-pid stats) >+ * >+ */ >+static int cnstats_send_tgid(pid_t tgid, int seq, int ack) >+{ >+ __u8 buffer[CN_STATS_MSG_SIZE]; >+ struct cn_msg *msg = (struct cn_msg *)buffer;; >+ struct cnstats *c = (struct cnstats *)msg->data; >+ struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; >+ struct task_struct *first, *tsk; >+ >+ cnstats_init_msg(msg, seq, ack); >+ c->version = CNSTATS_DATA_VERSION; >+ c->outtype = CNSTATS_DATA_TGID; >+ >+ read_lock(&tasklist_lock); >+ first = tsk = find_task_by_pid(tgid); >+ if (!first) { >+ read_unlock(&tasklist_lock); >+ return -ESRCH; >+ } >+ do >+ cnstats_add_tsk_data(d, tsk); >+ while_each_thread(first, tsk); >+ read_unlock(&tasklist_lock); >+ >+ d->pid = -1; >+ d->tgid = tgid; >+ >+ return cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); >+} >+ >+/*** >+ * cnstats_ctl - handle command sent via CN_IDX_STATS connector >+ * @data: command >+ */ >+static void cnstats_ctl(void *data) >+{ >+ struct cn_msg *msg = data; >+ struct cnstats_cmd *cmd; >+ int err = 0; >+ >+ if (msg->len != sizeof(*cmd)) >+ return; >+ >+ cmd = (struct cnstats_cmd *)msg->data; >+ switch (cmd->intype) { >+ case CNSTATS_CMD_LISTEN: >+ atomic_inc(&cnstats_num_listeners); >+ break; >+ >+ case CNSTATS_CMD_IGNORE: >+ atomic_dec(&cnstats_num_listeners); >+ break; >+ >+ case CNSTATS_CMD_STATS_PID: >+ if (atomic_read(&cnstats_num_listeners) < 1) >+ return; >+ err = cnstats_send_pid(cmd->param.pid, msg->seq, msg->ack+1); >+ if (!err) >+ return; >+ break; >+ >+ case CNSTATS_CMD_STATS_TGID: >+ if (atomic_read(&cnstats_num_listeners) < 1) >+ return; >+ err = cnstats_send_tgid(cmd->param.pid, msg->seq, msg->ack+1); >+ if (!err) >+ 
return; >+ break; >+ >+ default: >+ err = -EINVAL; >+ break; >+ } >+ cnstats_send_ack(err, msg->seq, msg->ack); >+} >+ >+/*** >+ * cnstats_exit_connector - send task's statistics to userspace when it exits >+ * @tsk: exiting task >+ */ >+void cnstats_exit_connector(struct task_struct *tsk) >+{ >+ int seq; >+ __u8 buffer[CN_STATS_MSG_SIZE]; >+ struct cn_msg *msg = (struct cn_msg *)buffer; >+ struct cnstats *c = (struct cnstats *)msg->data; >+ struct cnstats_data *d = (struct cnstats_data *)&c->data.stats; >+ >+ if (atomic_read(&cnstats_num_listeners) < 1) >+ return; >+ >+ seq = get_cpu_var(cnstats_counts)++; >+ put_cpu_var(cnstats_counts); >+ >+ cnstats_init_msg(msg, seq, CN_STATS_NOACK); >+ c->version = CNSTATS_DATA_VERSION; >+ c->outtype = CNSTATS_DATA_EXIT; >+ cnstats_add_tsk_data(d, tsk); >+ >+ cn_netlink_send(msg, CN_IDX_STATS, GFP_KERNEL); >+} >+ >+static int __init cnstats_init(void) >+{ >+ int err; >+ >+ if ((err = cn_add_callback(&cnstats_id, "cn_stats", &cnstats_ctl))) { >+ printk(KERN_WARNING "cn_stats failed to register\n"); >+ return err; >+ } >+ return 0; >+} >+ >+module_init(cnstats_init); >Index: linux-2.6.15-rc7/kernel/exit.c >=================================================================== >--- linux-2.6.15-rc7.orig/kernel/exit.c >+++ linux-2.6.15-rc7/kernel/exit.c >@@ -29,6 +29,7 @@ > #include <linux/syscalls.h> > #include <linux/signal.h> > #include <linux/cn_proc.h> >+#include <linux/cn_stats.h> > > #include <asm/uaccess.h> > #include <asm/unistd.h> >@@ -865,6 +866,7 @@ fastcall NORET_TYPE void do_exit(long co > > tsk->exit_code = code; > proc_exit_connector(tsk); >+ cnstats_exit_connector(tsk); > We need to move both proc_exit_connector(tsk) and cnstats_exit_connector(tsk) up to before exit_mm(tsk) statement. There are task statistics collected in task->mm and those stats will be lost after exit_mm(tsk). 
Thanks, - jay > exit_notify(tsk); > #ifdef CONFIG_NUMA > mpol_free(tsk->mempolicy); >- >To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >the body of a message to maj...@vg... >More majordomo info at http://vger.kernel.org/majordomo-info.html >Please read the FAQ at http://www.tux.org/lkml/ > |
From: Matt H. <mat...@us...> - 2006-01-04 22:46:16
|
On Wed, 2006-01-04 at 11:04 -0800, Jay Lan wrote: > Shailabh Nagar wrote: <snip> > >Index: linux-2.6.15-rc7/kernel/exit.c > >=================================================================== > >--- linux-2.6.15-rc7.orig/kernel/exit.c > >+++ linux-2.6.15-rc7/kernel/exit.c > >@@ -29,6 +29,7 @@ > > #include <linux/syscalls.h> > > #include <linux/signal.h> > > #include <linux/cn_proc.h> > >+#include <linux/cn_stats.h> > > > > #include <asm/uaccess.h> > > #include <asm/unistd.h> > >@@ -865,6 +866,7 @@ fastcall NORET_TYPE void do_exit(long co > > > > tsk->exit_code = code; > > proc_exit_connector(tsk); > >+ cnstats_exit_connector(tsk); > > > > We need to move both proc_exit_connector(tsk) and > cnstats_exit_connector(tsk) up to before exit_mm(tsk) statement. > There are task statistics collected in task->mm and those stats > will be lost after exit_mm(tsk). > > Thanks, > - jay > > > exit_notify(tsk); > > #ifdef CONFIG_NUMA > > mpol_free(tsk->mempolicy); > >- Good point. The assignment of the task exit code will also have to move up before exit_mm(tsk) because the process event connector exit function retrieves the exit code from the task struct. Moving these may also affect the job/pagg/task_notify/cpuset exit notification if we're eventually going to remove *direct* calls to these from kernel/exit.c. Cheers, -Matt Helsley |
From: Andrew M. <ak...@os...> - 2006-01-04 23:15:39
|
Matt Helsley <mat...@us...> wrote: > > > We need to move both proc_exit_connector(tsk) and > > cnstats_exit_connector(tsk) up to before exit_mm(tsk) statement. > > There are task statistics collected in task->mm and those stats > > will be lost after exit_mm(tsk). > > > > Thanks, > > - jay > > > > > exit_notify(tsk); > > > #ifdef CONFIG_NUMA > > > mpol_free(tsk->mempolicy); > > >- > > Good point. The assignment of the task exit code will also have to move > up before exit_mm(tsk) because the process event connector exit function > retrieves the exit code from the task struct. Could someone please volunteer to do the patch? |
From: Matt H. <mat...@us...> - 2006-01-05 18:46:18
|
On Wed, 2006-01-04 at 15:17 -0800, Andrew Morton wrote: > Matt Helsley <mat...@us...> wrote: > > > > > We need to move both proc_exit_connector(tsk) and > > > cnstats_exit_connector(tsk) up to before exit_mm(tsk) statement. > > > There are task statistics collected in task->mm and those stats > > > will be lost after exit_mm(tsk). > > > > > > Thanks, > > > - jay > > > > > > > exit_notify(tsk); > > > > #ifdef CONFIG_NUMA > > > > mpol_free(tsk->mempolicy); > > > >- > > > > Good point. The assignment of the task exit code will also have to move > > up before exit_mm(tsk) because the process event connector exit function > > retrieves the exit code from the task struct. > > Could someone please volunteer to do the patch? Here are two separate patches (not a series). The first simply moves the process event connector north of exit_mm(). It applies to a clean 2.6.15 kernel. Please consider it for -mm. The second patch moves both -- it's intended to be applied on top of Shailabh's series of patches. Cheers, -Matt Helsley |
From: Matt H. <mat...@us...> - 2006-01-05 19:21:14
|
This patch moves the process event connector exit function above exit_mm() with the expectation that it will later be removed from the exit function with other calls that need to take place before exit_mm(). I'm looking into how this affects the exit_signal field. Please review but do not apply. Signed-off-by: Matt Helsley <mat...@us...> -- Index: linux-2.6.15/kernel/exit.c =================================================================== --- linux-2.6.15.orig/kernel/exit.c +++ linux-2.6.15/kernel/exit.c @@ -844,10 +844,13 @@ fastcall NORET_TYPE void do_exit(long co if (group_dead) { del_timer_sync(&tsk->signal->real_timer); exit_itimers(tsk->signal); acct_process(code); } + + tsk->exit_code = code; + proc_exit_connector(tsk); exit_mm(tsk); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); @@ -860,13 +863,10 @@ fastcall NORET_TYPE void do_exit(long co disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); if (tsk->binfmt) module_put(tsk->binfmt->module); - - tsk->exit_code = code; - proc_exit_connector(tsk); exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif |
From: Matt H. <mat...@us...> - 2006-01-05 19:53:41
|
NOTE: This is a resend to CKRM-tech, ELSA-devel, and LSE-tech. In any replies please add to the Cc list: Jay Lan <jl...@en...>, Shailabh Nagar <na...@wa...>, LKML <lin...@vg...>,Paul Jackson <pj...@sg...>, Erik Jacobson <er...@sg...>, Jack Steiner <st...@sg...>, John Hesterberg <jh...@sg...> This patch moves both the process exit event and per-process stats connectors above exit_mm() since the latter needs values from the mm_struct which will be lost after exit_mm(). Signed-off-by: Matt Helsley <mat...@us...> -- Index: linux-2.6.15/kernel/exit.c =================================================================== --- linux-2.6.15.orig/kernel/exit.c +++ linux-2.6.15/kernel/exit.c @@ -845,10 +845,14 @@ fastcall NORET_TYPE void do_exit(long co if (group_dead) { del_timer_sync(&tsk->signal->real_timer); exit_itimers(tsk->signal); acct_process(code); } + + tsk->exit_code = code; + proc_exit_connector(tsk); + cnstats_exit_connector(tsk); exit_mm(tsk); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); @@ -861,14 +865,10 @@ fastcall NORET_TYPE void do_exit(long co disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); if (tsk->binfmt) module_put(tsk->binfmt->module); - - tsk->exit_code = code; - proc_exit_connector(tsk); - cnstats_exit_connector(tsk); exit_notify(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif |
From: Andrew M. <ak...@os...> - 2006-01-05 23:10:50
|
Matt Helsley <mat...@us...> wrote: > > This patch moves both the process exit event and per-process stats > connectors above exit_mm() since the latter needs values from the > mm_struct which will be lost after exit_mm(). > > Signed-off-by: Matt Helsley <mat...@us...> > > -- > > Index: linux-2.6.15/kernel/exit.c > =================================================================== > --- linux-2.6.15.orig/kernel/exit.c > +++ linux-2.6.15/kernel/exit.c > @@ -845,10 +845,14 @@ fastcall NORET_TYPE void do_exit(long co > if (group_dead) { > del_timer_sync(&tsk->signal->real_timer); > exit_itimers(tsk->signal); > acct_process(code); > } > + > + tsk->exit_code = code; > + proc_exit_connector(tsk); > + cnstats_exit_connector(tsk); cnstats_exit_connector() doesn't exist yet... |
From: Matt H. <mat...@us...> - 2006-01-06 00:12:57
|
On Thu, 2006-01-05 at 15:10 -0800, Andrew Morton wrote: > Matt Helsley <mat...@us...> wrote: > > > > This patch moves both the process exit event and per-process stats > > connectors above exit_mm() since the latter needs values from the > > mm_struct which will be lost after exit_mm(). > > > > Signed-off-by: Matt Helsley <mat...@us...> > > > > -- > > > > Index: linux-2.6.15/kernel/exit.c > > =================================================================== > > --- linux-2.6.15.orig/kernel/exit.c > > +++ linux-2.6.15/kernel/exit.c > > @@ -845,10 +845,14 @@ fastcall NORET_TYPE void do_exit(long co > > if (group_dead) { > > del_timer_sync(&tsk->signal->real_timer); > > exit_itimers(tsk->signal); > > acct_process(code); > > } > > + > > + tsk->exit_code = code; > > + proc_exit_connector(tsk); > > + cnstats_exit_connector(tsk); > > cnstats_exit_connector() doesn't exist yet... Right. I forgot to repeat what I mentioned in the parent email -- that this patch is intended to be applied on top of Shailabh's patches. The first patch I posted (01/01) is intended for plain 2.6.15. Before proposing 01/01 for -mm I've been trying to see if there are any problems with the value of tsk->exit_signal before exit_mm() -- hence the "[RFC]" in the subject line of that one. Thanks, -Matt Helsley |
From: Jes S. <je...@tr...> - 2006-01-06 08:57:13
|
>>>>> "Matt" == Matt Helsley <mat...@us...> writes: Matt> Right. I forgot to repeat what I mentioned in the parent email Matt> -- that this patch is intended to be applied on top of Matt> Shailabh's patches. Matt> The first patch I posted (01/01) is intended for plain Matt> 2.6.15. Before proposing 01/01 for -mm I've been trying to see Matt> if there are any problems with the value of tsk->exit_signal Matt> before exit_mm() -- hence the "[RFC]" in the subject line of Matt> that one. Matt, Any chance one of you could put up a set of current patches somewhere? I am trying to make heads and tails of them and it's pretty hard as I haven't been on lse-tech for long and the lse-tech mailing list archives are useless due to the 99 to 1 SPAM ratio ;-( I am quite concerned about that lock your patches put into struct task_struct through struct task_delay_info. Have you done any measurements on how this impacts performance on highly threaded apps on larger system? IMHO it seems to make more sense to use something like Jack's proposed task_notifier code to lock-less collect the data into task local data structures and then take the data from there and ship off to userland through netlink or similar like you are doing? I am working on modifying Jack's patch to carry task local data and use it for not just accounting but other areas that need optional callbacks (optional in the sense that it's a feature that can be enabled or disabled). Looking at Shailabh's delayacct_blkio() changes it seems that it would be really easy to fit those into that framework. Guess I should post some of this code ..... Cheers, Jes |
From: John H. <jh...@sg...> - 2006-01-11 12:55:22
|
On Wed, Jan 11, 2006 at 05:36:29AM -0500, Jes Sorensen wrote: > I think task_notify it should be usable for statistics gathering as > well, the only issue is how to attach it to the processes we wish to > gather accounting for. Personally I am not a big fan of the current > concept where statistics are gathered for all tasks at all time but > just not exported until accounting is enabled. I believe the accounting our customers require needs to be turned on system-wide. In fact, I recall getting problems reports if there are some processes not 'accounted' for. If you do it on a task basis, and accounting gets turned on, you'd have to have a fool-proof way of tracking down all the tasks in a system and turn on their accounting. I would expect sites either want accounting on all the time for everything, or not at all. John |
From: Jes S. <je...@tr...> - 2006-01-11 13:50:45
|
>>>>> "John" == John Hesterberg <jh...@sg...> writes: John> I believe the accounting our customers require needs to be John> turned on system-wide. In fact, I recall getting problems John> reports if there are some processes not 'accounted' for. If you John> do it on a task basis, and accounting gets turned on, you'd have John> to have a fool-proof way of tracking down all the tasks in a John> system and turn on their accounting. That's what I was scared someone would say ;-( The problem I am seeing is that SGI wants certain things in the accounting block, I am sure IBM has other things they wish to count and someone else will want to count something else again. With all those numbers we may end up with a relatively large block for accounting numbers in struct task_struct. However it makes it a lot harder to use the task_notifiers for it ;-( John> I would expect sites either want accounting on all the time for John> everything, or not at all. Right now a lot of the accounting is done by calling into a function which checks a flag for whether accounting is enabled and returns if it is not. That could easily be extended to check for the global accounting flag + check that the task's accounting data structure has been allocated. Then in certain places, like fork() and schedule(), allocate it if it's not in place already and accounting has been switched on. That way we could reduce the overhead to a single pointer in struct task_struct and the accounting structure could (in theory) grow indefinitely large. Or is this too much of a hack? Comments? Cheers, Jes |
From: John H. <jh...@sg...> - 2006-01-11 21:40:41
|
On Wed, Jan 11, 2006 at 01:02:10PM -0800, Matt Helsley wrote: > On Wed, 2006-01-11 at 05:36 -0500, Jes Sorensen wrote: > > >>>>> "Shailabh" == Shailabh Nagar <na...@wa...> writes: > > > > Shailabh> Jes Sorensen wrote: > > >> I am quite concerned about that lock your patches put into struct > > >> task_struct through struct task_delay_info. Have you done any > > >> measurements on how this impacts performance on highly threaded > > >> apps on larger system? > > > > Shailabh> I don't expect the lock contention to be high. The lock is > > Shailabh> held for a very short time (across two > > Shailabh> additions/increments). Moreover, it gets contended only when > > Shailabh> the stats are being read (either through /proc or > > Shailabh> connectors). Since the reading of stats won't be that > > Shailabh> frequent (the utility of these numbers is to influence the > > Shailabh> I/O priority/rss limit etc. which won't be done at a very > > Shailabh> small granularity anyway), I wouldn't expect a problem. > > > > Hi Shailabh, > > > > When this is read through connectors, it's initiated by the connectors > > code which is called from the task's context hence we don't need > > locking for that. It's very similar to the task_notify code I am about > > to post and I think the connector code could fit into that > > framework. The main issue is /proc, but then one could even have a > > mechanism with a hook when the task exits that pushes the data to a > > storage point which is lock protected. > > > > Even if a lock isn't contended, you are still going to see the cache > > lines bounce around due to the writes. It may not show up on a 4-way > > box but what happens on a 64-way? We have seen some pretty nasty > > effects on the bigger SN2 boxes with locks that were taken far too > > frequently, to the point where it would prevent the box from booting > > (now I don't expect it to that severe here, but I'd still like to > > explore an approach of doing it lock free). 
> > > > Shailabh> But its better to take some measurements anyway. Any > > Shailabh> suggestions on a benchmark ? > > > > >> IMHO it seems to make more sense to use something like Jack's > > >> proposed task_notifier code to lock-less collect the data into task > > >> local data structures and then take the data from there and ship > > >> off to userland through netlink or similar like you are doing? > > >> > > >> I am working on modifying Jack's patch to carry task local data and > > >> use it for not just accounting but other areas that need optional > > >> callbacks (optional in the sense that it's a feature that can be > > >> enabled or disabled). Looking at Shailabh's delayacct_blkio() > > >> changes it seems that it would be really easy to fit those into > > >> that framework. > > >> > > >> Guess I should post some of this code ..... > > > > Shailabh> Please do. If this accounting can fit into some other > > Shailabh> framework, thats fine too. > > > > Ok, finally, sorry for the delay. My current code snapshot is > > available at http://www.trained-monkey.org/~jes/patches/task_notify/ - > > it's a modified version of Jack's task_notify code, and three example > > users of it (the SysV IPC semundo semaphore, the key infrastructure > > and SGI's JOB module). The patch order should be task_notify.diff, > > task-notify-keys.diff, task-notify-semundo.diff, and > > task_notify-job.diff last. > > I can already tell you I don't like the "magic" mechanism to identify > notifier blocks. The problem is that it's yet another space of id > numbers that we have to manage -- either manually, by having a person > hand the numbers out to developers, or automatically using the idr code. > You could use the notifier block's address and avoid an additional id > space. > > Also, even if this mechanism goes into task_notify it needs a better > name than "magic". 
> > > I think task_notify it should be usable for statistics gathering as > > well, the only issue is how to attach it to the processes we wish to > > gather accounting for. Personally I am not a big fan of the current > > concept where statistics are gathered for all tasks at all time but > > just not exported until accounting is enabled. > > Have you looked at Alan Stern's notifier chain fix patch? Could that be > used in task_notify? > > If not, perhaps the patch use the standard kernel list idioms. > > Another potential user for the task_notify functionality is the process > events connector. The problem is it requires the ability to receive > notifications about all processes. Also, there's a chance that future > CKRM code could use all-task and per-task notification. Combined with > John Hesterberg's feedback I think there is strong justification for an > all-tasks notification interface. I have two concerns about an all-tasks notification interface. First, we want this to scale, so don't want more global locks. One unique part of the task notify is that it doesn't use locks. Second, in at least some of the cases we're familiar with, even when we might need all-tasks notification we still need task-local data. That's been a problem with some of the global mechanisms I've seen discussed. Cheers, John |
From: Keith O. <ka...@sg...> - 2006-01-12 03:30:22
|
John Hesterberg (on Wed, 11 Jan 2006 15:39:10 -0600) wrote: >On Wed, Jan 11, 2006 at 01:02:10PM -0800, Matt Helsley wrote: >> Have you looked at Alan Stern's notifier chain fix patch? Could that be >> used in task_notify? > >I have two concerns about an all-tasks notification interface. >First, we want this to scale, so don't want more global locks. >One unique part of the task notify is that it doesn't use locks. Neither does Alan Stern's atomic notifier chain. Indeed it cannot use locks, because the atomic notifier chains can be called from anywhere, including non maskable interrupts. The downside is that Alan's atomic notifier chains require RCU. An alternative patch that requires no locks and does not even require RCU is in http://marc.theaimsgroup.com/?l=linux-kernel&m=113392370322545&w=2 |
From: Paul E. M. <pa...@us...> - 2006-01-12 05:04:40
|
On Thu, Jan 12, 2006 at 02:29:52PM +1100, Keith Owens wrote: > John Hesterberg (on Wed, 11 Jan 2006 15:39:10 -0600) wrote: > >On Wed, Jan 11, 2006 at 01:02:10PM -0800, Matt Helsley wrote: > >> Have you looked at Alan Stern's notifier chain fix patch? Could that be > >> used in task_notify? > > > >I have two concerns about an all-tasks notification interface. > >First, we want this to scale, so don't want more global locks. > >One unique part of the task notify is that it doesn't use locks. > > Neither does Alan Stern's atomic notifier chain. Indeed it cannot use > locks, because the atomic notifier chains can be called from anywhere, > including non maskable interrupts. The downside is that Alan's atomic > notifier chains require RCU. > > An alternative patch that requires no locks and does not even require > RCU is in http://marc.theaimsgroup.com/?l=linux-kernel&m=113392370322545&w=2 Interesting! Missed this on the first time around... But doesn't notifier_call_chain_lockfree() need to either disable preemption or use atomic operations to update notifier_chain_lockfree_inuse[] in order to avoid problems with preemption? If I understand the code, one such problem could be caused by the following sequence of events: 1. Task A enters notifier_call_chain_lockfree(), gets a copy of the current CPU in local variable "cpu", snapshots the (initially zero) value of notifier_chain_lockfree_inuse[cpu] into local variable "nested", then is preempted. 2. Task B enters notifier_call_chain_lockfree(), gets a copy of the current CPU in local variable "cpu", snapshots the (still zero) value of notifier_chain_lockfree_inuse[cpu] into local variable "nested", sets the value of notifier_chain_lockfree_inuse[cpu] to 1. 3. Task A runs again, perhaps because Task B's priority dropped, perhaps because some other CPU became available. It also sets the value of notifier_chain_lockfree_inuse[cpu] to 1. It then gains a reference to a notifier_block (call it Fred). 4. 
Task B completes running through the notifier chain and sets notifier_chain_lockfree_inuse[cpu] = nested, which is zero. 5. Task C invokes notifier_chain_unregister_lockfree() in order to remove Fred. Task C finds all notifier_chain_lockfree_inuse[cpu] entries equal to zero, so removes Fred while Task A is still referencing it. Which I believe is what was to be prevented. If one updates notifier_chain_lockfree_inuse[cpu] using atomics, then one could imagine a sequence of calls to notifier_call_chain_lockfree() and preemptions that prevented one of the notifier_chain_lockfree_inuse[] elements from ever reaching zero (though maybe this is being overly paranoid). If one disables preemption, then latency might become excessive. So what am I missing? Thanx, Paul |
From: Keith O. <ka...@sg...> - 2006-01-12 06:19:21
|
"Paul E. McKenney" (on Wed, 11 Jan 2006 21:04:53 -0800) wrote: >On Thu, Jan 12, 2006 at 02:29:52PM +1100, Keith Owens wrote: >> John Hesterberg (on Wed, 11 Jan 2006 15:39:10 -0600) wrote: >> >On Wed, Jan 11, 2006 at 01:02:10PM -0800, Matt Helsley wrote: >> >> Have you looked at Alan Stern's notifier chain fix patch? Could that be >> >> used in task_notify? >> > >> >I have two concerns about an all-tasks notification interface. >> >First, we want this to scale, so don't want more global locks. >> >One unique part of the task notify is that it doesn't use locks. >> >> Neither does Alan Stern's atomic notifier chain. Indeed it cannot use >> locks, because the atomic notifier chains can be called from anywhere, >> including non maskable interrupts. The downside is that Alan's atomic >> notifier chains require RCU. >> >> An alternative patch that requires no locks and does not even require >> RCU is in http://marc.theaimsgroup.com/?l=linux-kernel&m=113392370322545&w=2 > >Interesting! Missed this on the first time around... > >But doesn't notifier_call_chain_lockfree() need to either disable >preemption or use atomic operations to update notifier_chain_lockfree_inuse[] >in order to avoid problems with preemption? OK, I have thought about it and ... notifier_call_chain_lockfree() must be called with preempt disabled. The justification for this routine is to handle all the nasty corner cases in the notify_die() and similar chains, including panic, spinlocks being held and even non maskable interrupts. It is silly to try to make notifier_call_chain_lockfree() handle the preemptible case as well. If notifier_call_chain_lockfree() is to be called for task notification, then the caller must disable preempt around the call to notifier_call_chain_lockfree(). Scalability over lots of cpus should not be an issue, especially if notifier_chain_lockfree_inuse[] is converted to a per cpu variable. 
The amount of time spent with preempt disabled is proportional to the number of registered callbacks on the task notification chain and to the amount of work performed by those callbacks, neither of which should be high. |
From: Paul E. M. <pa...@us...> - 2006-01-12 06:51:13
|
On Thu, Jan 12, 2006 at 05:19:01PM +1100, Keith Owens wrote: > "Paul E. McKenney" (on Wed, 11 Jan 2006 21:04:53 -0800) wrote: > >On Thu, Jan 12, 2006 at 02:29:52PM +1100, Keith Owens wrote: > >> John Hesterberg (on Wed, 11 Jan 2006 15:39:10 -0600) wrote: > >> >On Wed, Jan 11, 2006 at 01:02:10PM -0800, Matt Helsley wrote: > >> >> Have you looked at Alan Stern's notifier chain fix patch? Could that be > >> >> used in task_notify? > >> > > >> >I have two concerns about an all-tasks notification interface. > >> >First, we want this to scale, so don't want more global locks. > >> >One unique part of the task notify is that it doesn't use locks. > >> > >> Neither does Alan Stern's atomic notifier chain. Indeed it cannot use > >> locks, because the atomic notifier chains can be called from anywhere, > >> including non maskable interrupts. The downside is that Alan's atomic > >> notifier chains require RCU. > >> > >> An alternative patch that requires no locks and does not even require > >> RCU is in http://marc.theaimsgroup.com/?l=linux-kernel&m=113392370322545&w=2 > > > >Interesting! Missed this on the first time around... > > > >But doesn't notifier_call_chain_lockfree() need to either disable > >preemption or use atomic operations to update notifier_chain_lockfree_inuse[] > >in order to avoid problems with preemption? > > OK, I have thought about it and ... > > notifier_call_chain_lockfree() must be called with preempt disabled. > > The justification for this routine is to handle all the nasty > corner cases in the notify_die() and similar chains, including panic, > spinlocks being held and even non maskable interrupts. It is silly to > try to make notifier_call_chain_lockfree() handle the preemptible case > as well. Fair enough! A comment, perhaps? In a former life I would have also demanded debug code to verify that preemption/interrupts/whatever were actually disabled, given the very subtle nature of any resulting bugs... 
> If notifier_call_chain_lockfree() is to be called for task > notification, then the caller must disable preempt around the call to > notifier_call_chain_lockfree(). Scalability over lots of cpus should > not be an issue, especially if notifier_chain_lockfree_inuse[] is > converted to a per cpu variable. The amount of time spent with preempt > disabled is proportional to the number of registered callbacks on the > task notification chain and to the amount of work performed by those > callbacks, neither of which should be high. Ah, but the guys doing the latency measurements will be the judge of that, right? ;-) Thanx, Paul |
From: Keith O. <ka...@sg...> - 2006-01-12 07:50:57
|
"Paul E. McKenney" (on Wed, 11 Jan 2006 22:51:15 -0800) wrote: >On Thu, Jan 12, 2006 at 05:19:01PM +1100, Keith Owens wrote: >> OK, I have thought about it and ... >> >> notifier_call_chain_lockfree() must be called with preempt disabled. >> >Fair enough! A comment, perhaps? In a former life I would have also >demanded debug code to verify that preemption/interrupts/whatever were >actually disabled, given the very subtle nature of any resulting bugs... Comment - OK. Debug code is not required, the reference to smp_processor_id() already does all the debug checks that notifier_call_chain_lockfree() needs. CONFIG_PREEMPT_DEBUG is your friend. |
From: Paul E. M. <pa...@us...> - 2006-01-12 15:18:39
|
On Thu, Jan 12, 2006 at 06:50:34PM +1100, Keith Owens wrote: > "Paul E. McKenney" (on Wed, 11 Jan 2006 22:51:15 -0800) wrote: > >On Thu, Jan 12, 2006 at 05:19:01PM +1100, Keith Owens wrote: > >> OK, I have thought about it and ... > >> > >> notifier_call_chain_lockfree() must be called with preempt disabled. > >> > >Fair enough! A comment, perhaps? In a former life I would have also > >demanded debug code to verify that preemption/interrupts/whatever were > >actually disabled, given the very subtle nature of any resulting bugs... > > Comment - OK. Debug code is not required, the reference to > smp_processor_id() already does all the debug checks that > notifier_call_chain_lockfree() needs. CONFIG_PREEMPT_DEBUG is your > friend. Ah, debug_smp_processor_id(). Very cool!!! Thanx, Paul |