dle-develop Mailing List for Dependable Linux Effort (Page 26)

Brought to you by: h-aoki, hiramatu, s-oshima, y-sugita

dle-develop — Dependable Linux Effort mailing list for developers

You can subscribe to this list here.

2009	Jan	Feb	Mar	Apr	May (32)	Jun (66)	Jul (102)	Aug (78)	Sep (106)	Oct (137)	Nov (147)	Dec (147)
2010	Jan (71)	Feb (139)	Mar (86)	Apr (76)	May (57)	Jun (10)	Jul (12)	Aug (6)	Sep (8)	Oct (12)	Nov (12)	Dec (18)
2011	Jan (16)	Feb (19)	Mar (3)	Apr (1)	May (16)	Jun (17)	Jul (74)	Aug (22)	Sep (18)	Oct (24)	Nov (21)	Dec (30)
2012	Jan (31)	Feb (16)	Mar (22)	Apr (25)	May (18)	Jun (13)	Jul (83)	Aug (49)	Sep (20)	Oct (60)	Nov (35)	Dec (28)
2013	Jan (39)	Feb (61)	Mar (35)	Apr (21)	May (45)	Jun (56)	Jul (20)	Aug (9)	Sep (10)	Oct (31)	Nov (8)	Dec (4)
2014	Jan (6)	Feb (7)	Mar (7)	Apr (6)	May (4)	Jun (8)	Jul (5)	Aug (2)	Sep (4)	Oct (4)	Nov (11)	Dec (5)
2015	Jan (4)	Feb (4)	Mar (3)	Apr (4)	May (9)	Jun (4)	Jul (15)	Aug (8)	Sep (16)	Oct (18)	Nov (15)	Dec (7)
2016	Jan (20)	Feb (9)	Mar (15)	Apr (24)	May (16)	Jun (28)	Jul (22)	Aug (23)	Sep (18)	Oct (30)	Nov (40)	Dec (9)
2017	Jan (1)	Feb (8)	Mar (37)	Apr (26)	May (25)	Jun (46)	Jul (24)	Aug (9)	Sep	Oct	Nov	Dec

Flat | Threaded

<< < 1 .. 24 25 26 27 28 .. 99 > >> (Page 26 of 99)

Re: [Dle-develop] [libvirt] [PATCH] node_memory: Add '\n' to help message

From: Osier Y. <jy...@re...> - 2013-01-25 05:25:59

On 2013年01月25日 03:40, Satoru Moriya wrote:
> Linefeed is missed in the help of node-memory-tune.
> This patch just adds '\n' to get a correct help message.
>
> Signed-off-by: Satoru Moriya<sat...@hd...>
> ---
>   tools/virsh-host.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/virsh-host.c b/tools/virsh-host.c index d05e435..b83c893 100644
> --- a/tools/virsh-host.c
> +++ b/tools/virsh-host.c
> @@ -764,7 +764,7 @@ cmdVersion(vshControl *ctl, const vshCmd *cmd ATTRIBUTE_UNUSED)
>
>   static const vshCmdInfo info_node_memory_tune[] = {
>       {"help", N_("Get or set node memory parameters")},
> -    {"desc", N_("Get or set node memory parameters"
> +    {"desc", N_("Get or set node memory parameters\n"
>                   "    To get the memory parameters, use following command: \n\n"
>                   "    virsh # node-memory-tune")},
>       {NULL, NULL}
> --
> 1.7.11.7
>

ACK. And pushed, since there is no chance of this causing a problem for
the upcoming build.

[Dle-develop] [PATCH] node_memory: Add '\n' to help message

From: Satoru M. <sat...@hd...> - 2013-01-24 19:42:04

Linefeed is missed in the help of node-memory-tune.
This patch just adds '\n' to get a correct help message.

Signed-off-by: Satoru Moriya <sat...@hd...>
---
 tools/virsh-host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/virsh-host.c b/tools/virsh-host.c index d05e435..b83c893 100644
--- a/tools/virsh-host.c
+++ b/tools/virsh-host.c
@@ -764,7 +764,7 @@ cmdVersion(vshControl *ctl, const vshCmd *cmd ATTRIBUTE_UNUSED)
 
 static const vshCmdInfo info_node_memory_tune[] = {
     {"help", N_("Get or set node memory parameters")},
-    {"desc", N_("Get or set node memory parameters"
+    {"desc", N_("Get or set node memory parameters\n"
                 "    To get the memory parameters, use following command: \n\n"
                 "    virsh # node-memory-tune")},
     {NULL, NULL}
--
1.7.11.7

[Dle-develop] [PATCH v5 -next 2/2]efi_pstore: Introducing workqueue updating sysfs entries

From: Seiji A. <sei...@hd...> - 2013-01-24 00:42:05

[Problem]
efi_pstore creates sysfs entries, which enable users to access NVRAM,
in a write callback. If a kernel panic happens in an interrupt context,
the write may fail because it could sleep due to dynamic memory allocations
while creating sysfs entries.

[Patch Description]
This patch removes sysfs operations from a write callback by introducing 
a workqueue updating sysfs entries which is scheduled after the write
callback is called.

Also, the workqueue is kicked only in the oops case.
The system will go down in other cases such as panic, clean shutdown and emergency
restart, so we don't need to create sysfs entries there because there is no chance
for users to access them.

efi_pstore will be robust against a kernel panic in an interrupt context with this patch.

Signed-off-by: Seiji Aguchi <sei...@hd...>
---
 drivers/firmware/efivars.c |   85 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/efi.h        |    3 +-
 2 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index a64fb7b..6922511 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -158,6 +158,13 @@ efivar_create_sysfs_entry(struct efivars *efivars,
 			  efi_char16_t *variable_name,
 			  efi_guid_t *vendor_guid);
 
+/*
+ * Prototype for workqueue functions updating sysfs entry
+ */
+
+static void efivar_update_sysfs_entries(struct work_struct *);
+static DECLARE_WORK(efivar_work, efivar_update_sysfs_entries);
+
 /* Return the number of unicode characters in data */
 static unsigned long
 utf16_strnlen(efi_char16_t *s, size_t maxlength)
@@ -1248,11 +1255,8 @@ static int efi_pstore_write(enum pstore_type_id type,
 
 	spin_unlock_irqrestore(&efivars->lock, flags);
 
-	if (size)
-		ret = efivar_create_sysfs_entry(efivars,
-					  utf16_strsize(efi_name,
-							DUMP_NAME_LEN * 2),
-					  efi_name, &vendor);
+	if (reason == KMSG_DUMP_OOPS)
+		schedule_work(&efivar_work);
 
 	*id = part;
 	return ret;
@@ -1496,6 +1500,75 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 	return count;
 }
 
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
+{
+	struct efivar_entry *entry, *n;
+	struct efivars *efivars = &__efivars;
+	unsigned long strsize1, strsize2;
+	bool found = false;
+
+	strsize1 = utf16_strsize(variable_name, 1024);
+	list_for_each_entry_safe(entry, n, &efivars->list, list) {
+		strsize2 = utf16_strsize(entry->var.VariableName, 1024);
+		if (strsize1 == strsize2 &&
+			!memcmp(variable_name, &(entry->var.VariableName),
+				strsize2) &&
+			!efi_guidcmp(entry->var.VendorGuid,
+				*vendor)) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+static void efivar_update_sysfs_entries(struct work_struct *work)
+{
+	struct efivars *efivars = &__efivars;
+	efi_guid_t vendor;
+	efi_char16_t *variable_name;
+	unsigned long variable_name_size = 1024;
+	efi_status_t status = EFI_NOT_FOUND;
+	bool found;
+
+	/* Add new sysfs entries */
+	while (1) {
+		variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+		if (!variable_name) {
+			pr_err("efivars: Memory allocation failed.\n");
+			return;
+		}
+
+		spin_lock_irq(&efivars->lock);
+		found = false;
+		while (1) {
+			variable_name_size = 1024;
+			status = efivars->ops->get_next_variable(
+							&variable_name_size,
+							variable_name,
+							&vendor);
+			if (status != EFI_SUCCESS) {
+				break;
+			} else {
+				if (!variable_is_present(variable_name,
+				    &vendor)) {
+					found = true;
+					break;
+				}
+			}
+		}
+		spin_unlock_irq(&efivars->lock);
+
+		if (!found) {
+			kfree(variable_name);
+			break;
+		} else
+			efivar_create_sysfs_entry(efivars,
+						  variable_name_size,
+						  variable_name, &vendor);
+	}
+}
+
 /*
  * Let's not leave out systab information that snuck into
  * the efivars driver
@@ -1833,6 +1906,8 @@ err_put:
 static void __exit
 efivars_exit(void)
 {
+	cancel_work_sync(&efivar_work);
+
 	if (efi_enabled) {
 		unregister_efivars(&__efivars);
 		kobject_put(efi_kobj);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8b84916..6f94a25 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -728,7 +728,8 @@ struct efivars {
 	 * 1) ->list - adds, removals, reads, writes
 	 * 2) ops.[gs]et_variable() calls.
 	 * It must not be held when creating sysfs entries or calling kmalloc.
-	 * ops.get_next_variable() is only called from register_efivars(),
+	 * ops.get_next_variable() is only called from register_efivars()
+	 * or efivar_update_sysfs_entries(),
 	 * which is protected by the BKL, so that path is safe.
 	 */
 	spinlock_t lock;
-- 1.7.1

[Dle-develop] [PATCH v5 -next 1/2]efivars: Disable external interrupt while holding efivars->lock

From: Seiji A. <sei...@hd...> - 2013-01-24 00:41:35

[Problem]
There is a scenario in which efi_pstore fails to log messages in a panic case.

 - CPUA holds an efi_var->lock in either efivarfs parts 
   or efi_pstore with interrupt enabled.
 - CPUB panics and sends IPI to CPUA in smp_send_stop().
 - CPUA stops with holding the lock.
 - CPUB kicks efi_pstore_write() via kmsg_dump(KMSG_DUMP_PANIC)
   but it returns without logging messages.

[Patch Description]
This patch disables external interrupts while holding efivars->lock
as follows.

In efi_pstore_write() and get_var_data(), spin_lock/spin_unlock is 
replaced by spin_lock_irqsave/spin_unlock_irqrestore because they may 
be called in an interrupt context.

In other functions, they are replaced by spin_lock_irq/spin_unlock_irq
because they are all called from a process context.

By applying this patch, we can avoid the problem above with
the following scenario.

 - CPUA holds an efi_var->lock with interrupt disabled.
 - CPUB panics and sends IPI to CPUA in smp_send_stop().
 - CPUA receives the IPI after releasing the lock because it is
   disabling interrupt while holding the lock.
 - CPUB waits for one sec until CPUA releases the lock.
 - CPUB kicks efi_pstore_write() via kmsg_dump(KMSG_DUMP_PANIC)
   and it can acquire the lock successfully.

Signed-off-by: Seiji Aguchi <sei...@hd...>
Acked-by: Mike Waychison <mi...@go...>
---
 drivers/firmware/efivars.c |   86 ++++++++++++++++++++++---------------------
 1 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index ef5070d..a64fb7b 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -405,10 +405,11 @@ static efi_status_t
 get_var_data(struct efivars *efivars, struct efi_variable *var)
 {
 	efi_status_t status;
+	unsigned long flags;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irqsave(&efivars->lock, flags);
 	status = get_var_data_locked(efivars, var);
-	spin_unlock(&efivars->lock);
+	spin_unlock_irqrestore(&efivars->lock, flags);
 
 	if (status != EFI_SUCCESS) {
 		printk(KERN_WARNING "efivars: get_variable() failed 0x%lx!\n",
@@ -537,14 +538,14 @@ efivar_store_raw(struct efivar_entry *entry, const char *buf, size_t count)
 		return -EINVAL;
 	}
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	status = efivars->ops->set_variable(new_var->VariableName,
 					    &new_var->VendorGuid,
 					    new_var->Attributes,
 					    new_var->DataSize,
 					    new_var->Data);
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	if (status != EFI_SUCCESS) {
 		printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
@@ -713,7 +714,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 	 * amounts of memory. Pick a default size of 64K if
 	 * QueryVariableInfo() isn't supported by the firmware.
 	 */
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	if (!efivars->ops->query_variable_info)
 		status = EFI_UNSUPPORTED;
@@ -723,7 +724,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 						   &remaining_size, &max_size);
 	}
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	if (status != EFI_SUCCESS) {
 		if (status != EFI_UNSUPPORTED)
@@ -754,7 +755,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 	 * set_variable call, and removal of the variable from the efivars
 	 * list (in the case of an authenticated delete).
 	 */
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	status = efivars->ops->set_variable(var->var.VariableName,
 					    &var->var.VendorGuid,
@@ -762,7 +763,7 @@ static ssize_t efivarfs_file_write(struct file *file,
 					    data);
 
 	if (status != EFI_SUCCESS) {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		kfree(data);
 
 		return efi_status_to_err(status);
@@ -783,20 +784,20 @@ static ssize_t efivarfs_file_write(struct file *file,
 					    NULL);
 
 	if (status == EFI_BUFFER_TOO_SMALL) {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		mutex_lock(&inode->i_mutex);
 		i_size_write(inode, newdatasize + sizeof(attributes));
 		mutex_unlock(&inode->i_mutex);
 
 	} else if (status == EFI_NOT_FOUND) {
 		list_del(&var->list);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		efivar_unregister(var);
 		drop_nlink(inode);
 		dput(file->f_dentry);
 
 	} else {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		pr_warn("efivarfs: inconsistent EFI variable implementation? "
 				"status = %lx\n", status);
 	}
@@ -818,11 +819,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	void *data;
 	ssize_t size = 0;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	status = efivars->ops->get_variable(var->var.VariableName,
 					    &var->var.VendorGuid,
 					    &attributes, &datasize, NULL);
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	if (status != EFI_BUFFER_TOO_SMALL)
 		return efi_status_to_err(status);
@@ -832,12 +833,12 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
 	if (!data)
 		return -ENOMEM;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	status = efivars->ops->get_variable(var->var.VariableName,
 					    &var->var.VendorGuid,
 					    &attributes, &datasize,
 					    (data + sizeof(attributes)));
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	if (status != EFI_SUCCESS) {
 		size = efi_status_to_err(status);
@@ -965,9 +966,9 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	kobject_uevent(&var->kobj, KOBJ_ADD);
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	list_add(&var->list, &efivars->list);
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 	d_instantiate(dentry, inode);
 	dget(dentry);
 out:
@@ -984,7 +985,7 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct efivars *efivars = var->efivars;
 	efi_status_t status;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	status = efivars->ops->set_variable(var->var.VariableName,
 					    &var->var.VendorGuid,
@@ -992,14 +993,14 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (status == EFI_SUCCESS || status == EFI_NOT_FOUND) {
 		list_del(&var->list);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		efivar_unregister(var);
 		drop_nlink(dir);
 		dput(dentry);
 		return 0;
 	}
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 	return -EINVAL;
 };
 
@@ -1065,13 +1066,13 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
 		/* copied by the above to local storage in the dentry. */
 		kfree(name);
 
-		spin_lock(&efivars->lock);
+		spin_lock_irq(&efivars->lock);
 		efivars->ops->get_variable(entry->var.VariableName,
 					   &entry->var.VendorGuid,
 					   &entry->var.Attributes,
 					   &size,
 					   NULL);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 
 		mutex_lock(&inode->i_mutex);
 		inode->i_private = entry;
@@ -1122,7 +1123,7 @@ static int efi_pstore_open(struct pstore_info *psi)
 {
 	struct efivars *efivars = psi->data;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	efivars->walk_entry = list_first_entry(&efivars->list,
 					       struct efivar_entry, list);
 	return 0;
@@ -1132,7 +1133,7 @@ static int efi_pstore_close(struct pstore_info *psi)
 {
 	struct efivars *efivars = psi->data;
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 	return 0;
 }
 
@@ -1208,6 +1209,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 	int i, ret = 0;
 	u64 storage_space, remaining_space, max_variable_size;
 	efi_status_t status = EFI_NOT_FOUND;
+	unsigned long flags;
 
 	if (pstore_cannot_block_path(reason)) {
 		/*
@@ -1215,10 +1217,10 @@ static int efi_pstore_write(enum pstore_type_id type,
 		 * this driver returns without entering firmware to avoid
 		 * hanging up.
 		 */
-		if (!spin_trylock(&efivars->lock))
+		if (!spin_trylock_irqsave(&efivars->lock, flags))
 			return -EBUSY;
 	} else
-		spin_lock(&efivars->lock);
+		spin_lock_irqsave(&efivars->lock, flags);
 
 	/*
 	 * Check if there is a space enough to log.
@@ -1230,7 +1232,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 						   &remaining_space,
 						   &max_variable_size);
 	if (status || remaining_space < size + DUMP_NAME_LEN * 2) {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irqrestore(&efivars->lock, flags);
 		*id = part;
 		return -ENOSPC;
 	}
@@ -1244,7 +1246,7 @@ static int efi_pstore_write(enum pstore_type_id type,
 	efivars->ops->set_variable(efi_name, &vendor, PSTORE_EFI_ATTRIBUTES,
 				   size, psi->buf);
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irqrestore(&efivars->lock, flags);
 
 	if (size)
 		ret = efivar_create_sysfs_entry(efivars,
@@ -1271,7 +1273,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count,
 	sprintf(name, "dump-type%u-%u-%d-%lu", type, (unsigned int)id, count,
 		time.tv_sec);
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	for (i = 0; i < DUMP_NAME_LEN; i++)
 		efi_name[i] = name[i];
@@ -1315,7 +1317,7 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count,
 	if (found)
 		list_del(&found->list);
 
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	if (found)
 		efivar_unregister(found);
@@ -1385,7 +1387,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
 		return -EINVAL;
 	}
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	/*
 	 * Does this variable already exist?
@@ -1403,7 +1405,7 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
 		}
 	}
 	if (found) {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		return -EINVAL;
 	}
 
@@ -1417,10 +1419,10 @@ static ssize_t efivar_create(struct file *filp, struct kobject *kobj,
 	if (status != EFI_SUCCESS) {
 		printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
 			status);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		return -EIO;
 	}
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	/* Create the entry in sysfs.  Locking is not required here */
 	status = efivar_create_sysfs_entry(efivars,
@@ -1448,7 +1450,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 
 	/*
 	 * Does this variable already exist?
@@ -1466,7 +1468,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 		}
 	}
 	if (!found) {
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		return -EINVAL;
 	}
 	/* force the Attributes/DataSize to 0 to ensure deletion */
@@ -1482,12 +1484,12 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
 	if (status != EFI_SUCCESS) {
 		printk(KERN_WARNING "efivars: set_variable() failed: status=%lx\n",
 			status);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		return -EIO;
 	}
 	list_del(&search_efivar->list);
 	/* We need to release this lock before unregistering. */
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 	efivar_unregister(search_efivar);
 
 	/* It's dead Jim.... */
@@ -1602,9 +1604,9 @@ efivar_create_sysfs_entry(struct efivars *efivars,
 	kfree(short_name);
 	short_name = NULL;
 
-	spin_lock(&efivars->lock);
+	spin_lock_irq(&efivars->lock);
 	list_add(&new_efivar->list, &efivars->list);
-	spin_unlock(&efivars->lock);
+	spin_unlock_irq(&efivars->lock);
 
 	return 0;
 }
@@ -1673,9 +1675,9 @@ void unregister_efivars(struct efivars *efivars)
 	struct efivar_entry *entry, *n;
 
 	list_for_each_entry_safe(entry, n, &efivars->list, list) {
-		spin_lock(&efivars->lock);
+		spin_lock_irq(&efivars->lock);
 		list_del(&entry->list);
-		spin_unlock(&efivars->lock);
+		spin_unlock_irq(&efivars->lock);
 		efivar_unregister(entry);
 	}
 	if (efivars->new_var)
-- 
1.7.1

[Dle-develop] [PATCH v5 -next 0/2] make efivars/efi_pstore interrupt-safe

From: Seiji A. <sei...@hd...> - 2013-01-24 00:40:56

Changelog
v4 -> v5
   - Rebase from a linus tree to a linux-next tree to avoid getting
     a conflict when this patchset is merged to a linux-next tree.
   - Merge previous patches 2/3 and 3/3 into 2/2 because they fix
     a same problem.
   - Modify to fit a latest upstream kernel as follows.
    - Change spinlock operations of efivarfs which has been 
      introduced recently.(Patch 1/2)
    - Remove delete_all_stale_sysfs_entries() from update_sysfs_entries()
      because the current efi_pstore doesn't erase existing entries
      in a write callback and sysfs entries don't become stale. 
      (Patch 2/2)

v3 -> v4
  - Patch 2/3
    Move cancel_work_sync() above an efi_enabled test in efivars_exit().

v2 -> v3
  - Patch 1/3
    Replace spin_lock_irqsave/spin_unlock_irqrestore with spin_lock_irq/spin_unlock_irq in efivars_unregister(),
    efivar_create(), efivar_store_raw() and efivar_delete() which are called in a process context. 

 - Patch 2/3
    Change a name of delete_sysfs_entry() to delete_all_stale_sysfs_entries().
    Also, don't release an efivar->lock while searching efivar->list in delete_all_stale_sysfs_entries().

 - Patch 3/3
    Remove a logic in efi_pstore_erase() which freshly created in patch v2.

v1 -> v2
 - Patch 1/3
    Add spin_lock_irq/spin_unlock_irq to open/close callbacks of efi_pstore 
    instead of moving spin_locks to a read callback.    

 - Patch 2/3
    Replace a periodical timer with schedule_work().

 - Patch 3/3
    freshly create to kick a workqueue in oops case only.

[Problem]
 There are following problems related to an interrupt context in efivars
 including efivarfs and efi_pstore.

(1)There is a scenario in which efi_pstore fails to log messages
   in a panic case.

   - CPUA holds an efi_var->lock in either efivarfs parts 
     or efi_pstore with interrupt enabled.
   - CPUB panics and sends IPI to CPUA in smp_send_stop().
   - CPUA stops with holding the lock.
   - CPUB kicks efi_pstore_write() via kmsg_dump(KMSG_DUMP_PANIC)
     but it returns without logging messages.

(2)Also, efi_pstore creates sysfs entries, which enable users to access
   NVRAM, in a write callback.
   If a kernel panic happens in an interrupt context, pstore may fail
   because it could sleep due to dynamic memory allocations while creating
   sysfs entries.
   An actual failure due to the create_sysfs_entry() has been reported.
   http://comments.gmane.org/gmane.linux.kernel.efi/406

To resolve problems above, this patchset makes efivars/efi_pstore
interrupt-safe.

[Patch Description]
  Please see detailed explanations in each patch.

Seiji Aguchi (2):
  efivars: Disable external interrupt while holding efivars->lock
   - This patch fixes a problem (1).

  efi_pstore: Introducing workqueue updating sysfs entries
   - This patch fixes a problem (2). 

 drivers/firmware/efivars.c |  171 ++++++++++++++++++++++++++++++++------------
 include/linux/efi.h        |    3 +-
 2 files changed, 126 insertions(+), 48 deletions(-)

Re: [Dle-develop] [Qemu-devel] [PATCH] Add option to mlock guest and qemu memory

From: Jan K. <jan...@si...> - 2013-01-22 14:58:43

On 2013-01-22 15:45, Satoru Moriya wrote:
> On 01/21/2013 04:43 PM, Marcelo Tosatti wrote:
>> On Fri, Sep 28, 2012 at 10:05:09AM +0200, Jan Kiszka wrote:
>>> On 2012-09-28 01:21, Satoru Moriya wrote:
>>>> This is a first time for me to post a patch to qemu-devel.
>>>> If there is something missing/wrong, please let me know.
>>>>
>>>> We have some plans to migrate old enterprise systems which require 
>>>> low latency (msec order) to kvm virtualized environment. Usually, we 
>>>> uses mlock to preallocate and pin down process memory in order to 
>>>> avoid page allocation in latency critical path. On the other hand, 
>>>> in kvm environment, mlocking in guests is not effective because it 
>>>> can't avoid page reclaim in host. Actually, to avoid guest memory 
>>>> reclaim, qemu has "mem-path" option that is actually for using 
>>>> hugepage. But a memory region of qemu is not allocated on hugepage, 
>>>> so it may be reclaimed. That may cause a latency problem.
>>>>
>>>> To avoid guest and qemu memory reclaim, this patch introduces a new 
>>>> "mlock" option. With this option, we can preallocate and pin down 
>>>> guest and qemu memory before booting guest OS.
>>>
>>> I guess this reduces the likeliness of multi-millisecond latencies 
>>> for you but not eliminate them. Of course, mlockall is part of our 
>>> local changes for real-time QEMU/KVM, but it is just one of the many 
>>> pieces required. I'm wondering how the situation is on your side.
>>>
>>> I think mlockall should once be enabled automatically as soon as you 
>>> ask for real-time support for QEMU guests. How that should be 
>>> controlled is another question. I'm currently carrying a top-level 
>>> switch "-rt maxprio=x[,policy=y]" here, likely not the final 
>>> solution. I'm not really convinced we need to control memory locking 
>>> separately. And as we are very reluctant to add new top-level 
>>> switches, this is even more important.
>>
>> In certain scenarios, latency induced by paging is significant and 
>> memory locking is sufficient.
>>
>> Moreover, scenarios with untrusted guests for which latency 
>> improvement due to mlock is desired, realtime priority is problematic 
>> (guests whose QEMU threads have realtime priority can abuse the host system).
> 
> Right, our usecase is of multiple guests with untrusted VMs.

If you cannot dedicate resources (CPU cores) to the guest, you can still
throttle its RT bandwidth.

Nevertheless, I'm also fine with making this property separately
controllable via -realtime. Enabling -realtime may not require setting a
priority > 0, thus will keep all threads at SCHED_OTHER in that case.
But it will default to enable mlockall. In addition, if you feel like,
-realtime mlock=true|false could be provided to make even this configurable.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

Re: [Dle-develop] [Qemu-devel] [PATCH] Add option to mlock guest and qemu memory

From: Satoru M. <sat...@hd...> - 2013-01-22 14:45:21

On 01/21/2013 04:43 PM, Marcelo Tosatti wrote:
> On Fri, Sep 28, 2012 at 10:05:09AM +0200, Jan Kiszka wrote:
>> On 2012-09-28 01:21, Satoru Moriya wrote:
>>> This is a first time for me to post a patch to qemu-devel.
>>> If there is something missing/wrong, please let me know.
>>>
>>> We have some plans to migrate old enterprise systems which require 
>>> low latency (msec order) to kvm virtualized environment. Usually, we 
>>> uses mlock to preallocate and pin down process memory in order to 
>>> avoid page allocation in latency critical path. On the other hand, 
>>> in kvm environment, mlocking in guests is not effective because it 
>>> can't avoid page reclaim in host. Actually, to avoid guest memory 
>>> reclaim, qemu has "mem-path" option that is actually for using 
>>> hugepage. But a memory region of qemu is not allocated on hugepage, 
>>> so it may be reclaimed. That may cause a latency problem.
>>>
>>> To avoid guest and qemu memory reclaim, this patch introduces a new 
>>> "mlock" option. With this option, we can preallocate and pin down 
>>> guest and qemu memory before booting guest OS.
>>
>> I guess this reduces the likeliness of multi-millisecond latencies 
>> for you but not eliminate them. Of course, mlockall is part of our 
>> local changes for real-time QEMU/KVM, but it is just one of the many 
>> pieces required. I'm wondering how the situation is on your side.
>>
>> I think mlockall should once be enabled automatically as soon as you 
>> ask for real-time support for QEMU guests. How that should be 
>> controlled is another question. I'm currently carrying a top-level 
>> switch "-rt maxprio=x[,policy=y]" here, likely not the final 
>> solution. I'm not really convinced we need to control memory locking 
>> separately. And as we are very reluctant to add new top-level 
>> switches, this is even more important.
> 
> In certain scenarios, latency induced by paging is significant and 
> memory locking is sufficient.
> 
> Moreover, scenarios with untrusted guests for which latency 
> improvement due to mlock is desired, realtime priority is problematic 
> (guests whose QEMU threads have realtime priority can abuse the host system).

Right, our usecase is of multiple guests with untrusted VMs.

Regards,
Satoru

[Dle-develop] [RFC][PATCH v8 3/3] trace, x86: code-sharing between non-trace and trace irq handlers

From: Seiji A. <sei...@hd...> - 2013-01-21 22:15:12

[Issue]

Currently, irq vector handlers for tracing are just
copies of the non-trace handlers with tracepoints inserted.

This duplicated code is difficult to maintain.

[Solution]

This patch shares common code between non-trace and trace handlers
as follows to make them manageable and readable.

Non-trace irq handler:
smp_irq_handler()
{
	entering_irq(); /* pre-processing of this handler */
	__smp_irq_handler(); /*
                          * common logic between non-trace and trace handlers
                          * in a vector.
                          */
	exiting_irq(); /* post-processing of this handler */

}

Trace irq_handler:
smp_trace_irq_handler()
{
	entering_irq(); /* pre-processing of this handler */
	trace_irq_entry(); /* tracepoint for irq entry */
	__smp_irq_handler(); /*
                          * common logic between non-trace and trace handlers
                          * in a vector.
                          */
	trace_irq_exit(); /* tracepoint for irq exit */
	exiting_irq(); /* post-processing of this handler */

}

Signed-off-by: Seiji Aguchi <sei...@hd...>
---
 arch/x86/include/asm/apic.h              |   27 ++++++++
 arch/x86/kernel/apic/apic.c              |  103 ++++++++----------------------
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   24 +++----
 arch/x86/kernel/cpu/mcheck/threshold.c   |   24 +++----
 arch/x86/kernel/irq.c                    |   34 +++-------
 arch/x86/kernel/irq_work.c               |   22 ++++--
 arch/x86/kernel/smp.c                    |   54 ++++++++++------
 7 files changed, 137 insertions(+), 151 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3388034..f8119b5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -12,6 +12,7 @@
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/msr.h>
+#include <asm/idle.h>
 
 #define ARCH_APICTIMER_STOPS_ON_C3	1
 
@@ -687,5 +688,31 @@ extern int default_check_phys_apicid_present(int phys_apicid);
 #endif
 
 #endif /* CONFIG_X86_LOCAL_APIC */
+extern void irq_enter(void);
+extern void irq_exit(void);
+
+static inline void entering_irq(void)
+{
+	irq_enter();
+	exit_idle();
+}
+
+static inline void entering_ack_irq(void)
+{
+	ack_APIC_irq();
+	entering_irq();
+}
+
+static inline void exiting_irq(void)
+{
+	irq_exit();
+}
+
+static inline void exiting_ack_irq(void)
+{
+	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
+}
 
 #endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 89f3f4d..c146cbc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -922,17 +922,14 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
+	 *
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 	local_apic_timer_interrupt();
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 }
@@ -944,19 +941,16 @@ void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
+	 *
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 	trace_local_timer_entry(LOCAL_TIMER_VECTOR);
 	local_apic_timer_interrupt();
 	trace_local_timer_exit(LOCAL_TIMER_VECTOR);
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 }
@@ -1935,12 +1929,10 @@ int __init APIC_init_uniprocessor(void)
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-void smp_spurious_interrupt(struct pt_regs *regs)
+static inline void __smp_spurious_interrupt(void)
 {
 	u32 v;
 
-	irq_enter();
-	exit_idle();
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
 	 * if it is a vectored one.  Just in case...
@@ -1955,38 +1947,28 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
 	pr_info("spurious APIC interrupt on CPU#%d, "
 		"should never happen.\n", smp_processor_id());
-	irq_exit();
 }
 
-void smp_trace_spurious_interrupt(struct pt_regs *regs)
+void smp_spurious_interrupt(struct pt_regs *regs)
 {
-	u32 v;
+	entering_irq();
+	__smp_spurious_interrupt();
+	exiting_irq();
+}
 
-	irq_enter();
-	exit_idle();
+void smp_trace_spurious_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
 	trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
-	/*
-	 * Check if this really is a spurious interrupt and ACK it
-	 * if it is a vectored one.  Just in case...
-	 * Spurious interrupts should not be ACKed.
-	 */
-	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-		ack_APIC_irq();
-
-	inc_irq_stat(irq_spurious_count);
-
-	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
-	pr_info("spurious APIC interrupt on CPU#%d, "
-		"should never happen.\n", smp_processor_id());
+	__smp_spurious_interrupt();
 	trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-void smp_error_interrupt(struct pt_regs *regs)
+static inline void __smp_error_interrupt(struct pt_regs *regs)
 {
 	u32 v0, v1;
 	u32 i = 0;
@@ -2001,8 +1983,6 @@ void smp_error_interrupt(struct pt_regs *regs)
 		"Illegal register address",	/* APIC Error Bit 7 */
 	};
 
-	irq_enter();
-	exit_idle();
 	/* First tickle the hardware, only then report what went on. -- REW */
 	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
@@ -2023,49 +2003,22 @@ void smp_error_interrupt(struct pt_regs *regs)
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
 
-	irq_exit();
 }
 
-void smp_trace_error_interrupt(struct pt_regs *regs)
+void smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v0, v1;
-	u32 i = 0;
-	static const char * const error_interrupt_reason[] = {
-		"Send CS error",		/* APIC Error Bit 0 */
-		"Receive CS error",		/* APIC Error Bit 1 */
-		"Send accept error",		/* APIC Error Bit 2 */
-		"Receive accept error",		/* APIC Error Bit 3 */
-		"Redirectable IPI",		/* APIC Error Bit 4 */
-		"Send illegal vector",		/* APIC Error Bit 5 */
-		"Received illegal vector",	/* APIC Error Bit 6 */
-		"Illegal register address",	/* APIC Error Bit 7 */
-	};
+	entering_irq();
+	__smp_error_interrupt(regs);
+	exiting_irq();
+}
 
-	irq_enter();
-	exit_idle();
+void smp_trace_error_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
 	trace_error_apic_entry(ERROR_APIC_VECTOR);
-	/* First tickle the hardware, only then report what went on. -- REW */
-	v0 = apic_read(APIC_ESR);
-	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
-	ack_APIC_irq();
-	atomic_inc(&irq_err_count);
-
-	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
-		    smp_processor_id(), v0 , v1);
-
-	v1 = v1 & 0xff;
-	while (v1) {
-		if (v1 & 0x1)
-			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
-		i++;
-		v1 >>= 1;
-	}
-
-	apic_printk(APIC_DEBUG, KERN_CONT "\n");
-
+	__smp_error_interrupt(regs);
 	trace_error_apic_exit(ERROR_APIC_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e7aa7fc..2f3a799 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -379,28 +379,26 @@ static void unexpected_thermal_interrupt(void)
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
 
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+static inline void __smp_thermal_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
 	inc_irq_stat(irq_thermal_count);
 	smp_thermal_vector();
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+}
+
+asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
+	__smp_thermal_interrupt();
+	exiting_ack_irq();
 }
 
 asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs)
 {
-	irq_enter();
-	exit_idle();
+	entering_irq();
 	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
-	inc_irq_stat(irq_thermal_count);
-	smp_thermal_vector();
+	__smp_thermal_interrupt();
 	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+	exiting_ack_irq();
 }
 
 /* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 0cbef99..fe6b1c8 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -18,26 +18,24 @@ static void default_threshold_interrupt(void)
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
-asmlinkage void smp_threshold_interrupt(void)
+static inline void __smp_threshold_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
 	inc_irq_stat(irq_threshold_count);
 	mce_threshold_vector();
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+}
+
+asmlinkage void smp_threshold_interrupt(void)
+{
+	entering_irq();
+	__smp_threshold_interrupt();
+	exiting_ack_irq();
 }
 
 asmlinkage void smp_trace_threshold_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
+	entering_irq();
 	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
-	inc_irq_stat(irq_threshold_count);
-	mce_threshold_vector();
+	__smp_threshold_interrupt();
 	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+	exiting_ack_irq();
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 216bec1..ae836cd 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -209,23 +209,21 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 /*
  * Handler for X86_PLATFORM_IPI_VECTOR.
  */
-void smp_x86_platform_ipi(struct pt_regs *regs)
+void __smp_x86_platform_ipi(void)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
 	inc_irq_stat(x86_platform_ipis);
 
 	if (x86_platform_ipi_callback)
 		x86_platform_ipi_callback();
+}
 
-	irq_exit();
+void smp_x86_platform_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	entering_ack_irq();
+	__smp_x86_platform_ipi();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
@@ -233,21 +231,11 @@ void smp_trace_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
-	inc_irq_stat(x86_platform_ipis);
-
-	if (x86_platform_ipi_callback)
-		x86_platform_ipi_callback();
-
+	__smp_x86_platform_ipi();
 	trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
-	irq_exit();
-
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 09e6262..636a55e 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,24 +10,32 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
 
-void smp_irq_work_interrupt(struct pt_regs *regs)
+static inline void irq_work_entering_irq(void)
 {
 	irq_enter();
 	ack_APIC_irq();
+}
+
+static inline void __smp_irq_work_interrupt(void)
+{
 	inc_irq_stat(apic_irq_work_irqs);
 	irq_work_run();
-	irq_exit();
+}
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+	irq_work_entering_irq();
+	__smp_irq_work_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_enter();
-	ack_APIC_irq();
+	irq_work_entering_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
-	inc_irq_stat(apic_irq_work_irqs);
-	irq_work_run();
+	__smp_irq_work_interrupt();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 void arch_irq_work_raise(void)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index aad58af..f4fe0b8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -250,11 +250,16 @@ finish:
 /*
  * Reschedule call back.
  */
-void smp_reschedule_interrupt(struct pt_regs *regs)
+static inline void __smp_reschedule_interrupt(void)
 {
-	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
 	scheduler_ipi();
+}
+
+void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	__smp_reschedule_interrupt();
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
@@ -264,52 +269,61 @@ void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
-	inc_irq_stat(irq_resched_count);
-	scheduler_ipi();
+	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
 }
 
-void smp_call_function_interrupt(struct pt_regs *regs)
+static inline void call_function_entering_irq(void)
 {
 	ack_APIC_irq();
 	irq_enter();
+}
+
+static inline void __smp_call_function_interrupt(void)
+{
 	generic_smp_call_function_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
+}
+
+void smp_call_function_interrupt(struct pt_regs *regs)
+{
+	call_function_entering_irq();
+	__smp_call_function_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	ack_APIC_irq();
-	irq_enter();
+	call_function_entering_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
-	generic_smp_call_function_interrupt();
-	inc_irq_stat(irq_call_count);
+	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
-void smp_call_function_single_interrupt(struct pt_regs *regs)
+static inline void __smp_call_function_single_interrupt(void)
 {
-	ack_APIC_irq();
-	irq_enter();
 	generic_smp_call_function_single_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
+}
+
+void smp_call_function_single_interrupt(struct pt_regs *regs)
+{
+	call_function_entering_irq();
+	__smp_call_function_single_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	ack_APIC_irq();
-	irq_enter();
+	call_function_entering_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
-	generic_smp_call_function_single_interrupt();
-	inc_irq_stat(irq_call_count);
+	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 static int __init nonmi_ipi_setup(char *str)
-- 
1.7.1

[Dle-develop] [RFC][PATCH v8 2/3] trace, x86: add x86 irq vector tracepoints

From: Seiji A. <sei...@hd...> - 2013-01-21 22:14:45

[Purpose of this patch]

As Vaibhav explained in the thread below, tracepoints for irq vectors
are useful.

http://www.spinics.net/lists/mm-commits/msg85707.html

<snip>
The current interrupt traces from irq_handler_entry and irq_handler_exit
provide when an interrupt is handled.  They provide good data about when
the system has switched to kernel space and how it affects the currently
running processes.

There are some IRQ vectors which trigger the system into kernel space,
which are not handled in generic IRQ handlers.  Tracing such events gives
us the information about IRQ interaction with other system events.

The trace also tells where the system is spending its time.  We want to
know which cores are handling interrupts and how they are affecting other
processes in the system.  Also, the trace provides information about when
the cores are idle and which interrupts are changing that state.
<snip>

On the other hand, my use case is tracing just the local timer event and
getting the value of the instruction pointer.

I previously suggested adding an argument to the local timer event to get the instruction pointer.
But there is another way to get it, using an external module such as systemtap.
So, I don't need to add any argument to the irq vector tracepoints now.

[Patch Description]

Vaibhav's patch shared a single tracepoint pair, irq_vector_entry/irq_vector_exit, across all events.
But, as described above, there is a use case for tracing a specific irq vector rather than tracing all events.
In that case, we are concerned about the overhead due to unwanted events.

This patch adds the following tracepoints instead of introducing irq_vector_entry/exit,
so that we can enable them independently.
   - local_timer_vector
   - reschedule_vector
   - call_function_vector
   - call_function_single_vector
   - irq_work_entry_vector
   - error_apic_vector
   - thermal_apic_vector
   - threshold_apic_vector
   - spurious_apic_vector
   - x86_platform_ipi_vector

Also, it introduces logic that switches the IDT when tracepoints are enabled/disabled, so that the time penalty
is zero while tracepoints are disabled. Detailed explanations are as follows.
 - Duplicate the irq handlers as new handlers with tracepoints inserted.
 - Create a new IDT, trace_idt_table, at boot time by adding logic to
   _set_gate(). It is just a copy of the original IDT.
 - Register the new handlers for tracepoints in the new IDT by introducing
   macros in alloc_intr_gate(), which is called when the irq_vector handlers are registered.
 - Switch the IDT to the new one when tracepoints are enabled.
 - Restore the original IDT when tracepoints are disabled.
The new IDT is created only when CONFIG_TRACING is enabled, so that it is not used for other purposes.

Signed-off-by: Seiji Aguchi <sei...@hd...>
---
 arch/x86/include/asm/desc.h              |   33 ++++++-
 arch/x86/include/asm/entry_arch.h        |    5 +-
 arch/x86/include/asm/hw_irq.h            |   16 +++
 arch/x86/include/asm/trace/irq_vectors.h |  159 ++++++++++++++++++++++++++++++
 arch/x86/kernel/Makefile                 |    1 +
 arch/x86/kernel/apic/apic.c              |   94 ++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   14 +++
 arch/x86/kernel/cpu/mcheck/threshold.c   |   14 +++
 arch/x86/kernel/entry_32.S               |   12 ++-
 arch/x86/kernel/entry_64.S               |   27 ++++-
 arch/x86/kernel/head_64.S                |    6 +
 arch/x86/kernel/irq.c                    |   23 +++++
 arch/x86/kernel/irq_work.c               |   12 +++
 arch/x86/kernel/smp.c                    |   35 +++++++
 arch/x86/kernel/tracepoint.c             |   61 ++++++++++++
 include/xen/events.h                     |    3 +
 16 files changed, 505 insertions(+), 10 deletions(-)
 create mode 100644 arch/x86/include/asm/trace/irq_vectors.h
 create mode 100644 arch/x86/kernel/tracepoint.c

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 8bf1c06..f2a381b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -320,6 +320,17 @@ static inline void set_nmi_gate(int gate, void *addr)
 }
 #endif
 
+#ifdef CONFIG_TRACING
+extern gate_desc trace_idt_table[];
+static inline void trace_set_intr_gate(unsigned int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(trace_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
@@ -331,6 +342,9 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
 	 * setup time
 	 */
 	write_idt_entry(idt_table, gate, &s);
+#ifdef CONFIG_TRACING
+	write_idt_entry(trace_idt_table, gate, &s);
+#endif
 }
 
 /*
@@ -360,12 +374,27 @@ static inline void alloc_system_vector(int vector)
 	}
 }
 
-static inline void alloc_intr_gate(unsigned int n, void *addr)
+#ifdef CONFIG_TRACING
+static inline void __trace_alloc_intr_gate(unsigned int n, void *addr)
+{
+	trace_set_intr_gate(n, addr);
+}
+#else
+#define __trace_alloc_intr_gate(n, addr)
+#endif
+
+static inline void __alloc_intr_gate(unsigned int n, void *addr)
 {
-	alloc_system_vector(n);
 	set_intr_gate(n, addr);
 }
 
+#define alloc_intr_gate(n, addr)				\
+	do {							\
+		alloc_system_vector(n);				\
+		__alloc_intr_gate(n, addr);			\
+		__trace_alloc_intr_gate(n, trace_##addr);	\
+	} while (0)
+
 /*
  * This routine sets up an interrupt gate at directory privilege level 3.
  */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa00..0bb99d8 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -13,8 +13,9 @@
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
-BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
-BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
+BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR,
+		 smp_irq_move_cleanup_interrupt)
+BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt)
 #endif
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6e..2e297d8 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -76,6 +76,22 @@ extern void threshold_interrupt(void);
 extern void call_function_interrupt(void);
 extern void call_function_single_interrupt(void);
 
+#ifdef CONFIG_TRACING
+/* Interrupt handlers registered during init_IRQ */
+extern void trace_apic_timer_interrupt(void);
+extern void trace_x86_platform_ipi(void);
+extern void trace_error_interrupt(void);
+extern void trace_irq_work_interrupt(void);
+extern void trace_spurious_interrupt(void);
+extern void trace_thermal_interrupt(void);
+extern void trace_reschedule_interrupt(void);
+extern void trace_threshold_interrupt(void);
+extern void trace_call_function_interrupt(void);
+extern void trace_call_function_single_interrupt(void);
+#define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
+#define trace_reboot_interrupt  reboot_interrupt
+#endif /* CONFIG_TRACING */
+
 /* IOAPIC */
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
 extern unsigned long io_apic_irqs;
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
new file mode 100644
index 0000000..b4f1c53
--- /dev/null
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -0,0 +1,159 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq_vectors
+
+#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_VECTORS_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_irq_vector_regfunc(void);
+extern void trace_irq_vector_unregfunc(void);
+
+DECLARE_EVENT_CLASS(x86_irq_vector,
+
+	TP_PROTO(int vector),
+
+	TP_ARGS(vector),
+
+	TP_STRUCT__entry(
+		__field(		int,	vector	)
+	),
+
+	TP_fast_assign(
+		__entry->vector = vector;
+	),
+
+	TP_printk("vector=%d", __entry->vector) );
+
+#define DEFINE_IRQ_VECTOR_EVENT(name)	\
+DEFINE_EVENT_FN(x86_irq_vector, name,	\
+	TP_PROTO(int vector),		\
+	TP_ARGS(vector),		\
+	trace_irq_vector_regfunc,	\
+	trace_irq_vector_unregfunc);
+
+/*
+ * local_timer_entry - called before entering a local timer interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(local_timer_entry);
+
+/*
+ * local_timer_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(local_timer_exit);
+
+/*
+ * reschedule_entry - called before entering a reschedule vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(reschedule_entry);
+
+/*
+ * reschedule_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(reschedule_exit);
+
+/*
+ * spurious_apic_entry - called before entering a spurious apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(spurious_apic_entry);
+
+/*
+ * spurious_apic_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(spurious_apic_exit);
+
+/*
+ * error_apic_entry - called before entering an error apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(error_apic_entry);
+
+/*
+ * error_apic_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(error_apic_exit);
+
+/*
+ * x86_platform_ipi_entry - called before entering a x86 platform ipi interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi_entry);
+
+/*
+ * x86_platform_ipi_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi_exit);
+
+/*
+ * irq_work_entry - called before entering an irq work interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(irq_work_entry);
+
+/*
+ * irq_work_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(irq_work_exit);
+
+/*
+ * call_function_entry - called before entering a call function interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_entry);
+
+/*
+ * call_function_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_exit);
+
+/*
+ * call_function_single_entry - called before entering a call function
+ * single interrupt vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_single_entry);
+
+/*
+ * call_function_single_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_single_exit);
+
+/*
+ * threshold_apic_entry - called before entering a threshold apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(threshold_apic_entry);
+
+/*
+ * threshold_apic_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(threshold_apic_exit);
+
+/*
+ * thermal_apic_entry - called before entering a thermal apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(thermal_apic_entry);
+
+/*
+ * thermal_apic_exit - called immediately after the interrupt vector
+ * handler returns
+ */
+DEFINE_IRQ_VECTOR_EVENT(thermal_apic_exit);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/include/asm/trace
+#define TRACE_INCLUDE_FILE irq_vectors
+#endif /*  _TRACE_IRQ_VECTORS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34e923a..24e2080 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_OF)			+= devicetree.o
 obj-$(CONFIG_UPROBES)			+= uprobes.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
+obj-$(CONFIG_TRACING)			+= tracepoint.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b994cc8..89f3f4d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -55,6 +55,9 @@
 #include <asm/tsc.h>
 #include <asm/hypervisor.h>
 
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
+
 unsigned int num_processors;
 
 unsigned disabled_cpus __cpuinitdata;
@@ -934,6 +937,30 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	irq_enter();
+	exit_idle();
+	trace_local_timer_entry(LOCAL_TIMER_VECTOR);
+	local_apic_timer_interrupt();
+	trace_local_timer_exit(LOCAL_TIMER_VECTOR);
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 int setup_profiling_timer(unsigned int multiplier)
 {
 	return -EINVAL;
@@ -1931,6 +1958,31 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_trace_spurious_interrupt(struct pt_regs *regs)
+{
+	u32 v;
+
+	irq_enter();
+	exit_idle();
+	trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
+	/*
+	 * Check if this really is a spurious interrupt and ACK it
+	 * if it is a vectored one.  Just in case...
+	 * Spurious interrupts should not be ACKed.
+	 */
+	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+		ack_APIC_irq();
+
+	inc_irq_stat(irq_spurious_count);
+
+	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
+	pr_info("spurious APIC interrupt on CPU#%d, "
+		"should never happen.\n", smp_processor_id());
+	trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR);
+	irq_exit();
+}
+
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
@@ -1974,6 +2026,48 @@ void smp_error_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_trace_error_interrupt(struct pt_regs *regs)
+{
+	u32 v0, v1;
+	u32 i = 0;
+	static const char * const error_interrupt_reason[] = {
+		"Send CS error",		/* APIC Error Bit 0 */
+		"Receive CS error",		/* APIC Error Bit 1 */
+		"Send accept error",		/* APIC Error Bit 2 */
+		"Receive accept error",		/* APIC Error Bit 3 */
+		"Redirectable IPI",		/* APIC Error Bit 4 */
+		"Send illegal vector",		/* APIC Error Bit 5 */
+		"Received illegal vector",	/* APIC Error Bit 6 */
+		"Illegal register address",	/* APIC Error Bit 7 */
+	};
+
+	irq_enter();
+	exit_idle();
+	trace_error_apic_entry(ERROR_APIC_VECTOR);
+	/* First tickle the hardware, only then report what went on. -- REW */
+	v0 = apic_read(APIC_ESR);
+	apic_write(APIC_ESR, 0);
+	v1 = apic_read(APIC_ESR);
+	ack_APIC_irq();
+	atomic_inc(&irq_err_count);
+
+	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
+		    smp_processor_id(), v0 , v1);
+
+	v1 = v1 & 0xff;
+	while (v1) {
+		if (v1 & 0x1)
+			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+		i++;
+		v1 >>= 1;
+	}
+
+	apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
+	trace_error_apic_exit(ERROR_APIC_VECTOR);
+	irq_exit();
+}
+
 /**
  * connect_bsp_APIC - attach the APIC to the interrupt system
  */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 47a1870..e7aa7fc 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -29,6 +29,7 @@
 #include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
 
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
@@ -389,6 +390,19 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
 	ack_APIC_irq();
 }
 
+asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	exit_idle();
+	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+	inc_irq_stat(irq_thermal_count);
+	smp_thermal_vector();
+	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
+}
+
 /* Thermal monitoring depends on APIC, ACPI and clock modulation */
 static int intel_thermal_supported(struct cpuinfo_x86 *c)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index aa578ca..0cbef99 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -8,6 +8,7 @@
 #include <asm/apic.h>
 #include <asm/idle.h>
 #include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
 
 static void default_threshold_interrupt(void)
 {
@@ -27,3 +28,16 @@ asmlinkage void smp_threshold_interrupt(void)
 	/* Ack only at the end to avoid potential reentry */
 	ack_APIC_irq();
 }
+
+asmlinkage void smp_trace_threshold_interrupt(void)
+{
+	irq_enter();
+	exit_idle();
+	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
+	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index ff84d54..bb52af6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -846,7 +846,17 @@ ENTRY(name)				\
 	CFI_ENDPROC;			\
 ENDPROC(name)
 
-#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
+#ifdef CONFIG_TRACING
+#define TRACE_BUILD_INTERRUPT(name, nr)		\
+	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+#else
+#define TRACE_BUILD_INTERRUPT(name, nr)
+#endif
+
+#define BUILD_INTERRUPT(name, nr) \
+	BUILD_INTERRUPT3(name, nr, smp_##name); \
+	TRACE_BUILD_INTERRUPT(name, nr)
 
 /* The include is where all of the SMP etc. interrupts come from */
 #include <asm/entry_arch.h>
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 07a7a04..34279ad 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1146,7 +1146,7 @@ END(common_interrupt)
 /*
  * APIC interrupts.
  */
-.macro apicinterrupt num sym do_sym
+.macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
 	INTR_FRAME
 	ASM_CLAC
@@ -1158,15 +1158,32 @@ ENTRY(\sym)
 END(\sym)
 .endm
 
+#ifdef CONFIG_TRACING
+#define trace(sym) trace_##sym
+#define smp_trace(sym) smp_trace_##sym
+
+.macro trace_apicinterrupt num sym
+apicinterrupt3 \num trace(\sym) smp_trace(\sym)
+.endm
+#else
+.macro trace_apicinterrupt num sym do_sym
+.endm
+#endif
+
+.macro apicinterrupt num sym do_sym
+apicinterrupt3 \num \sym \do_sym
+trace_apicinterrupt \num \sym
+.endm
+
 #ifdef CONFIG_SMP
-apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
 	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt REBOOT_VECTOR \
+apicinterrupt3 REBOOT_VECTOR \
 	reboot_interrupt smp_reboot_interrupt
 #endif
 
 #ifdef CONFIG_X86_UV
-apicinterrupt UV_BAU_MESSAGE \
+apicinterrupt3 UV_BAU_MESSAGE \
 	uv_bau_message_intr1 uv_bau_message_interrupt
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
@@ -1454,7 +1471,7 @@ ENTRY(xen_failsafe_callback)
 	CFI_ENDPROC
 END(xen_failsafe_callback)
 
-apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+apicinterrupt3 XEN_HVM_EVTCHN_CALLBACK \
 	xen_hvm_callback_vector xen_evtchn_do_upcall
 
 #endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c..054213b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -471,6 +471,12 @@ ENTRY(idt_table)
 ENTRY(nmi_idt_table)
 	.skip IDT_ENTRIES * 16
 
+#ifdef CONFIG_TRACING
+	.align L1_CACHE_BYTES
+ENTRY(trace_idt_table)
+	.skip IDT_ENTRIES * 16
+#endif
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e4595f1..216bec1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -17,6 +17,7 @@
 #include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
+#include <asm/trace/irq_vectors.h>
 
 atomic_t irq_err_count;
 
@@ -228,6 +229,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	exit_idle();
+
+	trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+	inc_irq_stat(x86_platform_ipis);
+
+	if (x86_platform_ipi_callback)
+		x86_platform_ipi_callback();
+
+	trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index ca8f703..09e6262 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -8,6 +8,7 @@
 #include <linux/irq_work.h>
 #include <linux/hardirq.h>
 #include <asm/apic.h>
+#include <asm/trace/irq_vectors.h>
 
 void smp_irq_work_interrupt(struct pt_regs *regs)
 {
@@ -18,6 +19,17 @@ void smp_irq_work_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	trace_irq_work_entry(IRQ_WORK_VECTOR);
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_work_run();
+	trace_irq_work_exit(IRQ_WORK_VECTOR);
+	irq_exit();
+}
+
 void arch_irq_work_raise(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 48d2b7d..aad58af 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -30,6 +30,7 @@
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/trace/irq_vectors.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -259,6 +260,18 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 	 */
 }
 
+void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	trace_reschedule_entry(RESCHEDULE_VECTOR);
+	inc_irq_stat(irq_resched_count);
+	scheduler_ipi();
+	trace_reschedule_exit(RESCHEDULE_VECTOR);
+	/*
+	 * KVM uses this interrupt to force a cpu out of guest mode
+	 */
+}
+
 void smp_call_function_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
@@ -268,6 +281,17 @@ void smp_call_function_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_trace_call_function_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	irq_enter();
+	trace_call_function_entry(CALL_FUNCTION_VECTOR);
+	generic_smp_call_function_interrupt();
+	inc_irq_stat(irq_call_count);
+	trace_call_function_exit(CALL_FUNCTION_VECTOR);
+	irq_exit();
+}
+
 void smp_call_function_single_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
@@ -277,6 +301,17 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	irq_enter();
+	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
+	generic_smp_call_function_single_interrupt();
+	inc_irq_stat(irq_call_count);
+	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
+	irq_exit();
+}
+
 static int __init nonmi_ipi_setup(char *str)
 {
 	smp_no_nmi_ipi = true;
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
new file mode 100644
index 0000000..bd5642a
--- /dev/null
+++ b/arch/x86/kernel/tracepoint.c
@@ -0,0 +1,61 @@
+/*
+ * Code for supporting irq vector tracepoints.
+ *
+ * Copyright (C) 2013 Seiji Aguchi <sei...@hd...>
+ *
+ */
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+static struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) trace_idt_table };
+
+#ifndef CONFIG_X86_64
+gate_desc trace_idt_table[NR_VECTORS] __page_aligned_data
+					= { { { { 0, 0 } } }, };
+#endif
+
+static struct desc_ptr orig_idt_descr[NR_CPUS];
+static int trace_irq_vector_refcount;
+
+static void switch_to_trace_idt(void *arg)
+{
+	store_idt(&orig_idt_descr[smp_processor_id()]);
+	load_idt(&trace_idt_descr);
+
+	return;
+}
+
+static void restore_original_idt(void *arg)
+{
+	if (orig_idt_descr[smp_processor_id()].address) {
+		load_idt(&orig_idt_descr[smp_processor_id()]);
+		memset(&orig_idt_descr[smp_processor_id()], 0,
+		       sizeof(struct desc_ptr));
+	}
+
+	return;
+}
+
+void trace_irq_vector_regfunc(void)
+{
+	if (!trace_irq_vector_refcount) {
+		smp_call_function(switch_to_trace_idt, NULL, 0);
+		local_irq_disable();
+		switch_to_trace_idt(NULL);
+		local_irq_enable();
+	}
+	trace_irq_vector_refcount++;
+}
+
+void trace_irq_vector_unregfunc(void)
+{
+	trace_irq_vector_refcount--;
+	if (!trace_irq_vector_refcount) {
+		smp_call_function(restore_original_idt, NULL, 0);
+		local_irq_disable();
+		restore_original_idt(NULL);
+		local_irq_enable();
+	}
+}
+
diff --git a/include/xen/events.h b/include/xen/events.h
index c6bfe01..9216d07 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -76,6 +76,9 @@ unsigned irq_from_evtchn(unsigned int evtchn);
 
 /* Xen HVM evtchn vector callback */
 void xen_hvm_callback_vector(void);
+#ifdef CONFIG_TRACING
+#define trace_xen_hvm_callback_vector xen_hvm_callback_vector
+#endif
 extern int xen_have_vector_callback;
 int xen_set_callback_via(uint64_t via);
 void xen_evtchn_do_upcall(struct pt_regs *regs);
-- 
1.7.1

[Dle-develop] [RFC][PATCH v8 1/3] tracing: Add DEFINE_EVENT_FN() macro

From: Seiji A. <sei...@hd...> - 2013-01-21 22:13:55

Each TRACE_EVENT() adds several helper functions. If two or more trace events
share the same structure and print format, they can also share most of these
helper functions and save a lot of space from duplicate code. This is why the
DECLARE_EVENT_CLASS() and DEFINE_EVENT() were created.

Some events require a trigger to be called at registering and unregistering of
the event and to do so they use TRACE_EVENT_FN().

If multiple events require a trigger, they currently have no choice but to use
TRACE_EVENT_FN() as there's no DEFINE_EVENT_FN() available. This unfortunately
causes a lot of wasted duplicate code to be created.

By adding a DEFINE_EVENT_FN(), these events can still use a
DECLARE_EVENT_CLASS() and then define their own triggers.

Signed-off-by: Seiji Aguchi <sei...@hd...>
Signed-off-by: Steven Rostedt <ro...@go...>
---
 include/linux/tracepoint.h   |    2 ++
 include/trace/define_trace.h |    5 +++++
 include/trace/ftrace.h       |    4 ++++
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2f322c3..9bf59e5 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -378,6 +378,8 @@ static inline void tracepoint_synchronize_unregister(void)
 #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
 #define DEFINE_EVENT(template, name, proto, args)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_CONDITION(template, name, proto,		\
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 1905ca8..02e1003 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -44,6 +44,10 @@
 #define DEFINE_EVENT(template, name, proto, args) \
 	DEFINE_TRACE(name)
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg) \
+	DEFINE_TRACE_FN(name, reg, unreg)
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_TRACE(name)
@@ -91,6 +95,7 @@
 #undef TRACE_EVENT_CONDITION
 #undef DECLARE_EVENT_CLASS
 #undef DEFINE_EVENT
+#undef DEFINE_EVENT_FN
 #undef DEFINE_EVENT_PRINT
 #undef DEFINE_EVENT_CONDITION
 #undef TRACE_HEADER_MULTI_READ
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 40dc5e8..7bab676 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -71,6 +71,10 @@
 	static struct ftrace_event_call	__used		\
 	__attribute__((__aligned__(4))) event_##name
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
-- 
1.7.1

[Dle-develop] [RFC][PATCH v8 0/3]trace, x86: irq vector tracepoint support

From: Seiji A. <sei...@hd...> - 2013-01-21 22:13:34

Change log 

 v7 -> v8
 - Rebase to 3.8-rc4
 - Add a patch 1/3 introducing DEFINE_EVENT_FN() macro.
 - Rename original patches 1/2 and 2/2 to 2/3 and 3/3.
 - Change a definition of tracepoint to use DEFINE_EVENT_FN(). (patch 2/3)
 - Change alloc_intr_gate() to use do{}while(0) to avoid a warning 
   of checkpatch.pl. (patch 2/3)
 - Move entering_irq()/exiting_irq() to arch/x86/include/asm/apic.h (patch 3/3)

 v6 -> v7
 - Divide into two patches to make a code review easier.
   Summary of each patch is as follows.
    - Patch 1/2
      - Add an irq_vector tracing infrastructure.
      - Create idt_table for tracing. It is refactored to avoid duplicating
        existing logic.
      - Duplicate the irq handlers as new handlers with tracepoints inserted.

    - Patch 2/2
      - Share a common logic among irq handlers to make them
        manageable and readable.

 v5 -> v6
 - Rebased to 3.7

 v4 -> v5
 - Rebased to 3.6.0

 - Introduce logic that switches the IDT when tracepoints are enabled/disabled,
   so that there is zero time penalty while tracepoints are disabled.
   This IDT is created only when CONFIG_TRACEPOINTS is enabled.

 - Remove arch_irq_vector_entry/exit and add the following events again
   so that we can add each tracepoint in a generic way.
   - error_apic_vector
   - thermal_apic_vector
   - threshold_apic_vector
   - spurious_apic_vector
   - x86_platform_ipi_vector

 - Drop nmi tracepoints so that we can begin with apic interrupts and discuss the
   IDT-switching logic first.

 - Move irq_vectors.h to the arch/x86/include/asm/trace directory because
   I'm not sure if the IDT-switching logic is sharable with other architectures.

 v3 -> v4
 - Add a latency measurement of each tracepoint
 - Rebased to 3.6-rc6

 v2 -> v3
 - Remove the invalidate_tlb_vector event because it was replaced by the call function vector
   in the following commit.
   http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4

 v1 -> v2
 - Modify variable name from irq to vector.
 - Merge arch-specific tracepoints below to an arch_irq_vector_entry/exit.
   - error_apic_vector
   - thermal_apic_vector
   - threshold_apic_vector
   - spurious_apic_vector
   - x86_platform_ipi_vector

[Purpose of this patch]

As Vaibhav explained in the thread below, tracepoints for irq vectors
are useful.

http://www.spinics.net/lists/mm-commits/msg85707.html

<snip>
The current interrupt traces from irq_handler_entry and irq_handler_exit
provide when an interrupt is handled.  They provide good data about when
the system has switched to kernel space and how it affects the currently
running processes.

There are some IRQ vectors which trigger the system into kernel space,
which are not handled in generic IRQ handlers.  Tracing such events gives
us the information about IRQ interaction with other system events.

The trace also tells where the system is spending its time.  We want to
know which cores are handling interrupts and how they are affecting other
processes in the system.  Also, the trace provides information about when
the cores are idle and which interrupts are changing that state.
<snip>

On the other hand, my use case is tracing just the local timer event and
getting the value of the instruction pointer.

I previously suggested adding an argument to the local timer event to get the instruction pointer.
But there is another way to get it, with an external module like systemtap.
So, I don't need to add any argument to irq vector tracepoints now.

[Patch Description]

Vaibhav's patch shared one pair of tracepoints, irq_vector_entry/irq_vector_exit, among all events.
But there is the above use case of tracing a specific irq vector rather than tracing all events.
In this case, we are concerned about overhead due to unwanted events.

This patch adds the following tracepoints instead of introducing irq_vector_entry/exit,
so that we can enable them independently.
   - local_timer_vector
   - reschedule_vector
   - call_function_vector
   - call_function_single_vector 
   - irq_work_entry_vector
   - error_apic_vector
   - thermal_apic_vector
   - threshold_apic_vector
   - spurious_apic_vector
   - x86_platform_ipi_vector

Please see descriptions in each patch.

Seiji Aguchi (3):
  tracing: Add DEFINE_EVENT_FN() macro
  trace,x86: add x86 irq vector tracepoints
  trace,x86: code-sharing between non-trace and trace irq handlers

 arch/x86/include/asm/apic.h              |   27 +++++
 arch/x86/include/asm/desc.h              |   33 ++++++-
 arch/x86/include/asm/entry_arch.h        |    5 +-
 arch/x86/include/asm/hw_irq.h            |   16 +++
 arch/x86/include/asm/trace/irq_vectors.h |  159 ++++++++++++++++++++++++++++++
 arch/x86/kernel/Makefile                 |    1 +
 arch/x86/kernel/apic/apic.c              |   71 +++++++++++---
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   24 ++++-
 arch/x86/kernel/cpu/mcheck/threshold.c   |   24 ++++-
 arch/x86/kernel/entry_32.S               |   12 ++-
 arch/x86/kernel/entry_64.S               |   27 ++++-
 arch/x86/kernel/head_64.S                |    6 +
 arch/x86/kernel/irq.c                    |   31 ++++--
 arch/x86/kernel/irq_work.c               |   24 ++++-
 arch/x86/kernel/smp.c                    |   65 +++++++++++--
 arch/x86/kernel/tracepoint.c             |   61 ++++++++++++
 include/linux/tracepoint.h               |    2 +
 include/trace/define_trace.h             |    5 +
 include/trace/ftrace.h                   |    4 +
 include/xen/events.h                     |    3 +
 20 files changed, 546 insertions(+), 54 deletions(-)
 create mode 100644 arch/x86/include/asm/trace/irq_vectors.h
 create mode 100644 arch/x86/kernel/tracepoint.c

Re: [Dle-develop] [Qemu-devel] [PATCH] Add option to mlock guest and qemu memory

From: Marcelo T. <mto...@re...> - 2013-01-21 21:49:56

On Fri, Sep 28, 2012 at 10:05:09AM +0200, Jan Kiszka wrote:
> On 2012-09-28 01:21, Satoru Moriya wrote:
> > This is a first time for me to post a patch to qemu-devel.
> > If there is something missing/wrong, please let me know.
> > 
> > We have some plans to migrate old enterprise systems which require
> > low latency (msec order) to kvm virtualized environment. Usually,
> > we uses mlock to preallocate and pin down process memory in order
> > to avoid page allocation in latency critical path. On the other
> > hand, in kvm environment, mlocking in guests is not effective
> > because it can't avoid page reclaim in host. Actually, to avoid
> > guest memory reclaim, qemu has "mem-path" option that is actually
> > for using hugepage. But a memory region of qemu is not allocated
> > on hugepage, so it may be reclaimed. That may cause a latency
> > problem.
> > 
> > To avoid guest and qemu memory reclaim, this patch introduces
> > a new "mlock" option. With this option, we can preallocate and
> > pin down guest and qemu memory before booting guest OS.
> 
> I guess this reduces the likeliness of multi-millisecond latencies for
> you but not eliminate them. Of course, mlockall is part of our local
> changes for real-time QEMU/KVM, but it is just one of the many pieces
> required. I'm wondering how the situation is on your side.
> 
> I think mlockall should once be enabled automatically as soon as you ask
> for real-time support for QEMU guests. How that should be controlled is
> another question. I'm currently carrying a top-level switch "-rt
> maxprio=x[,policy=y]" here, likely not the final solution. I'm not
> really convinced we need to control memory locking separately. And as we
> are very reluctant to add new top-level switches, this is even more
> important.

In certain scenarios, latency induced by paging is significant
and memory locking is sufficient. 

Moreover, in scenarios with untrusted guests for which the latency improvement
due to mlock is desired, realtime priority is problematic (guests whose
QEMU threads have realtime priority can abuse the host system).

Re: [Dle-develop] [RFC][PATCH v7 1/2] trace, x86: add x86 irq vector tracepoints

From: Seiji A. <sei...@hd...> - 2013-01-16 06:22:12

Steven,

Thank you for reviewing my patchset.
I will update both patch 1/2 and 2/2 in accordance with your comment.

Seiji

> -----Original Message-----
> From: Steven Rostedt [mailto:ro...@go...]
> Sent: Tuesday, January 15, 2013 8:25 PM
> To: Seiji Aguchi
> Cc: x8...@ke...; lin...@vg...; H. Peter Anvin (hp...@zy...); Thomas Gleixner (tg...@li...);
> 'mi...@el...' (mi...@el...); Borislav Petkov (bp...@al...); Satoru Moriya; dle...@li...; linux-
> ed...@vg...; Luck, Tony (ton...@in...)
> Subject: Re: [RFC][PATCH v7 1/2] trace,x86: add x86 irq vector tracepoints
> 
> On Thu, 2013-01-10 at 17:33 +0000, Seiji Aguchi wrote:
> 
> > diff --git a/arch/x86/include/asm/trace/irq_vectors.h
> > b/arch/x86/include/asm/trace/irq_vectors.h
> > new file mode 100644
> > index 0000000..9bcb27b
> > --- /dev/null
> > +++ b/arch/x86/include/asm/trace/irq_vectors.h
> > @@ -0,0 +1,154 @@
> > +#undef TRACE_SYSTEM
> > +#define TRACE_SYSTEM irq_vectors
> > +
> > +#if !defined(_TRACE_IRQ_VECTORS_H) ||
> > +defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_VECTORS_H
> > +
> > +#include <linux/tracepoint.h>
> > +
> > +extern void trace_irq_vector_regfunc(void); extern void
> > +trace_irq_vector_unregfunc(void);
> > +
> > +#define DECLARE_IRQ_VECTOR_EVENT(name)				\
> > +TRACE_EVENT_FN(name,						\
> > +	TP_PROTO(int vector),					\
> > +								\
> > +	TP_ARGS(vector),					\
> > +								\
> > +	TP_STRUCT__entry(					\
> > +		__field(	int,	vector	)		\
> > +	),							\
> > +								\
> > +	TP_fast_assign(						\
> > +		__entry->vector = vector;			\
> > +	),							\
> > +								\
> > +	TP_printk("vector=%d", __entry->vector),		\
> > +	trace_irq_vector_regfunc, trace_irq_vector_unregfunc	\
> > +);
> > +
> > +/*
> > + * local_timer_entry - called before entering a local timer interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(local_timer_entry)
> 
> This is a great big waste of space. Add the below patch to your series and then just do:
> 
> DECLARE_EVENT_CLASS(x86_irq_vector,
> 
> 	TP_PROTO(int vector),
> 
> 	TP_ARGS(vector),
> 
> 	TP_STRUCT__entry(
> 		__field(		int,	vector	)
> 	),
> 
> 	TP_fast_assign(
> 		__entry->vector = vector;
> 	),
> 
> 	TP_printk("vector=%d", __entry->vector) );
> 
> #define DEFINE_IRQ_VECTOR_EVENT(name) \
> DEFINE_EVENT_FN(name, \
> 	TP_PROTO(int vector), \
> 	TP_ARGS(vector), \
> 	trace_irq_vector_regfunc, \
> 	trace_irq_vector_unregfunc);
> 
> And keep the rest the same.
> 
> > +
> > +/*
> > + * local_timer_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(local_timer_exit)
> > +
> > +/*
> > + * reschedule_entry - called before entering a reschedule vector
> > +handler  */
> > +DECLARE_IRQ_VECTOR_EVENT(reschedule_entry)
> > +
> > +/*
> > + * reschedule_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(reschedule_exit)
> > +
> > +/*
> > + * spurious_apic_entry - called before entering a spurious apic
> > +vector handler  */
> > +DECLARE_IRQ_VECTOR_EVENT(spurious_apic_entry)
> > +
> > +/*
> > + * spurious_apic_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(spurious_apic_exit)
> > +
> > +/*
> > + * error_apic_entry - called before entering an error apic vector
> > +handler  */
> > +DECLARE_IRQ_VECTOR_EVENT(error_apic_entry)
> > +
> > +/*
> > + * error_apic_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(error_apic_exit)
> > +
> > +/*
> > + * x86_platform_ipi_entry - called before entering a x86 platform ipi
> > +interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(x86_platform_ipi_entry)
> > +
> > +/*
> > + * x86_platform_ipi_exit - called immediately after the interrupt
> > +vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(x86_platform_ipi_exit)
> > +
> > +/*
> > + * irq_work_entry - called before entering a irq work interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(irq_work_entry)
> > +
> > +/*
> > + * irq_work_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(irq_work_exit)
> > +
> > +/*
> > + * call_function_entry - called before entering a call function
> > +interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(call_function_entry)
> > +
> > +/*
> > + * call_function_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(call_function_exit)
> > +
> > +/*
> > + * call_function_single_entry - called before entering a call
> > +function
> > + * single interrupt vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(call_function_single_entry)
> > +
> > +/*
> > + * call_function_single_exit - called immediately after the interrupt
> > +vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(call_function_single_exit)
> > +
> > +/*
> > + * threshold_apic_entry - called before entering a threshold apic
> > +interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(threshold_apic_entry)
> > +
> > +/*
> > + * threshold_apic_exit - called immediately after the interrupt
> > +vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(threshold_apic_exit)
> > +
> > +/*
> > + * thermal_apic_entry - called before entering a thermal apic
> > +interrupt
> > + * vector handler
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(thermal_apic_entry)
> > +
> > +/*
> > + * thermal_apic_exit - called immediately after the interrupt vector
> > + * handler returns
> > + */
> > +DECLARE_IRQ_VECTOR_EVENT(thermal_apic_exit)
> > +
> > +#undef TRACE_INCLUDE_PATH
> > +#define TRACE_INCLUDE_PATH ../../arch/x86/include/asm/trace #define
> > +TRACE_INCLUDE_FILE irq_vectors #endif /*  _TRACE_IRQ_VECTORS_H */
> > +
> > +/* This part must be outside protection */ #include
> > +<trace/define_trace.h>
> > +
> 
> ---
> tracing: Add DEFINE_EVENT_FN() macro
> 
> Each TRACE_EVENT() adds several helper functions. If two or more trace events share the same structure and print format, they can
> also share most of these helper functions and save a lot of space from duplicate code. This is why the DECLARE_EVENT_CLASS() and
> DEFINE_EVENT() were created.
> 
> Some events require a trigger to be called at registering and unregistering of the event and to do so they use TRACE_EVENT_FN().
> 
> If multiple events require a trigger, they currently have no choice but to use TRACE_EVENT_FN() as there's no DEFINE_EVENT_FN()
> available. This unfortunately causes a lot of wasted duplicate code created.
> 
> By adding a DEFINE_EVENT_FN(), these events can still use a
> DECLARE_EVENT_CLASS() and then define their own triggers.
> 
> Signed-off-by: Steven Rostedt <ro...@go...>
> 
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2f322c3..9bf59e5 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -378,6 +378,8 @@ static inline void tracepoint_synchronize_unregister(void)
>  #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
>  #define DEFINE_EVENT(template, name, proto, args)		\
>  	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
> +#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\
> +	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
>  #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
>  	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
>  #define DEFINE_EVENT_CONDITION(template, name, proto,		\
> diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 1905ca8..02e1003 100644
> --- a/include/trace/define_trace.h
> +++ b/include/trace/define_trace.h
> @@ -44,6 +44,10 @@
>  #define DEFINE_EVENT(template, name, proto, args) \
>  	DEFINE_TRACE(name)
> 
> +#undef DEFINE_EVENT_FN
> +#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg) \
> +	DEFINE_TRACE_FN(name, reg, unreg)
> +
>  #undef DEFINE_EVENT_PRINT
>  #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
>  	DEFINE_TRACE(name)
> @@ -91,6 +95,7 @@
>  #undef TRACE_EVENT_CONDITION
>  #undef DECLARE_EVENT_CLASS
>  #undef DEFINE_EVENT
> +#undef DEFINE_EVENT_FN
>  #undef DEFINE_EVENT_PRINT
>  #undef DEFINE_EVENT_CONDITION
>  #undef TRACE_HEADER_MULTI_READ
> diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 40dc5e8..7bab676 100644
> --- a/include/trace/ftrace.h
> +++ b/include/trace/ftrace.h
> @@ -71,6 +71,10 @@
>  	static struct ftrace_event_call	__used		\
>  	__attribute__((__aligned__(4))) event_##name
> 
> +#undef DEFINE_EVENT_FN
> +#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)	\
> +	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
> +
>  #undef DEFINE_EVENT_PRINT
>  #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
>  	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
>

Re: [Dle-develop] [RFC][PATCH v7 2/2]trace, x86: code-sharing between non-trace and trace irq handlers

From: Steven R. <ro...@go...> - 2013-01-16 01:36:45

On Thu, 2013-01-10 at 17:33 +0000, Seiji Aguchi wrote:
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index 624ef3f..580aa93 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -6,6 +6,7 @@
>  #include <linux/ftrace_irq.h>
>  #include <linux/vtime.h>
>  #include <asm/hardirq.h>
> +#include <asm/idle.h>
>  
>  /*
>   * We put the hardirq and softirq counter into the preemption
> @@ -198,4 +199,28 @@ extern void irq_exit(void);
>  		ftrace_nmi_exit();				\
>  	} while (0)
>  
> +static inline void entering_irq(void)
> +{
> +	irq_enter();
> +	exit_idle();
> +}
> +
> +static inline void entering_ack_irq(void)
> +{
> +	ack_APIC_irq();
> +	entering_irq();
> +}
> +
> +static inline void exiting_irq(void)
> +{
> +	irq_exit();
> +}
> +
> +static inline void exiting_ack_irq(void)
> +{
> +        irq_exit();
> +	/* Ack only at the end to avoid potential reentry */
> +        ack_APIC_irq();
> +}

Shouldn't these be in a x86 specific header? Not a generic header like
hardirq.h. I don't think other archs will be doing an ack_APIC_irq(), as
it's only defined in x86. Why not add these to
arch/x86/include/asm/apic.h ?

-- Steve

> +
>  #endif /* LINUX_HARDIRQ_H */
> -- 1.7.1

Re: [Dle-develop] [RFC][PATCH v7 1/2] trace, x86: add x86 irq vector tracepoints

From: Steven R. <ro...@go...> - 2013-01-16 01:25:37

On Thu, 2013-01-10 at 17:33 +0000, Seiji Aguchi wrote:

> diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
> new file mode 100644
> index 0000000..9bcb27b
> --- /dev/null
> +++ b/arch/x86/include/asm/trace/irq_vectors.h
> @@ -0,0 +1,154 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM irq_vectors
> +
> +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_IRQ_VECTORS_H
> +
> +#include <linux/tracepoint.h>
> +
> +extern void trace_irq_vector_regfunc(void);
> +extern void trace_irq_vector_unregfunc(void);
> +
> +#define DECLARE_IRQ_VECTOR_EVENT(name)				\
> +TRACE_EVENT_FN(name,						\
> +	TP_PROTO(int vector),					\
> +								\
> +	TP_ARGS(vector),					\
> +								\
> +	TP_STRUCT__entry(					\
> +		__field(	int,	vector	)		\
> +	),							\
> +								\
> +	TP_fast_assign(						\
> +		__entry->vector = vector;			\
> +	),							\
> +								\
> +	TP_printk("vector=%d", __entry->vector),		\
> +	trace_irq_vector_regfunc, trace_irq_vector_unregfunc	\
> +);
> +
> +/*
> + * local_timer_entry - called before entering a local timer interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(local_timer_entry)

This is a great big waste of space. Add the below patch to your series
and then just do:

DECLARE_EVENT_CLASS(x86_irq_vector,

	TP_PROTO(int vector),

	TP_ARGS(vector),

	TP_STRUCT__entry(
		__field(		int,	vector	)
	),

	TP_fast_assign(
		__entry->vector = vector;
	),

	TP_printk("vector=%d", __entry->vector)
);

/*
 * Define one irq vector event on top of the shared x86_irq_vector class,
 * passing trace_irq_vector_regfunc/trace_irq_vector_unregfunc as the
 * reg/unreg arguments of DEFINE_EVENT_FN().
 */
#define DEFINE_IRQ_VECTOR_EVENT(name) \
DEFINE_EVENT_FN(x86_irq_vector, name, \
	TP_PROTO(int vector), \
	TP_ARGS(vector), \
	trace_irq_vector_regfunc, \
	trace_irq_vector_unregfunc);

And keep the rest the same.

> +
> +/*
> + * local_timer_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(local_timer_exit)
> +
> +/*
> + * reschedule_entry - called before entering a reschedule vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(reschedule_entry)
> +
> +/*
> + * reschedule_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(reschedule_exit)
> +
> +/*
> + * spurious_apic_entry - called before entering a spurious apic vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(spurious_apic_entry)
> +
> +/*
> + * spurious_apic_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(spurious_apic_exit)
> +
> +/*
> + * error_apic_entry - called before entering an error apic vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(error_apic_entry)
> +
> +/*
> + * error_apic_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(error_apic_exit)
> +
> +/*
> + * x86_platform_ipi_entry - called before entering a x86 platform ipi interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(x86_platform_ipi_entry)
> +
> +/*
> + * x86_platform_ipi_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(x86_platform_ipi_exit)
> +
> +/*
> + * irq_work_entry - called before entering a irq work interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(irq_work_entry)
> +
> +/*
> + * irq_work_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(irq_work_exit)
> +
> +/*
> + * call_function_entry - called before entering a call function interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(call_function_entry)
> +
> +/*
> + * call_function_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(call_function_exit)
> +
> +/*
> + * call_function_single_entry - called before entering a call function
> + * single interrupt vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(call_function_single_entry)
> +
> +/*
> + * call_function_single_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(call_function_single_exit)
> +
> +/*
> + * threshold_apic_entry - called before entering a threshold apic interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(threshold_apic_entry)
> +
> +/*
> + * threshold_apic_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(threshold_apic_exit)
> +
> +/*
> + * thermal_apic_entry - called before entering a thermal apic interrupt
> + * vector handler
> + */
> +DECLARE_IRQ_VECTOR_EVENT(thermal_apic_entry)
> +
> +/*
> + * thermal_apic_exit - called immediately after the interrupt vector
> + * handler returns
> + */
> +DECLARE_IRQ_VECTOR_EVENT(thermal_apic_exit)
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH ../../arch/x86/include/asm/trace
> +#define TRACE_INCLUDE_FILE irq_vectors
> +#endif /*  _TRACE_IRQ_VECTORS_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> +

---
tracing: Add DEFINE_EVENT_FN() macro

Each TRACE_EVENT() adds several helper functions. If two or more trace
events share the same structure and print format, they can also share
most of these helper functions and save a lot of space from duplicate
code. This is why the DECLARE_EVENT_CLASS() and DEFINE_EVENT() were
created.

Some events require a trigger to be called at registering and
unregistering of the event and to do so they use TRACE_EVENT_FN().

If multiple events require a trigger, they currently have no choice but
to use TRACE_EVENT_FN() as there's no DEFINE_EVENT_FN() available. This
unfortunately causes a lot of wasted duplicate code to be created.

By adding a DEFINE_EVENT_FN(), these events can still use a
DECLARE_EVENT_CLASS() and then define their own triggers.

Signed-off-by: Steven Rostedt <ro...@go...>

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2f322c3..9bf59e5 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -378,6 +378,8 @@ static inline void tracepoint_synchronize_unregister(void)
 #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
 #define DEFINE_EVENT(template, name, proto, args)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_CONDITION(template, name, proto,		\
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 1905ca8..02e1003 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -44,6 +44,10 @@
 #define DEFINE_EVENT(template, name, proto, args) \
 	DEFINE_TRACE(name)
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg) \
+	DEFINE_TRACE_FN(name, reg, unreg)
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_TRACE(name)
@@ -91,6 +95,7 @@
 #undef TRACE_EVENT_CONDITION
 #undef DECLARE_EVENT_CLASS
 #undef DEFINE_EVENT
+#undef DEFINE_EVENT_FN
 #undef DEFINE_EVENT_PRINT
 #undef DEFINE_EVENT_CONDITION
 #undef TRACE_HEADER_MULTI_READ
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 40dc5e8..7bab676 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -71,6 +71,10 @@
 	static struct ftrace_event_call	__used		\
 	__attribute__((__aligned__(4))) event_##name
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))

Re: [Dle-develop] [PATCH v4 0/2] pstore, efi_pstore: Avoid deadlock in non-blocking paths

From: Seiji A. <sei...@hd...> - 2013-01-12 04:40:13

Anton,

V3 patch caused a build failure in Linux-next tree.
So, I updated to v4 patch.
Tony already applied the latest one.

You don't need to do anything.
And I'm sorry for the confusion.

Seiji

<snip>
-----Original Message-----
From: Stephen Rothwell [mailto:sf...@ca...] 
Sent: Thursday, January 10, 2013 6:55 PM
To: Luck, Tony
Cc: lin...@vg...; lin...@vg...; Seiji Aguchi
Subject: linux-next: build failure after merge of the ia64 tree

Hi all,

After merging the ia64 tree, today's linux-next build (x86_64_allmodconfig) failed like this:

ERROR: "pstore_cannot_block_path" [drivers/firmware/efivars.ko] undefined!

Caused by commit 1a1df768ea4e ("efi_pstore: Avoid deadlock in non-blocking paths").

I have used the ia64 tree from next-20130110 for today.
--
Cheers,
Stephen Rothwell                    sf...@ca...
<snip>

> -----Original Message-----
> From: Anton Vorontsov [mailto:an...@sc...] On Behalf Of Anton Vorontsov
> Sent: Friday, January 11, 2013 10:38 PM
> To: Seiji Aguchi
> Cc: lin...@vg...; Luck, Tony (ton...@in...); dz...@re...; dle...@li...; Satoru
> Moriya
> Subject: Re: [PATCH v4 0/2] pstore,efi_pstore: Avoid deadlock in non-blocking paths
> 
> On Fri, Jan 11, 2013 at 06:08:52PM +0000, Seiji Aguchi wrote:
> > Changelog
> > v3 -> v4
> 
> I am confused. Tony already applied v3, no? Do you expect me to do anything with these patches, or is it just an update for Tony?
> 
> Thanks,
> Anton

Re: [Dle-develop] [PATCH v4 0/2] pstore, efi_pstore: Avoid deadlock in non-blocking paths

From: Anton V. <an...@en...> - 2013-01-12 04:06:15

On Fri, Jan 11, 2013 at 06:08:52PM +0000, Seiji Aguchi wrote:
> Changelog 
> v3 -> v4

I am confused. Tony already applied v3, no? Do you expect me to do
anything with these patches, or is it just an update for Tony?

Thanks,
Anton

Re: [Dle-develop] [RFC][PATCH v2] add realtime option

From: Satoru M. <sat...@hd...> - 2013-01-11 21:44:52

Could anybody review it?

On 12/21/2012 05:46 PM, Satoru Moriya wrote:
> Changelog
> v1 -> v2
>  - add RFC tag again
>  - change semantics as follows
>    - set event threads' priority to maxprio
>    - set vcpu threads' priority to maxprio - 1
>  - isolate all the posix stuff and put them into os_prioritize() in
>    os-posix.c/qemu-os-win32.h to avoid breaking win32 build
>  - introduce qemu_init_realtime(), qemu_realtime_is_enable and
>    qemu_realtime_get_parameters() and struct QemuRealtimeInfo to
>    keep realtime option and remove related global variables in vl.c
>  - add other benchmark(qpid-latency-test) result
> 
> We have some plans to migrate old enterprise/control systems which 
> require low latency (msec order) to kvm virtualized environment. In 
> order to satisfy the requirements, this patch adds realtime option to qemu:
> 
> -realtime maxprio=<prio>,policy=<pol>
> 
> This option changes the scheduling policy and priority to a realtime one 
> (event threads: maxprio, vcpu threads: maxprio - 1) and mlock all qemu 
> and guest memory.
> 
> Of course, we need more improvements to keep latency low in qemu 
> virtualized environment and this is a first step. OTOH, we can meet 
> the requirement of our first migration project with this patch.
> 
> [ Note ]
> This version doesn't support vhost, vpnc and linux-aio.
> 
> These are some basic performance results:
> 
> Host : 4 core, 4GB
> Guest: 1 core, 512MB
> 
> Benchmark: qpid-latency-test
> http://qpid.apache.org/
> https://access.redhat.com/knowledge/docs/en-US/Red_Hat_Enterprise_MRG/2/html/Messaging_Installation_and_Configuration_Guide/qpid_latency_test.html
> 
>  Command:
>  - qemu
>    $ qemu -smp 1 -m 512 -enable-kvm -netdev tap,id=hostnet1 -device
>      virtio-net-pci,netdev=hostnet1 -drive file=vm.img,if=virtio
>      (-realtime maxprio=99,policy=fifo)
> 
>  - benchmark
>    $ chrt -f 99 qpid-latency-test --tcp-nodelay --rate 10000 -b 
> <server>
> 
>  Results: worst latency (msec) from 100 run
>   - no load
>     1. normal qemu  : 17.468400
>     2. chrt qemu(*) : 10.019900
>     3. realtime qemu:  8.048370
> 
>   - load (iperf, server:vm, client:other physical server)
>     4. normal qemu  : 26.711100
>     5. chrt qemu    :  8.485140
>     6. realtime qemu: 10.176700
> 
>  (*) $ chrt -f -p 99 <event_thread_tid>
>      $ chrt -f -p 98 <vcpu_thread_tid>
> 
> Any comments are welcome.
> 
> Regards,
> Satoru
> 
> Signed-off-by: Satoru Moriya <sat...@hd...>
> ---
>  cpus.c                    | 17 +++++++++++++++++
>  include/qemu/thread.h     |  4 ++++
>  include/sysemu/os-posix.h |  1 +
>  include/sysemu/os-win32.h |  1 +
>  os-posix.c                | 48 +++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-config.c             | 16 ++++++++++++++++
>  qemu-options.hx           |  9 +++++++++
>  qemu-thread-posix.c       | 27 ++++++++++++++++++++++++++
>  qemu-thread-win32.c       | 13 +++++++++++++
>  vl.c                      | 33 ++++++++++++++++++++++++++++++++
>  10 files changed, 169 insertions(+)
> 
> diff --git a/cpus.c b/cpus.c
> index 4a7782a..a049970 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -734,6 +734,9 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>      CPUArchState *env = arg;
>      CPUState *cpu = ENV_GET_CPU(env);
>      int r;
> +    int rt_policy, rt_priority;
> +    struct sched_param sp;
> +
>  
>      qemu_mutex_lock(&qemu_global_mutex);
>      qemu_thread_get_self(cpu->thread);
> @@ -746,6 +749,20 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>          exit(1);
>      }
>  
> +    if (qemu_realtime_is_enabled()) {
> +        qemu_realtime_get_parameters(&rt_policy, &rt_priority);
> +        /*
> +         * vcpu threads' priority must be set to event thread priority -1
> +         * to avoid starvation.
> +         */
> +        sp.sched_priority = rt_priority - 1;
> +        r = sched_setscheduler(0, rt_policy, &sp);
> +        if (r < 0) {
> +            perror("Setting realtime policy failed");
> +            exit(1);
> +        }
> +    }
> +
>      qemu_kvm_init_cpu_signals(env);
>  
>      /* signal CPU creation */
> diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 
> c02404b..3d8b3d2 100644
> --- a/include/qemu/thread.h
> +++ b/include/qemu/thread.h
> @@ -53,4 +53,8 @@ void qemu_thread_get_self(QemuThread *thread);  bool 
> qemu_thread_is_self(QemuThread *thread);  void qemu_thread_exit(void 
> *retval);
>  
> +void qemu_init_realtime(int, int);
> +bool qemu_realtime_is_enabled(void);
> +void qemu_realtime_get_parameters(int *, int *);
> +
>  #endif
> diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h 
> index 7f198e4..e5995b0 100644
> --- a/include/sysemu/os-posix.h
> +++ b/include/sysemu/os-posix.h
> @@ -31,6 +31,7 @@ void os_set_proc_name(const char *s);  void 
> os_setup_signal_handling(void);  void os_daemonize(void);  void 
> os_setup_post(void);
> +void os_prioritize(const char *, int);
>  
>  typedef struct timeval qemu_timeval;
>  #define qemu_gettimeofday(tp) gettimeofday(tp, NULL) diff --git 
> a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index 
> d0e9234..946b566 100644
> --- a/include/sysemu/os-win32.h
> +++ b/include/sysemu/os-win32.h
> @@ -78,6 +78,7 @@ static inline void os_daemonize(void) {}  static 
> inline void os_setup_post(void) {}  void os_set_line_buffering(void);  
> static inline void os_set_proc_name(const char *dummy) {}
> +static inline void os_prioritize(const char *pol, int prio) {}
>  
>  #if !defined(EPROTONOSUPPORT)
>  # define EPROTONOSUPPORT EINVAL
> diff --git a/os-posix.c b/os-posix.c
> index 5c64518..8fe0fa2 100644
> --- a/os-posix.c
> +++ b/os-posix.c
> @@ -33,12 +33,14 @@
>  #include <pwd.h>
>  #include <grp.h>
>  #include <libgen.h>
> +#include <sched.h>
>  
>  /* Needed early for CONFIG_BSD etc. */  #include "config-host.h"
>  #include "sysemu/sysemu.h"
>  #include "net/slirp.h"
>  #include "qemu-options.h"
> +#include "qemu-thread.h"
>  
>  #ifdef CONFIG_LINUX
>  #include <sys/prctl.h>
> @@ -363,3 +365,49 @@ bool is_daemonized(void)  {
>      return daemonize;
>  }
> +
> +void os_prioritize(const char *rt_sched_policy, int 
> +max_sched_priority) {
> +    int rt_pol, sys_min_prio, sys_max_prio;
> +
> +    if (rt_sched_policy) {
> +        if (!strcmp(rt_sched_policy, "rr")) {
> +            rt_pol = SCHED_RR;
> +        } else if (!strcmp(rt_sched_policy, "fifo")) {
> +            rt_pol = SCHED_FIFO;
> +        } else {
> +            fprintf(stderr, "qemu: invalid option value '%s'\n",
> +                    rt_sched_policy);
> +            exit(1);
> +        }
> +    } else {
> +        rt_pol = SCHED_RR;
> +    }
> +
> +    sys_min_prio = sched_get_priority_min(rt_pol);
> +    sys_max_prio = sched_get_priority_max(rt_pol);
> +
> +    if (max_sched_priority < sys_min_prio + 1) {
> +        /*
> +         * We set event threads' priority to max_sched_priority and
> +         * vcpu threads' to max_sched_priority - 1 in order to avoid
> +         * starvation. So, it must be >= sys_min_prio + 1.
> +         */
> +        fprintf(stderr, "qemu: invalid option maxprio=%d. It must be >= %d\n",
> +                max_sched_priority, sys_min_prio + 1);
> +        exit(1);
> +    }
> +
> +    if (sys_max_prio < max_sched_priority) {
> +        fprintf(stderr, "qemu: invalid option maxprio=%d. It must be <= %d\n",
> +                max_sched_priority, sys_max_prio);
> +        exit(1);
> +    }
> +
> +    qemu_init_realtime(rt_pol, max_sched_priority);
> +
> +    if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
> +        perror("mlockall");
> +        exit(1);
> +    }
> +}
> diff --git a/qemu-config.c b/qemu-config.c index 2188c3e..b945d07 
> 100644
> --- a/qemu-config.c
> +++ b/qemu-config.c
> @@ -647,6 +647,21 @@ static QemuOptsList qemu_object_opts = {
>      },
>  };
>  
> +static QemuOptsList qemu_realtime_opts = {
> +    .name = "realtime",
> +    .head = QTAILQ_HEAD_INITIALIZER(qemu_realtime_opts.head),
> +    .desc = {
> +        {
> +            .name = "maxprio",
> +            .type = QEMU_OPT_NUMBER,
> +        }, {
> +            .name = "policy",
> +            .type = QEMU_OPT_STRING,
> +        },
> +        { /* end of list */ }
> +    },
> +};
> +
>  static QemuOptsList *vm_config_groups[32] = {
>      &qemu_drive_opts,
>      &qemu_chardev_opts,
> @@ -664,6 +679,7 @@ static QemuOptsList *vm_config_groups[32] = {
>      &qemu_sandbox_opts,
>      &qemu_add_fd_opts,
>      &qemu_object_opts,
> +    &qemu_realtime_opts,
>      NULL,
>  };
>  
> diff --git a/qemu-options.hx b/qemu-options.hx index 9df0cde..968a20a 
> 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2447,6 +2447,15 @@ STEXI
>  Do not start CPU at startup (you must type 'c' in the monitor).
>  ETEXI
>  
> +DEF("realtime", HAS_ARG, QEMU_OPTION_realtime,
> +    "-realtime maxprio=prio[,policy=pol]\n",
> +    QEMU_ARCH_ALL)
> +STEXI
> +@item -realtime maxprio=@var{prio}[,policy=@var{pol}]
> +@findex -realtime
> +run qemu as realtime process with priority @var{prio} and policy @var{pol}.
> +ETEXI
> +
>  DEF("gdb", HAS_ARG, QEMU_OPTION_gdb, \
>      "-gdb dev        wait for gdb connection on 'dev'\n", QEMU_ARCH_ALL)
>  STEXI
> diff --git a/qemu-thread-posix.c b/qemu-thread-posix.c index 
> 7be292e..10a97cc 100644
> --- a/qemu-thread-posix.c
> +++ b/qemu-thread-posix.c
> @@ -22,6 +22,15 @@
>  #include <sys/time.h>
>  #include "qemu/thread.h"
>  
> +struct QemuRealtimeInfo {
> +    bool is_realtime;
> +    int policy;
> +    int max_priority;
> +};
> +typedef struct QemuRealtimeInfo QemuRealtimeInfo;
> +
> +static QemuRealtimeInfo rt_info;
> +
>  static void error_exit(int err, const char *msg)  {
>      fprintf(stderr, "qemu: %s: %s\n", msg, strerror(err)); @@ -324,3 
> +333,21 @@ void *qemu_thread_join(QemuThread *thread)
>      }
>      return ret;
>  }
> +
> +void qemu_init_realtime(int rt_sched_policy, int max_sched_priority) 
> +{
> +    rt_info.is_realtime = true;
> +    rt_info.policy = rt_sched_policy;
> +    rt_info.max_priority = max_sched_priority; }
> +
> +bool qemu_realtime_is_enabled(void)
> +{
> +    return rt_info.is_realtime;
> +}
> +
> +void qemu_realtime_get_parameters(int *policy, int *max_priority) {
> +    *policy = rt_info.policy;
> +    *max_priority = rt_info.max_priority; }
> diff --git a/qemu-thread-win32.c b/qemu-thread-win32.c index 
> 8037b39..3beebcf 100644
> --- a/qemu-thread-win32.c
> +++ b/qemu-thread-win32.c
> @@ -369,3 +369,16 @@ bool qemu_thread_is_self(QemuThread *thread)  {
>      return GetCurrentThreadId() == thread->tid;  }
> +
> +void qemu_init_realtime(int rt_sched_policy, int max_sched_priority) 
> +{ }
> +
> +bool qemu_realtime_is_enabled(void)
> +{
> +    return false;
> +}
> +
> +void qemu_realtime_get_parameters(int *policy, int *max_priority) { }
> diff --git a/vl.c b/vl.c
> index e6a8d89..c310587 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -29,6 +29,7 @@
>  #include <sys/time.h>
>  #include <zlib.h>
>  #include "qemu/bitmap.h"
> +#include "qemu-thread.h"
>  
>  /* Needed early for CONFIG_BSD etc. */  #include "config-host.h"
> @@ -1148,6 +1149,17 @@ static void smp_parse(const char *optarg)
>          max_cpus = smp_cpus;
>  }
>  
> +static void configure_realtime(QemuOpts *opts) {
> +    const char *pol;
> +    int prio;
> +
> +    pol = qemu_opt_get(opts, "policy");
> +    prio = qemu_opt_get_number(opts, "maxprio", 1);
> +
> +    os_prioritize(pol, prio);
> +}
> +
>  /***********************************************************/
>  /* USB devices */
>  
> @@ -1754,9 +1766,22 @@ static void main_loop(void)  {
>      bool nonblocking;
>      int last_io = 0;
> +    int rt_policy, rt_priority;
> +    struct sched_param sp;
>  #ifdef CONFIG_PROFILER
>      int64_t ti;
>  #endif
> +
> +    if (qemu_realtime_is_enabled()) {
> +        qemu_realtime_get_parameters(&rt_policy, &rt_priority);
> +
> +        sp.sched_priority = rt_priority;;
> +        if (sched_setscheduler(0, rt_policy, &sp) < 0) {
> +            perror("Setting realtime policy failed");
> +            exit(1);
> +        }
> +    }
> +
>      do {
>          nonblocking = !kvm_enabled() && last_io > 0;  #ifdef 
> CONFIG_PROFILER @@ -2758,6 +2783,14 @@ int main(int argc, char **argv, 
> char **envp)
>                  }
>                  numa_add(optarg);
>                  break;
> +            case QEMU_OPTION_realtime:
> +                opts = qemu_opts_parse(qemu_find_opts("realtime"), optarg, 0);
> +                if (!opts) {
> +                    fprintf(stderr, "parse error: %s\n", optarg);
> +                    exit(1);
> +                }
> +                configure_realtime(opts);
> +                break;
>              case QEMU_OPTION_display:
>                  display_type = select_display(optarg);
>                  break;
>

[Dle-develop] [PATCH v4 2/2] efi_pstore: Avoid deadlock in non-blocking paths

From: Seiji A. <sei...@hd...> - 2013-01-11 18:10:16

[Issue]

There is a scenario which efi_pstore may hang up:

 - cpuA grabs efivars->lock
 - cpuB panics and calls smp_send_stop
 - smp_send_stop sends IRQ to cpuA
 - after 1 second, cpuB gives up on cpuA and sends an NMI instead
 - cpuA is now in an NMI handler while still holding efivars->lock
 - cpuB is deadlocked

This case may happen if a firmware has a bug and
cpuA is stuck talking with it.

[Solution]

This patch changes a spin_lock to a spin_trylock in non-blocking paths,
and if the spin_lock has already been taken by another cpu,
it returns without accessing the firmware to avoid the deadlock.

Signed-off-by: Seiji Aguchi <sei...@hd...>
Acked-by: Don Zickus <dz...@re...>
Acked-by: Tony Luck <ton...@in...>
---
 drivers/firmware/efivars.c |   11 ++++++++++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index 7b1c374..ef5070d 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -1209,7 +1209,16 @@ static int efi_pstore_write(enum pstore_type_id type,
 	u64 storage_space, remaining_space, max_variable_size;
 	efi_status_t status = EFI_NOT_FOUND;
 
-	spin_lock(&efivars->lock);
+	if (pstore_cannot_block_path(reason)) {
+		/*
+		 * If the lock is taken by another cpu in non-blocking path,
+		 * this driver returns without entering firmware to avoid
+		 * hanging up.
+		 */
+		if (!spin_trylock(&efivars->lock))
+			return -EBUSY;
+	} else
+		spin_lock(&efivars->lock);
 
 	/*
 	 * Check if there is a space enough to log.
-- 1.7.1

[Dle-develop] [PATCH v4 1/2] pstore: Avoid deadlock in panic and emergency-restart path

From: Seiji A. <sei...@hd...> - 2013-01-11 18:09:51

[Issue]

When pstore is in panic and emergency-restart paths, it may be blocked
in those paths because it simply takes spin_lock.

This is an example scenario which pstore may hang up in a panic path:

 - cpuA grabs psinfo->buf_lock
 - cpuB panics and calls smp_send_stop
 - smp_send_stop sends IRQ to cpuA
 - after 1 second, cpuB gives up on cpuA and sends an NMI instead
 - cpuA is now in an NMI handler while still holding buf_lock
 - cpuB is deadlocked

This case may happen if a firmware has a bug and 
cpuA is stuck talking with it more than one second.

Also, this is a similar scenario in an emergency-restart path:
 
 - cpuA grabs psinfo->buf_lock and gets stuck in firmware
 - cpuB kicks emergency-restart via either sysrq-b or hangcheck timer.
   And then, cpuB is deadlocked by taking psinfo->buf_lock again.

[Solution]

This patch avoids the deadlocking issues in both panic and emergency_restart
paths by introducing a function, pstore_cannot_block_path(), to check if a cpu
can be blocked in the current path.

With this patch, pstore is not blocked even if another cpu has 
taken a spin_lock, in those paths by changing from spin_lock_irqsave 
to spin_trylock_irqsave.

In addition, according to a comment of emergency_restart() in kernel/sys.c,
spin_lock shouldn't be taken in an emergency_restart path to avoid
deadlock. This patch fits the comment below.

<snip>
/**
 *      emergency_restart - reboot the system
 *
 *      Without shutting down any hardware or taking any locks
 *      reboot the system.  This is called when we know we are in
 *      trouble so this is our best effort to reboot.  This is
 *      safe to call in interrupt context.
 */
void emergency_restart(void)
<snip>

Signed-off-by: Seiji Aguchi <sei...@hd...>
Acked-by: Don Zickus <dz...@re...>
Acked-by: Tony Luck <ton...@in...>
---
 fs/pstore/platform.c   |   35 +++++++++++++++++++++++++++++------
 include/linux/pstore.h |    6 ++++++
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 5ea2e77..86d1038 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
 	}
 }
 
+bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+	/*
+	 * In case of NMI path, pstore shouldn't be blocked
+	 * regardless of reason.
+	 */
+	if (in_nmi())
+		return true;
+
+	switch (reason) {
+	/* In panic case, other cpus are stopped by smp_send_stop(). */
+	case KMSG_DUMP_PANIC:
+	/* Emergency restart shouldn't be blocked by spin lock. */
+	case KMSG_DUMP_EMERG:
+		return true;
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+
 /*
  * callback from kmsg_dump. (s2,l2) has the most recently
  * written bytes, older bytes are in (s1,l1). Save as much
@@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 
 	why = get_reason_str(reason);
 
-	if (in_nmi()) {
-		is_locked = spin_trylock(&psinfo->buf_lock);
-		if (!is_locked)
-			pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
+	if (pstore_cannot_block_path(reason)) {
+		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
+		if (!is_locked) {
+			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
+				       , in_nmi() ? "NMI" : why);
+		}
 	} else
 		spin_lock_irqsave(&psinfo->buf_lock, flags);
 	oopscount++;
@@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		total += hsize + len;
 		part++;
 	}
-	if (in_nmi()) {
+	if (pstore_cannot_block_path(reason)) {
 		if (is_locked)
-			spin_unlock(&psinfo->buf_lock);
+			spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 	} else
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 }
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 1788909..75d0176 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -68,12 +68,18 @@ struct pstore_info {
 
 #ifdef CONFIG_PSTORE
 extern int pstore_register(struct pstore_info *);
+extern bool pstore_cannot_block_path(enum kmsg_dump_reason reason);
 #else
 static inline int
 pstore_register(struct pstore_info *psi)
 {
 	return -ENODEV;
 }
+static inline bool
+pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+	return false;
+}
 #endif
 
 #endif /*_LINUX_PSTORE_H*/
-- 1.7.1

[Dle-develop] [PATCH v4 0/2] pstore, efi_pstore: Avoid deadlock in non-blocking paths

From: Seiji A. <sei...@hd...> - 2013-01-11 18:09:28

Changelog 
v3 -> v4
 - Add EXPORT_SYMBOL_GPL(pstore_cannot_block_path) to build successfully
   with CONFIG_EFI_VARS=m. (PATCH 1/2)

v2 -> v3
 - Merge modifications of pstore part in 2/2 to 1/2.
 - Rename pstore_is_non_blocking_path() to pstore_cannot_block_path().

v1 -> v2
 - Erase a logic checking the number of online cpus.
 - Create a patchset to fix deadlocking issue in both pstore filesystem and
   efi_pstore driver.
   - Introduce a function, is_non_blocking_path(), to check if pstore 
     is in panic and emergency-restart paths (PATCH 1/2)
   - Avoid efi_pstore_driver is blocked in non-blocking paths
     such as nmi, panic and emergency-restart paths (PATCH 2/2)

[Issue]

There are some paths in kernel which shouldn't be blocked, 
like NMI, panic case after stopping cpus, emergency-restart.

On the other hand, current pstore avoids blocking in a NMI path 
but it may be blocked in other paths.
Also, an efi_pstore driver may be blocked in all of those paths 
because it simply takes a spin lock at writing time.

If they are blocked in those paths, the system will hang up and
it has a big impact for users.

Here is an example scenario which pstore is blocked in panic path.

 - cpuA grabs psinfo->buf_lock
 - cpuB panics and calls smp_send_stop
 - smp_send_stop sends IRQ to cpuA
   after 1 second, cpuB gives up on cpuA and sends an NMI instead
 - cpuA is now in an NMI handler while still holding buf_lock.
   And then, cpuB is deadlocked by taking efi_pstore->lock again.

This case may happen if a firmware has a bug and cpuA is stuck in it.

[Solution]

This patchset avoids that pstore and efi_pstore driver are blocked 
in the non-blocking paths like NMI, panic, and emergency-restart
by introducing a function checking if they are in those paths.
Please see each patch for detailed explanations.

Seiji Aguchi (2):
  [PATCH v4 1/2] pstore: Avoid deadlock in panic and emergency-restart path
  [PATCH v4 2/2] efi_pstore: Avoid deadlock in non-blocking paths

 drivers/firmware/efivars.c |   11 ++++++++++-
 fs/pstore/platform.c       |   35 +++++++++++++++++++++++++++++------
 include/linux/pstore.h     |    6 ++++++
 3 files changed, 45 insertions(+), 7 deletions(-)

Re: [Dle-develop] [PATCH v3 1/2] pstore: Avoid deadlock in panic and emergency-restart path

From: Tony L. <ton...@gm...> - 2013-01-10 19:31:21

On Thu, Jan 10, 2013 at 10:23 AM, Seiji Aguchi <sei...@hd...> wrote:
> Please apply these to your tree.

Ok. Applied and pushed to my "next" branch. Should show up in linux-next
in the next day or two.

-Tony

Re: [Dle-develop] [PATCH v3 1/2] pstore: Avoid deadlock in panic and emergency-restart path

From: Seiji A. <sei...@hd...> - 2013-01-10 18:23:30

> Acked-by: Tony Luck <ton...@in...>
> 
> [Also Ack for part2 the touches efivars.c]
> 

Thanks :)

> -Tony
> 
> [Or are you asking me to apply these rather than just Ack them??]

Please apply these to your tree.

Seiji

> -----Original Message-----
> From: Tony Luck [mailto:ton...@gm...]
> Sent: Thursday, January 10, 2013 1:21 PM
> To: Seiji Aguchi
> Cc: lin...@vg...; dz...@re...; cc...@an...; kee...@ch...; cbo...@gm...;
> Satoru Moriya; dle...@li...
> Subject: Re: [PATCH v3 1/2] pstore: Avoid deadlock in panic and emergency-restart path
> 
> On Thu, Dec 20, 2012 at 7:12 AM, Seiji Aguchi <sei...@hd...> wrote:
> > +       if (pstore_cannot_block_path(reason)) {
> > +               is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
> > +               if (!is_locked) {
> > +                       pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
> > +                                      , in_nmi() ? "NMI" : why);
> > +               }
> 
> My only quibble with this patchset is this message. The sentiment is nice, but nobody will see it. kmsg_dump has already picked the
> pieces of log_buf that will be saved to pstore - so this new message won't be included.  I suppose it will show up on a serial console -
> but if a user has a serial console, they don't need to use pstore.
> 
> But I don't think it is likely to hurt us (to get this far in a panic we already printed a bunch of stuff to the console and I can't think of a
> credible scenario where a few extra bytes would run into a problem that the earlier messages didn't).
> 
> So:
> 
> Acked-by: Tony Luck <ton...@in...>
> 
> [Also Ack for part2 the touches efivars.c]
> 
> -Tony
> 
> [Or are you asking me to apply these rather than just Ack them??]

Re: [Dle-develop] [PATCH v3 1/2] pstore: Avoid deadlock in panic and emergency-restart path

From: Tony L. <ton...@gm...> - 2013-01-10 18:21:05

On Thu, Dec 20, 2012 at 7:12 AM, Seiji Aguchi <sei...@hd...> wrote:
> +       if (pstore_cannot_block_path(reason)) {
> +               is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
> +               if (!is_locked) {
> +                       pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
> +                                      , in_nmi() ? "NMI" : why);
> +               }

My only quibble with this patchset is this message. The sentiment is
nice, but nobody will see it. kmsg_dump has already picked the
pieces of log_buf that will be saved to pstore - so this new message
won't be included.  I suppose it will show up on a serial console - but
if a user has a serial console, they don't need to use pstore.

But I don't think it is likely to hurt us (to get this far in a panic we already
printed a bunch of stuff to the console and I can't think of a credible
scenario where a few extra bytes would run into a problem that the
earlier messages didn't).

So:

Acked-by: Tony Luck <ton...@in...>

[Also Ack for part2 the touches efivars.c]

-Tony

[Or are you asking me to apply these rather than just Ack them??]

[Dle-develop] [RFC][PATCH v7 2/2]trace, x86: code-sharing between non-trace and trace irq handlers

From: Seiji A. <sei...@hd...> - 2013-01-10 17:35:31

[Issue]

Currently, irq vector handlers for tracing are just
copies of the non-trace handlers with tracepoints inserted.

This duplicated code is difficult to manage.

[Solution]

This patch shares common codes between non-trace and trace handlers
as follows to make them manageable and readable.

Non-trace irq handler:
smp_irq_handler()
{
	entering_irq(); /* pre-processing of this handler */
	__smp_irq_handler(); /* 
                          * common logic between non-trace and trace handlers 
                          * in a vector.
                          */
	exiting_irq(); /* post-processing of this handler */

}

Trace irq_handler:
smp_trace_irq_handler()
{
	entering_irq(); /* pre-processing of this handler */
	trace_irq_entry(); /* tracepoint for irq entry */
	__smp_irq_handler(); /* 
                          * common logic between non-trace and trace handlers 
                          * in a vector.
                          */
	trace_irq_exit(); /* tracepoint for irq exit */
	exiting_irq(); /* post-processing of this handler */

}

Signed-off-by: Seiji Aguchi <sei...@hd...>
---
 arch/x86/kernel/apic/apic.c              |  103 ++++++++----------------------
 arch/x86/kernel/cpu/mcheck/therm_throt.c |   24 +++----
 arch/x86/kernel/cpu/mcheck/threshold.c   |   24 +++----
 arch/x86/kernel/irq.c                    |   34 +++-------
 arch/x86/kernel/irq_work.c               |   22 ++++--
 arch/x86/kernel/smp.c                    |   54 ++++++++++------
 include/linux/hardirq.h                  |   25 +++++++
 7 files changed, 135 insertions(+), 151 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 89f3f4d..c146cbc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -922,17 +922,14 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
+	 *
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 	local_apic_timer_interrupt();
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 }
@@ -944,19 +941,16 @@ void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
+	 *
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 	trace_local_timer_entry(LOCAL_TIMER_VECTOR);
 	local_apic_timer_interrupt();
 	trace_local_timer_exit(LOCAL_TIMER_VECTOR);
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 }
@@ -1935,12 +1929,10 @@ int __init APIC_init_uniprocessor(void)
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
-void smp_spurious_interrupt(struct pt_regs *regs)
+static inline void __smp_spurious_interrupt(void)
 {
 	u32 v;
 
-	irq_enter();
-	exit_idle();
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
 	 * if it is a vectored one.  Just in case...
@@ -1955,38 +1947,28 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
 	pr_info("spurious APIC interrupt on CPU#%d, "
 		"should never happen.\n", smp_processor_id());
-	irq_exit();
 }
 
-void smp_trace_spurious_interrupt(struct pt_regs *regs)
+void smp_spurious_interrupt(struct pt_regs *regs)
 {
-	u32 v;
+	entering_irq();
+	__smp_spurious_interrupt();
+	exiting_irq();
+}
 
-	irq_enter();
-	exit_idle();
+void smp_trace_spurious_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
 	trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
-	/*
-	 * Check if this really is a spurious interrupt and ACK it
-	 * if it is a vectored one.  Just in case...
-	 * Spurious interrupts should not be ACKed.
-	 */
-	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-		ack_APIC_irq();
-
-	inc_irq_stat(irq_spurious_count);
-
-	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
-	pr_info("spurious APIC interrupt on CPU#%d, "
-		"should never happen.\n", smp_processor_id());
+	__smp_spurious_interrupt();
 	trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 /*
  * This interrupt should never happen with our APIC/SMP architecture
  */
-void smp_error_interrupt(struct pt_regs *regs)
+static inline void __smp_error_interrupt(struct pt_regs *regs)
 {
 	u32 v0, v1;
 	u32 i = 0;
@@ -2001,8 +1983,6 @@ void smp_error_interrupt(struct pt_regs *regs)
 		"Illegal register address",	/* APIC Error Bit 7 */
 	};
 
-	irq_enter();
-	exit_idle();
 	/* First tickle the hardware, only then report what went on. -- REW */
 	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
@@ -2023,49 +2003,22 @@ void smp_error_interrupt(struct pt_regs *regs)
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
 
-	irq_exit();
 }
 
-void smp_trace_error_interrupt(struct pt_regs *regs)
+void smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v0, v1;
-	u32 i = 0;
-	static const char * const error_interrupt_reason[] = {
-		"Send CS error",		/* APIC Error Bit 0 */
-		"Receive CS error",		/* APIC Error Bit 1 */
-		"Send accept error",		/* APIC Error Bit 2 */
-		"Receive accept error",		/* APIC Error Bit 3 */
-		"Redirectable IPI",		/* APIC Error Bit 4 */
-		"Send illegal vector",		/* APIC Error Bit 5 */
-		"Received illegal vector",	/* APIC Error Bit 6 */
-		"Illegal register address",	/* APIC Error Bit 7 */
-	};
+	entering_irq();
+	__smp_error_interrupt(regs);
+	exiting_irq();
+}
 
-	irq_enter();
-	exit_idle();
+void smp_trace_error_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
 	trace_error_apic_entry(ERROR_APIC_VECTOR);
-	/* First tickle the hardware, only then report what went on. -- REW */
-	v0 = apic_read(APIC_ESR);
-	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
-	ack_APIC_irq();
-	atomic_inc(&irq_err_count);
-
-	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
-		    smp_processor_id(), v0 , v1);
-
-	v1 = v1 & 0xff;
-	while (v1) {
-		if (v1 & 0x1)
-			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
-		i++;
-		v1 >>= 1;
-	}
-
-	apic_printk(APIC_DEBUG, KERN_CONT "\n");
-
+	__smp_error_interrupt(regs);
 	trace_error_apic_exit(ERROR_APIC_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e7aa7fc..2f3a799 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -379,28 +379,26 @@ static void unexpected_thermal_interrupt(void)
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
 
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+static inline void __smp_thermal_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
 	inc_irq_stat(irq_thermal_count);
 	smp_thermal_vector();
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+}
+
+asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+{
+	entering_irq();
+	__smp_thermal_interrupt();
+	exiting_ack_irq();
 }
 
 asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs)
 {
-	irq_enter();
-	exit_idle();
+	entering_irq();
 	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
-	inc_irq_stat(irq_thermal_count);
-	smp_thermal_vector();
+	__smp_thermal_interrupt();
 	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+	exiting_ack_irq();
 }
 
 /* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 0cbef99..fe6b1c8 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -18,26 +18,24 @@ static void default_threshold_interrupt(void)
 
 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
-asmlinkage void smp_threshold_interrupt(void)
+static inline void __smp_threshold_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
 	inc_irq_stat(irq_threshold_count);
 	mce_threshold_vector();
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+}
+
+asmlinkage void smp_threshold_interrupt(void)
+{
+	entering_irq();
+	__smp_threshold_interrupt();
+	exiting_ack_irq();
 }
 
 asmlinkage void smp_trace_threshold_interrupt(void)
 {
-	irq_enter();
-	exit_idle();
+	entering_irq();
 	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
-	inc_irq_stat(irq_threshold_count);
-	mce_threshold_vector();
+	__smp_threshold_interrupt();
 	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
-	irq_exit();
-	/* Ack only at the end to avoid potential reentry */
-	ack_APIC_irq();
+	exiting_ack_irq();
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 216bec1..ae836cd 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -209,23 +209,21 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 /*
  * Handler for X86_PLATFORM_IPI_VECTOR.
  */
-void smp_x86_platform_ipi(struct pt_regs *regs)
+void __smp_x86_platform_ipi(void)
 {
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
 	inc_irq_stat(x86_platform_ipis);
 
 	if (x86_platform_ipi_callback)
 		x86_platform_ipi_callback();
+}
 
-	irq_exit();
+void smp_x86_platform_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	entering_ack_irq();
+	__smp_x86_platform_ipi();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
@@ -233,21 +231,11 @@ void smp_trace_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
-	inc_irq_stat(x86_platform_ipis);
-
-	if (x86_platform_ipi_callback)
-		x86_platform_ipi_callback();
-
+	__smp_x86_platform_ipi();
 	trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
-	irq_exit();
-
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 09e6262..636a55e 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,24 +10,32 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
 
-void smp_irq_work_interrupt(struct pt_regs *regs)
+static inline void irq_work_entering_irq(void)
 {
 	irq_enter();
 	ack_APIC_irq();
+}
+
+static inline void __smp_irq_work_interrupt(void)
+{
 	inc_irq_stat(apic_irq_work_irqs);
 	irq_work_run();
-	irq_exit();
+}
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+	irq_work_entering_irq();
+	__smp_irq_work_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_enter();
-	ack_APIC_irq();
+	irq_work_entering_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
-	inc_irq_stat(apic_irq_work_irqs);
-	irq_work_run();
+	__smp_irq_work_interrupt();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 void arch_irq_work_raise(void)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index aad58af..7b194a2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -250,11 +250,16 @@ finish:
 /*
  * Reschedule call back.
  */
-void smp_reschedule_interrupt(struct pt_regs *regs)
+static inline void __smp_reschedule_interrupt(void)
 {
-	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
 	scheduler_ipi();
+}
+
+void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	__smp_reschedule_interrupt();
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
@@ -264,52 +269,61 @@ void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
-	inc_irq_stat(irq_resched_count);
-	scheduler_ipi();
+	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
 }
 
-void smp_call_function_interrupt(struct pt_regs *regs)
+static inline void call_function_entering_irq()
 {
 	ack_APIC_irq();
 	irq_enter();
+}
+
+static inline void __smp_call_function_interrupt(void)
+{
 	generic_smp_call_function_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
+}
+
+void smp_call_function_interrupt(struct pt_regs *regs)
+{
+	call_function_entering_irq();
+	__smp_call_function_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	ack_APIC_irq();
-	irq_enter();
+	call_function_entering_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
-	generic_smp_call_function_interrupt();
-	inc_irq_stat(irq_call_count);
+	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
-void smp_call_function_single_interrupt(struct pt_regs *regs)
+static inline void __smp_call_function_single_interrupt(void)
 {
-	ack_APIC_irq();
-	irq_enter();
 	generic_smp_call_function_single_interrupt();
 	inc_irq_stat(irq_call_count);
-	irq_exit();
+}
+
+void smp_call_function_single_interrupt(struct pt_regs *regs)
+{
+	call_function_entering_irq();
+	__smp_call_function_single_interrupt();
+	exiting_irq();
 }
 
 void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	ack_APIC_irq();
-	irq_enter();
+	call_function_entering_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
-	generic_smp_call_function_single_interrupt();
-	inc_irq_stat(irq_call_count);
+	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
-	irq_exit();
+	exiting_irq();
 }
 
 static int __init nonmi_ipi_setup(char *str)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 624ef3f..580aa93 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -6,6 +6,7 @@
 #include <linux/ftrace_irq.h>
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
+#include <asm/idle.h>
 
 /*
  * We put the hardirq and softirq counter into the preemption
@@ -198,4 +199,28 @@ extern void irq_exit(void);
 		ftrace_nmi_exit();				\
 	} while (0)
 
+static inline void entering_irq(void)
+{
+	irq_enter();
+	exit_idle();
+}
+
+static inline void entering_ack_irq(void)
+{
+	ack_APIC_irq();
+	entering_irq();
+}
+
+static inline void exiting_irq(void)
+{
+	irq_exit();
+}
+
+static inline void exiting_ack_irq(void)
+{
+        irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+        ack_APIC_irq();
+}
+
 #endif /* LINUX_HARDIRQ_H */
-- 1.7.1

454 messages have been excluded from this view by a project administrator.

Flat | Threaded

<< < 1 .. 24 25 26 27 28 .. 99 > >> (Page 26 of 99)