Hello Dave,
On 09/09/2014 10:04 PM, Dave Anderson wrote:
> > Many of the changes reflect the contents of per-cpu data structures
> > of offlined cpus, but even though the cpu is currently offline, the
> > data structures still exist. Why prevent the user from viewing their
> > contents?
>
> I think just showing online cpu's data is reasonable.
Why? Give me an example as to when it is/was a problem?
> What about adding a internal crash variables (used by command set) to
> hide/show offline cpu's data?
I suppose that could be done, but again, in my opinion there is no compelling
reason to do so. I could be wrong, but aside from maybe "help -r", it seems
that you are trying to answer a question that nobody's asking.
I know it is important to show data of offline cpu, like debugging hot remove.
But for those who don't care about the removed cpu, hiding offline cpu will be
more clear. Then let me talk about the reason why I think hiding will be more
clear.
I first got a vmcore with 90 cpus at first and 30 of them were physically
removed. After 30 cpus physically removed, the machine works with 60 cpus.
To those who don't care about data of the removed cpu, the following data
is confusing:
1. The machine only got 60 cpus, but crash shows 90 cpus.
2. when I execute command timer, crash show 90 TVEC_BASES, some of them(maybe
exceed 30) are empty. But I have to check which cpu is offline, and then I
can know whether the empty is because of offline cpu or just no timer was set
on that cpu.
3. comes to idle tasks, offline cpu is halt and related idle tasks will not
work, but crash shows they are running right now.
4. ...
After I check kernel, I found when cpu is set to offline, things, processes,
timers, interrupts etc., are migrated to a new cpu. So I tried to hide when
cpu is set offline(logically removed) instead of physically removed.
The attachment is what I am trying to implement. If you don't like it, we can
go on discussing it.
--
Regards
Qiao Nuohan
From 77855d6b2b9e790ae47bf0e3bf96cd58c1b449f4 Mon Sep 17 00:00:00 2001
From: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
Date: Fri, 12 Sep 2014 14:06:03 +0800
Subject: [PATCH 1/3] add an API to check an offline cpu
check_offline_cpu() is used to check whether a cpu is logically
removed(offline).
Signed-off-by: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
---
defs.h | 1 +
kernel.c | 12 ++++++++++++
2 files changed, 13 insertions(+)
diff --git a/defs.h b/defs.h
index cbdaa47..8fbbdeb 100755
--- a/defs.h
+++ b/defs.h
@@ -4893,6 +4893,7 @@ int get_cpus_online(void);
int get_cpus_active(void);
int get_cpus_present(void);
int get_cpus_possible(void);
+int check_offline_cpu(int);
int get_highest_cpu_online(void);
int get_highest_cpu_present(void);
int get_cpus_to_display(void);
diff --git a/kernel.c b/kernel.c
index 87a0b75..8d3db9c 100755
--- a/kernel.c
+++ b/kernel.c
@@ -7988,6 +7988,18 @@ get_cpus_online()
return online;
}
+int
+check_offline_cpu(int cpu)
+{
+ if (!cpu_map_addr("online"))
+ return FALSE;
+
+ if (in_cpu_map(ONLINE_MAP, cpu))
+ return FALSE;
+
+ return TRUE;
+}
+
/*
* If it exists, return the highest cpu number in the cpu_online_map.
*/
--
1.8.5.3
From 26bd107763037f9cd9534ccc19119bd027bf2062 Mon Sep 17 00:00:00 2001
From: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
Date: Sun, 14 Sep 2014 15:00:22 +0800
Subject: [PATCH 2/3] add a flag to display/hide data related to offline cpu
Add a field in pc->flags2 to display/hide data related to offline cpu. This
flag can be changed by the following 2 ways:
1. start crash with "--offline_cpu [on|off]"
2. execute crash command "set offline_cpu on|off"
The default set is offline_cpu off(hiding data related to offline cpu). This
flag will be used by following patches to hide date related to offline cpus.
And if offline_cpu is set to on, those hidden data will be shown as same as
the original output.
Signed-off-by: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
---
defs.h | 1 +
help.c | 6 ++++++
main.c | 15 ++++++++++++++-
tools.c | 15 +++++++++++++++
4 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/defs.h b/defs.h
index 8fbbdeb..19b4b8f 100755
--- a/defs.h
+++ b/defs.h
@@ -505,6 +505,7 @@ struct program_context {
#define REM_PAUSED_F (0x1000ULL)
#define RAMDUMP (0x2000ULL)
#define REMOTE_PAUSED() (pc->flags2 & REM_PAUSED_F)
+#define OFFLINE_CPU (0x4000ULL)
char *cleanup;
char *namelist_orig;
char *namelist_debug_orig;
diff --git a/help.c b/help.c
index 30d7759..7da0728 100755
--- a/help.c
+++ b/help.c
@@ -339,6 +339,10 @@ char *program_usage_info[] = {
" --kvmio <size>",
" override the automatically-calculated KVM guest I/O hole size.",
"",
+ " --offline_cpu [on|off]",
+ " Display/hide data related to offline cpu. Crash command \"set offline",
+ " [on|off]\" can be used to override the set here",
+ "",
"FILES:",
"",
" .crashrc",
@@ -1054,6 +1058,8 @@ char *help_set[] = {
" must be a kernel or module text address, which",
" may be expressed symbolically or as a hexadecimal",
" value.",
+" offline_cpu on | off if on, crash will display data related to offline",
+" cpu.",
" ",
" Internal variables may be set in four manners:\n",
" 1. entering the set command in $HOME/.%src.",
diff --git a/main.c b/main.c
index ad66058..35305c4 100755
--- a/main.c
+++ b/main.c
@@ -70,6 +70,7 @@ static struct option long_options[] = {
{"dec", 0, 0, 0},
{"no_strip", 0, 0, 0},
{"hash", required_argument, 0, 0},
+ {"offline_cpu", required_argument, 0, 0},
{0, 0, 0, 0}
};
@@ -279,7 +280,17 @@ main(int argc, char **argv)
pc->flags2 |= RADIX_OVERRIDE;
pc->output_radix = 10;
}
-
+
+ else if (STREQ(long_options[option_index].name, "offline_cpu")) {
+ if (STREQ(optarg, "on"))
+ pc->flags2 |= OFFLINE_CPU;
+ else if (STREQ(optarg, "off"))
+ pc->flags2 &= ~OFFLINE_CPU;
+ else {
+ error(INFO, "invalid --offline_cpu argument: %s\n", optarg);
+ program_usage(SHORT_FORM);
+ }
+ }
else {
error(INFO, "internal error: option %s unhandled\n",
@@ -1393,6 +1404,8 @@ dump_program_context(void)
fprintf(fp, "%sALLOW_FP", others++ ? "|" : "");
if (pc->flags2 & RAMDUMP)
fprintf(fp, "%sRAMDUMP", others++ ? "|" : "");
+ if (pc->flags2 & OFFLINE_CPU)
+ fprintf(fp, "%sOFFLINE_CPU", others++ ? "|" : "");
fprintf(fp, ")\n");
fprintf(fp, " namelist: %s\n", pc->namelist);
diff --git a/tools.c b/tools.c
index a5f514f..2ed17aa 100755
--- a/tools.c
+++ b/tools.c
@@ -2365,6 +2365,20 @@ cmd_set(void)
"on" : "off");
return;
+ } else if (STREQ(args[optind], "offline_cpu")) {
+
+ if (args[optind+1]) {
+ optind++;
+ if (STREQ(args[optind], "on"))
+ pc->flags2 |= OFFLINE_CPU;
+ else if(STREQ(args[optind], "off"))
+ pc->flags2 &= ~OFFLINE_CPU;
+ else
+ goto invalid_set_command;
+ }
+
+ return;
+
} else if (XEN_HYPER_MODE()) {
error(FATAL, "invalid argument for the Xen hypervisor\n");
} else if (pc->flags & MINIMAL_MODE) {
@@ -2467,6 +2481,7 @@ show_options(void)
fprintf(fp, "(%s)\n", value_to_symstr(pc->scope, buf, 0));
else
fprintf(fp, "(not set)\n");
+ fprintf(fp, " offline_cpu: %s\n", pc->flags2 & OFFLINE_CPU ? "on" : "off");
}
--
1.8.5.3
From c4e21174a425cf8e174e20d7c4cff7dcbb9ccca8 Mon Sep 17 00:00:00 2001
From: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
Date: Sun, 14 Sep 2014 16:20:53 +0800
Subject: [PATCH 3/3] modify timer to hide cpus' data
With this patch, command timer can hide offline cpus's data. Please check
the following example.
cpu #2 is offline. The output of original timer is:
<cut>
crash> timer
TVEC_BASES[0]: ffffffff81c994c0
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310640004 ffff88003fc103c0 ffffffff8107c150 <delayed_work_timer_fn>
4310641008 ffff88003d4be348 ffffffff814f0840 <dev_watchdog>
4310643584 ffffffff819a6c50 ffffffff8107c150 <delayed_work_timer_fn>
4310647936 ffff88003691fc48 ffffffff8107c150 <delayed_work_timer_fn>
4310648544 ffff88003b781db0 ffffffff8106e070 <process_timeout>
4310868000 ffff88003fc0d920 ffffffff81030610 <mce_timer_fn>
4310921216 ffffffff81e147d0 ffffffff8154f790 <inet_frag_secret_rebuild>
TVEC_BASES[1]: ffff88003db08000
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310635000 ffff88003fc903c0 ffffffff8107c150 <delayed_work_timer_fn>
4310665216 ffffffff819a75c0 ffffffff8107c150 <delayed_work_timer_fn>
4310867997 ffff88003fc8d920 ffffffff81030610 <mce_timer_fn>
4310933504 ffffffffa0311750 ffffffff8154f790 <inet_frag_secret_rebuild>
4380688384 ffff880036dae568 ffffffff8157ed30 <ipv6_regen_rndid>
TVEC_BASES[2]: ffff88003db20000
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
TVEC_BASES[3]: ffff88003db44000
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310639992 ffff88003fd903c0 ffffffff8107c150 <delayed_work_timer_fn>
4310641152 ffff880036ef5380 ffffffff8107c150 <delayed_work_timer_fn>
4310642176 ffff88003691f048 ffffffff8107c150 <delayed_work_timer_fn>
4310651392 ffffffff819ab310 ffffffff8107c150 <delayed_work_timer_fn>
4310666112 ffff880036a40cc8 ffffffff8129a140 <blk_rq_timed_out_timer>
4310704128 ffff88003fd93e40 ffffffff8107d1a0 <idle_worker_timeout>
4310760448 ffffffff819aaae0 ffffffff81581aa0 <addrconf_verify>
4310867991 ffff88003fd8d920 ffffffff81030610 <mce_timer_fn>
4310921216 ffffffff81e0f710 ffffffff814e4dc0 <flow_cache_new_hashrnd>
4310921216 ffffffff81e1b4d0 ffffffff8154f790 <inet_frag_secret_rebuild>
<cut>
With this patch, TVEC_BASES[2] is hide:
<cut>
TVEC_BASES[0]: ffffffff81c994c0
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310640004 ffff88003fc103c0 ffffffff8107c150 <delayed_work_timer_fn>
4310641008 ffff88003d4be348 ffffffff814f0840 <dev_watchdog>
4310643584 ffffffff819a6c50 ffffffff8107c150 <delayed_work_timer_fn>
4310647936 ffff88003691fc48 ffffffff8107c150 <delayed_work_timer_fn>
4310648544 ffff88003b781db0 ffffffff8106e070 <process_timeout>
4310868000 ffff88003fc0d920 ffffffff81030610 <mce_timer_fn>
4310921216 ffffffff81e147d0 ffffffff8154f790 <inet_frag_secret_rebuild>
TVEC_BASES[1]: ffff88003db08000
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310635000 ffff88003fc903c0 ffffffff8107c150 <delayed_work_timer_fn>
4310665216 ffffffff819a75c0 ffffffff8107c150 <delayed_work_timer_fn>
4310867997 ffff88003fc8d920 ffffffff81030610 <mce_timer_fn>
4310933504 ffffffffa0311750 ffffffff8154f790 <inet_frag_secret_rebuild>
4380688384 ffff880036dae568 ffffffff8157ed30 <ipv6_regen_rndid>
TVEC_BASES[3]: ffff88003db44000
JIFFIES
4310640297
EXPIRES TIMER_LIST FUNCTION
4310639992 ffff88003fd903c0 ffffffff8107c150 <delayed_work_timer_fn>
4310641152 ffff880036ef5380 ffffffff8107c150 <delayed_work_timer_fn>
4310642176 ffff88003691f048 ffffffff8107c150 <delayed_work_timer_fn>
4310651392 ffffffff819ab310 ffffffff8107c150 <delayed_work_timer_fn>
4310666112 ffff880036a40cc8 ffffffff8129a140 <blk_rq_timed_out_timer>
4310704128 ffff88003fd93e40 ffffffff8107d1a0 <idle_worker_timeout>
4310760448 ffffffff819aaae0 ffffffff81581aa0 <addrconf_verify>
4310867991 ffff88003fd8d920 ffffffff81030610 <mce_timer_fn>
4310921216 ffffffff81e0f710 ffffffff814e4dc0 <flow_cache_new_hashrnd>
4310921216 ffffffff81e1b4d0 ffffffff8154f790 <inet_frag_secret_rebuild>
<cut>
Signed-off-by: Qiao Nuohan <qiaonuohan@xxxxxxxxxxxxxx>
---
kernel.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel.c b/kernel.c
index 8d3db9c..0023038 100755
--- a/kernel.c
+++ b/kernel.c
@@ -7399,6 +7399,11 @@ dump_timer_data_tvec_bases_v2(void)
cpu = 0;
next_cpu:
+ if (!(pc->flags2 & OFFLINE_CPU) && check_offline_cpu(cpu)) {
+ if (++cpu < kt->cpus)
+ goto next_cpu;
+ }
+
count = 0;
td = (struct timer_data *)NULL;
--
1.8.5.3
--
Crash-utility mailing list
Crash-utility@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/crash-utility