Hi,

We ran into a problem with crash reporting an invalid memory size when run on a live machine:

======================================
[root@venuslp11 ~]# free -m
             total       used       free     shared    buffers     cached
Mem:          2151       1594        557          0        583        795
              ^^^^^^
-/+ buffers/cache:        215       1935
Swap:         1983          0       1983
[root@venuslp11 ~]# cat /proc/ppc64/lparcfg | grep DesMem
DesMem=2304
       ^^^^^^
[root@venuslp11 ~]# rpm -q crash
crash-4.0-3.7
[root@venuslp11 ~]# crash
...
      KERNEL: /usr/lib/debug/lib/modules/2.6.18-1.2732.el5/vmlinux
    DUMPFILE: /dev/mem
        CPUS: 2
        DATE: Fri Oct 27 07:58:54 2006
      UPTIME: 01:12:34
LOAD AVERAGE: 1.52, 1.05, 0.48
       TASKS: 98
    NODENAME: venuslp11.upt.austin.ibm.com
     RELEASE: 2.6.18-1.2732.el5
     VERSION: #1 SMP Tue Oct 17 18:24:27 EDT 2006
     MACHINE: ppc64  (2301 Mhz)
      MEMORY: 3.2 GB
              ^^^^^^^^
         PID: 25097
     COMMAND: "crash"
        TASK: c000000000fedbe0  [THREAD_INFO: c00000000b190000]
         CPU: 0
       STATE: TASK_RUNNING (ACTIVE)
crash>
==================================

Looking into the code, I found the cause: /proc/meminfo and crash calculate total memory in different ways.

/proc/meminfo traverses memory counting each page, with separate routines to account for the number of pages in highmem, the init section, bootmem, and so on. That would be difficult to replicate in crash. Instead, we can look at the sysfs implementation (/sys/devices/system/node/node<n>/meminfo), where the total page count is taken not from node_spanned_pages but from node_present_pages. The definition of node_present_pages says 'total number of physical pages', while node_spanned_pages says 'total size of physical page range, including holes'.

The inflated total is observed because of the way node 2 is laid out on this machine: its pfn (physical frame number) range starts at 0, while the pfn ranges of nodes 0 and 1 start at 4096 and 8192 respectively. Node 2's spanned_pages therefore double-counts pages that already belong to nodes 0 and 1. Hence I feel it is better to use present_pages, which counts only the pages actually present on the node, excluding the holes.
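To make the double counting concrete, here is a minimal sketch. It only models the two pglist_data fields discussed above; the start pfns follow the layout described for this machine, but the page counts are invented purely for illustration:

#include <stdio.h>

/*
 * Toy model of the relevant pglist_data fields (see the comments in
 * the kernel's include/linux/mmzone.h, quoted above).  The page
 * counts below are hypothetical.
 */
struct node_mem {
        unsigned long start_pfn;      /* node_start_pfn */
        unsigned long spanned_pages;  /* range size, including holes */
        unsigned long present_pages;  /* pages actually present */
};

int main(void)
{
        struct node_mem nodes[] = {
                { 4096,  4096,  4096 },  /* node 0: pfns 4096..8191  */
                { 8192,  4096,  4096 },  /* node 1: pfns 8192..12287 */
                {    0, 16384,  8192 },  /* node 2: starts at pfn 0;
                                          * its spanned range covers
                                          * nodes 0 and 1 as holes   */
        };
        unsigned long spanned = 0, present = 0;

        for (int i = 0; i < 3; i++) {
                spanned += nodes[i].spanned_pages;
                present += nodes[i].present_pages;
        }

        /* 24576: the 8192 pages under nodes 0 and 1 counted twice */
        printf("sum of spanned_pages: %lu\n", spanned);
        /* 16384: every physical page counted exactly once */
        printf("sum of present_pages: %lu\n", present);
        return 0;
}

Summing spanned_pages over the nodes inflates the total by exactly the overlap, which matches the kind of discrepancy seen above (3.2 GB reported vs. ~2.25 GB configured).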
======================================

The patch to fix the problem is below; let me know your opinion.

--- memory.c.orig	2006-11-17 08:03:18.000000000 -0600
+++ memory.c	2006-11-17 08:05:23.000000000 -0600
@@ -11123,7 +11123,7 @@ dump_memory_nodes(int initialize)
 	ulong node_start_paddr;
 	ulong node_start_pfn;
 	ulong node_start_mapnr;
-	ulong node_spanned_pages;
+	ulong node_present_pages;
 	ulong free_pages, zone_size, node_size, cum_zone_size;
 	ulong zone_start_paddr, zone_start_mapnr, zone_mem_map;
 	physaddr_t phys;
@@ -11232,11 +11232,11 @@ dump_memory_nodes(int initialize)
 			readmem(pgdat+OFFSET(pglist_data_node_size),
 				KVADDR, &node_size, sizeof(ulong),
 				"pglist node_size", FAULT_ON_ERROR);
-		else if (VALID_MEMBER(pglist_data_node_spanned_pages)) {
-			readmem(pgdat+OFFSET(pglist_data_node_spanned_pages),
-				KVADDR, &node_spanned_pages, sizeof(ulong),
-				"pglist node_spanned_pages", FAULT_ON_ERROR);
-			node_size = node_spanned_pages;
+		else if (VALID_MEMBER(pglist_data_node_present_pages)) {
+			readmem(pgdat+OFFSET(pglist_data_node_present_pages),
+				KVADDR, &node_present_pages, sizeof(ulong),
+				"pglist node_present_pages", FAULT_ON_ERROR);
+			node_size = node_present_pages;
 		} else
 			error(INFO, "cannot determine zone size\n");
 		readmem(pgdat+OFFSET(pglist_data_bdata), KVADDR, &bdata,
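As a quick sanity check after rebuilding with this change (a hedged sketch, not part of the patch): since, as noted above, /sys/devices/system/node/node<n>/meminfo is driven by node_present_pages, crash's MEMORY line should roughly agree with the sum of the per-node MemTotal values from sysfs. The parsing below assumes the usual "Node <n> MemTotal: <kB> kB" line format:

#include <stdio.h>
#include <string.h>

/*
 * Sum "MemTotal:" across /sys/devices/system/node/node<n>/meminfo
 * for comparison with the total crash reports.  Path and field
 * layout assume a typical NUMA sysfs; adjust for your kernel.
 */
int main(void)
{
        unsigned long long total_kb = 0;

        for (int n = 0; ; n++) {
                char path[128];
                snprintf(path, sizeof(path),
                         "/sys/devices/system/node/node%d/meminfo", n);
                FILE *fp = fopen(path, "r");
                if (!fp)
                        break;  /* no more nodes */

                char line[256];
                while (fgets(line, sizeof(line), fp)) {
                        /* lines look like: "Node 0 MemTotal:  1234 kB" */
                        char *p = strstr(line, "MemTotal:");
                        unsigned long long kb;
                        if (p && sscanf(p, "MemTotal: %llu kB", &kb) == 1) {
                                total_kb += kb;
                                break;
                        }
                }
                fclose(fp);
        }
        printf("sysfs per-node MemTotal sum: %llu kB\n", total_kb);
        return 0;
}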
--
Crash-utility mailing list
Crash-utility@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/crash-utility