[PATCH 1/2] x86/CPU/AMD: Present package as die instead of socket

Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx> · Tue, 27 Jun 2017 01:40:52 -0500

According to the Documentation/x86/topology.txt, AMD nomenclature for
package is NUMA node (or die). However, this is not the case on AMD
family17h multi-die processor platforms, which can have up to 4 dies
per socket as shown in the following system topology.

Die (Dx) View :
             ----------------------------
         C0  | T0 T1 |    ||    | T0 T1 | C4
             --------|    ||    |--------
         C1  | T0 T1 | L3 || L3 | T0 T1 | C5
             --------|    ||    |--------
         C2  | T0 T1 | #0 || #1 | T0 T1 | C6
             --------|    ||    |--------
         C3  | T0 T1 |    ||    | T0 T1 | C7
             ----------------------------

System View (with 2 sockets) :
           --------------------
           |     -------------|------
           |     |            |     |
         ------------       ------------
         | D1 -- D0 |       | D7 -- D6 |
         | |  \/ |  |       | |  \/ |  |
 SOCKET0 | |  /\ |  |       | |  /\ |  | SOCKET1
         | D2 -- D3 |       | D4 -- D5 |
         ------------       ------------
           |     |            |     |
           ------|------------|     |
                 --------------------

Current logic interprets package as socket (i.e. phys_proc_id is
socket id), which results in setting x86_has_numa_in_package, and omits
the DIE schedule domain. However, NUMA schedule domains are derived from
SRAT/SLIT, which assume a NUMA node is a die, and are built on top of
NUMA nodes. This results in incomplete schedule domains
as follows:
    domain 0: SMT
    domain 1: MC       /* core complex w/ shared L3*/
    ---- Missing DIE level domain ----
    domain 2: NUMA     /* socket */
    domain 3: NUMA     /* platform */

Presenting package-as-die does not set x86_has_numa_in_package.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
Signed-off-by: Leo Duran <leo.duran@xxxxxxx>
Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx> # v4.10+
---
 arch/x86/kernel/cpu/amd.c | 189 +++++++++++++++++++++++++++-------------------
 1 file changed, 112 insertions(+), 77 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bb5abe8..2f5869c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "x86/AMD: " fmt
+
 #include <linux/export.h>
 #include <linux/bitops.h>
 #include <linux/elf.h>
@@ -32,6 +34,12 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
  */
 static u32 nodes_per_socket = 1;
 
+/*
+ * l3_num_threads_sharing: Stores the number of threads sharing L3 cache.
+ * Refer to CPUID_Fn8000001D_EAX_x03 [Cache Properties (L3)] NumSharingCache.
+ */
+static u32 l3_num_threads_sharing;
+
 static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
 {
 	u32 gprs[8] = { 0 };
@@ -296,96 +304,122 @@ static int nearby_node(int apicid)
 }
 #endif
 
+#ifdef CONFIG_SMP
+
 /*
- * Fixup core topology information for
- * (1) AMD multi-node processors
+ * Per Documentation/x86/topology.txt, the kernel works with
+ *  {packages, cores, threads}, and we will map:
+ *
+ *  thread  = core in compute-unit (CMT), or thread in core (SMT)
+ *  core    = compute-unit (CMT), or core (SMT)
+ *  package = node (die)
+ *
+ * Discover topology based on available information from CPUID first,
+ * and only derive them as needed.
+ *
+ * (1) phys_proc_id is die ID in AMD multi-die processors.
  *     Assumption: Number of cores in each internal node is the same.
- * (2) AMD processors supporting compute units
+ * (2) cpu_core_id is derived from either CPUID topology extension
+ *     or initial APIC_ID.
+ * (3) cpu_llc_id is either L3 or per-node
  */
-#ifdef CONFIG_SMP
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
-	u8 node_id;
 	int cpu = smp_processor_id();
 
-	/* get information required for multi-node processors */
 	if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
 
-		node_id  = ecx & 0xff;
+		c->phys_proc_id = ecx & 0xff;
 		smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
 
-		if (c->x86 == 0x15)
-			c->cu_id = ebx & 0xff;
-
-		if (c->x86 >= 0x17) {
-			c->cpu_core_id = ebx & 0xff;
-
-			if (smp_num_siblings > 1)
-				c->x86_max_cores /= smp_num_siblings;
-		}
+		/* Adjustment to get core per die */
+		c->x86_max_cores /= smp_num_siblings;
 
 		/*
-		 * We may have multiple LLCs if L3 caches exist, so check if we
-		 * have an L3 cache by looking at the L3 cache CPUID leaf.
+		 * For family15h/16h, this is ComputeUnitId per socket
+		 * For family17h, this is CoreId per socket
 		 */
+		c->cpu_core_id = (ebx & 0xff);
+
 		if (cpuid_edx(0x80000006)) {
-			if (c->x86 == 0x17) {
+			cpuid_count(0x8000001d, 3, &eax, &ebx, &ecx, &edx);
+			l3_num_threads_sharing = ((eax >> 14) & 0xfff) + 1;
+		}
+
+		if (c->x86 == 0x17) {
+			/*
+			 * In family 17h, the CPUID_Fn8000001E_EBX[7:0] (CoreId)
+			 * is non-contiguous in down-coring and non-SMT cases.
+			 * This logic fixes up the cpu_core_id to be contiguous
+			 * for cores within the die.
+			 */
+			u32 tmp = c->cpu_core_id;
+			u32 die_offset, ccx_offset, cpu_offset;
+
+			if (smp_num_siblings == 1) {
 				/*
-				 * LLC is at the core complex level.
-				 * Core complex id is ApicId[3].
+				 * For SMT-disabled case, the CoreId bit-encoding is
+				 * [7:4] : die
+				 * [3]   : ccx
+				 * [2:0] : core
 				 */
-				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+				die_offset = ((tmp >> 4) & 0xf) * c->x86_max_cores;
+				ccx_offset = ((tmp >> 3) & 1) * l3_num_threads_sharing;
+				cpu_offset = tmp & 7;
 			} else {
-				/* LLC is at the node level. */
-				per_cpu(cpu_llc_id, cpu) = node_id;
+				/*
+				 * For SMT-enabled case, the CoreId bit-encoding is
+				 * [7:3] : die
+				 * [2]   : ccx
+				 * [1:0] : core
+				 */
+				die_offset = ((tmp >> 3) & 0x1f) * c->x86_max_cores;
+				ccx_offset = ((tmp >> 2) & 1) * l3_num_threads_sharing / smp_num_siblings;
+				cpu_offset = tmp & 3;
 			}
+			c->cpu_core_id = die_offset + ccx_offset + cpu_offset;
+			pr_debug("Fixup CoreId:%#x to cpu_core_id:%#x\n", tmp, c->cpu_core_id);
 		}
-	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
-		u64 value;
+	} else {
+		if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+			u64 value;
 
-		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		node_id = value & 7;
-
-		per_cpu(cpu_llc_id, cpu) = node_id;
-	} else
-		return;
-
-	/* fixup multi-node processor information */
-	if (nodes_per_socket > 1) {
-		u32 cus_per_node;
-
-		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-		cus_per_node = c->x86_max_cores / nodes_per_socket;
+			/* Use MSR provided node ID */
+			rdmsrl(MSR_FAM10H_NODE_ID, value);
+			c->phys_proc_id = value & 7;
+		} else {
+			/*
+			 * On older AMD dual core setup the lower
+			 * bits of the APIC id distinguish the cores.
+			 * Assumes number of cores is a power of two.
+			 */
+			c->phys_proc_id = c->initial_apicid >> c->x86_coreid_bits;
+		}
 
-		/* core id has to be in the [0 .. cores_per_node - 1] range */
-		c->cpu_core_id %= cus_per_node;
+		/* Get core id from APIC */
+		c->cpu_core_id = c->initial_apicid & ((1 << c->x86_coreid_bits) - 1);
 	}
-}
-#endif
 
-/*
- * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits;
-	int cpu = smp_processor_id();
+	/* core id has to be in the [0 .. cores_per_die - 1] range */
+	c->cpu_core_id %= c->x86_max_cores;
 
-	bits = c->x86_coreid_bits;
-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-	/* use socket ID also for last level cache */
+	/* Default LLC is at the die level. */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-	amd_get_topology(c);
-#endif
+
+	/*
+	 * We may have multiple LLCs if L3 caches exist, so check if we
+	 * have an L3 cache by looking at the L3 cache CPUID leaf.
+	 * For family17h, LLC is at the core complex level.
+	 * Core complex id is ApicId[3].
+	 */
+	if (cpuid_edx(0x80000006) && c->x86 == 0x17)
+		per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+
 }
+#endif
 
 u16 amd_get_nb_id(int cpu)
 {
@@ -412,7 +446,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 
 	node = numa_cpu_node(cpu);
 	if (node == NUMA_NO_NODE)
-		node = per_cpu(cpu_llc_id, cpu);
+		node = c->phys_proc_id;
 
 	/*
 	 * On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -457,26 +491,23 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 static void early_init_amd_mc(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	unsigned bits, ecx;
+	u32 threads_per_socket;
 
 	/* Multi core CPU? */
 	if (c->extended_cpuid_level < 0x80000008)
 		return;
 
-	ecx = cpuid_ecx(0x80000008);
-
-	c->x86_max_cores = (ecx & 0xff) + 1;
-
-	/* CPU telling us the core id bits shift? */
-	bits = (ecx >> 12) & 0xF;
-
-	/* Otherwise recompute */
-	if (bits == 0) {
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
-	}
+	/* Threads per socket */
+	threads_per_socket = (cpuid_ecx(0x80000008) & 0xff) + 1;
+	/* Threads per die */
+	c->x86_max_cores = threads_per_socket / nodes_per_socket;
 
-	c->x86_coreid_bits = bits;
+	/*
+	 * This is per socket, and should only be used to decode APIC ID,
+	 * which is needed on older systems where X86_FEATURE_TOPOEXT
+	 * is not supported.
+	 */
+	c->x86_coreid_bits = get_count_order(threads_per_socket);
 #endif
 }
 
@@ -765,11 +796,15 @@ static void init_amd(struct cpuinfo_x86 *c)
 
 	cpu_detect_cache_sizes(c);
 
-	/* Multi core CPU? */
+#ifdef CONFIG_SMP
 	if (c->extended_cpuid_level >= 0x80000008) {
-		amd_detect_cmp(c);
+		amd_get_topology(c);
 		srat_detect_node(c);
 	}
+#endif
+	/* Multi-die? */
+	if (nodes_per_socket > 1)
+		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
 
 #ifdef CONFIG_X86_32
 	detect_ht(c);
-- 
2.7.4