Re: linux-next: manual merge of the nvdimm tree with the tip tree

Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx> · Sun, 19 Aug 2018 10:34:15 +1000

Hi all,

On Tue, 26 Jun 2018 12:18:53 +1000 Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx> wrote:
>
> Today's linux-next merge of the nvdimm tree got a conflict in:
> 
>   arch/x86/kernel/cpu/mcheck/mce.c
> 
> between commit:
> 
>   d3d6923cd1ae ("x86/mce: Carve out the crashing_cpu check")
> 
> from the tip tree and commit:
> 
>   f6785eac562b ("x86/memory_failure: Introduce {set,clear}_mce_nospec()")
> 
> from the nvdimm tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non-trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> -- 
> Cheers,
> Stephen Rothwell
> 
> diff --cc arch/x86/kernel/cpu/mcheck/mce.c
> index 9a16f15f79eb,a0fbf0a8b7e6..000000000000
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@@ -1076,129 -1070,6 +1072,100 @@@ static int do_memory_failure(struct mc
>   	return ret;
>   }
>   
> - #ifndef mce_unmap_kpfn
> - static void mce_unmap_kpfn(unsigned long pfn)
> - {
> - 	unsigned long decoy_addr;
> - 
> - 	/*
> - 	 * Unmap this page from the kernel 1:1 mappings to make sure
> - 	 * we don't log more errors because of speculative access to
> - 	 * the page.
> - 	 * We would like to just call:
> - 	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
> - 	 * but doing that would radically increase the odds of a
> - 	 * speculative access to the poison page because we'd have
> - 	 * the virtual address of the kernel 1:1 mapping sitting
> - 	 * around in registers.
> - 	 * Instead we get tricky.  We create a non-canonical address
> - 	 * that looks just like the one we want, but has bit 63 flipped.
> - 	 * This relies on set_memory_np() not checking whether we passed
> - 	 * a legal address.
> - 	 */
> - 
> - 	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
> - 
> - 	if (set_memory_np(decoy_addr, 1))
> - 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
> - }
> - #endif
> - 
> - 
>  +/*
>  + * Cases where we avoid rendezvous handler timeout:
>  + * 1) If this CPU is offline.
>  + *
>  + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
>  + *  skip those CPUs which remain looping in the 1st kernel - see
>  + *  crash_nmi_callback().
>  + *
>  + * Note: there still is a small window between kexec-ing and the new,
>  + * kdump kernel establishing a new #MC handler where a broadcasted MCE
>  + * might not get handled properly.
>  + */
>  +static bool __mc_check_crashing_cpu(int cpu)
>  +{
>  +	if (cpu_is_offline(cpu) ||
>  +	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
>  +		u64 mcgstatus;
>  +
>  +		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
>  +		if (mcgstatus & MCG_STATUS_RIPV) {
>  +			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
>  +			return true;
>  +		}
>  +	}
>  +	return false;
>  +}
>  +
>  +static void __mc_scan_banks(struct mce *m, struct mce *final,
>  +			    unsigned long *toclear, unsigned long *valid_banks,
>  +			    int no_way_out, int *worst)
>  +{
>  +	struct mca_config *cfg = &mca_cfg;
>  +	int severity, i;
>  +
>  +	for (i = 0; i < cfg->banks; i++) {
>  +		__clear_bit(i, toclear);
>  +		if (!test_bit(i, valid_banks))
>  +			continue;
>  +
>  +		if (!mce_banks[i].ctl)
>  +			continue;
>  +
>  +		m->misc = 0;
>  +		m->addr = 0;
>  +		m->bank = i;
>  +
>  +		m->status = mce_rdmsrl(msr_ops.status(i));
>  +		if (!(m->status & MCI_STATUS_VAL))
>  +			continue;
>  +
>  +		/*
>  +		 * Corrected or non-signaled errors are handled by
>  +		 * machine_check_poll(). Leave them alone, unless this panics.
>  +		 */
>  +		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
>  +			!no_way_out)
>  +			continue;
>  +
>  +		/* Set taint even when machine check was not enabled. */
>  +		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
>  +
>  +		severity = mce_severity(m, cfg->tolerant, NULL, true);
>  +
>  +		/*
>  +		 * When machine check was for corrected/deferred handler don't
>  +		 * touch, unless we're panicking.
>  +		 */
>  +		if ((severity == MCE_KEEP_SEVERITY ||
>  +		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
>  +			continue;
>  +
>  +		__set_bit(i, toclear);
>  +
>  +		/* Machine check event was not enabled. Clear, but ignore. */
>  +		if (severity == MCE_NO_SEVERITY)
>  +			continue;
>  +
>  +		mce_read_aux(m, i);
>  +
>  +		/* assuming valid severity level != 0 */
>  +		m->severity = severity;
>  +
>  +		mce_log(m);
>  +
>  +		if (severity > *worst) {
>  +			*final = *m;
>  +			*worst = severity;
>  +		}
>  +	}
>  +
>  +	/* mce_clear_state will clear *final, save locally for use later */
>  +	*m = *final;
>  +}
>  +
>   /*
>    * The actual machine check handler. This only handles real
>    * exceptions when something got corrupted coming in through int 18.

This is now a conflict between Linus' tree and the nvdimm tree.

-- 
Cheers,
Stephen Rothwell
Attachment:
pgpU2G3CkDuUO.pgp

Description: OpenPGP digital signature