Re: [PATCH v10 2/5] powerpc/vdso: Prepare for switching VDSO to generic C implementation.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

On 08/05/2020 02:03 PM, Segher Boessenkool wrote:
Hi!

On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */

It cannot optimise it because it does not know shift < 32.  The code
below is incorrect for shift equal to 32, fwiw.

Is there a way to tell it?


+static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+	return (cycles - last) * mult;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#ifndef __powerpc64__
+static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
+{
+	u32 hi = ns >> 32;
+	u32 lo = ns;
+
+	lo >>= shift;
+	lo |= hi << (32 - shift);
+	hi >>= shift;


+	if (likely(hi == 0))
+		return lo;

Removing these two lines shouldn't change generated object code?  Or not
make it worse, at least.

I remember it made a noticeable difference, although I can't remember the details. See below with GCC 10.1. At least we see that with those two lines, GCC only sets up a 16-byte stack frame. Without those lines it sets up a 32-byte stack frame and seems to save some values for no reason.

With the two lines:

000006ac <__c_kernel_clock_gettime>:
 6ac:	28 03 00 0f 	cmplwi  r3,15
 6b0:	41 81 01 04 	bgt     7b4 <__c_kernel_clock_gettime+0x108>
 6b4:	39 40 00 01 	li      r10,1
 6b8:	7d 4a 18 30 	slw     r10,r10,r3
 6bc:	71 47 08 83 	andi.   r7,r10,2179
 6c0:	41 82 01 2c 	beq     7ec <__c_kernel_clock_gettime+0x140>
 6c4:	94 21 ff f0 	stwu    r1,-16(r1)
 6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
 6cc:	93 e1 00 0c 	stw     r31,12(r1)
 6d0:	7d 85 1a 14 	add     r12,r5,r3
 6d4:	80 05 00 00 	lwz     r0,0(r5)
 6d8:	70 06 00 01 	andi.   r6,r0,1
 6dc:	40 82 00 d4 	bne     7b0 <__c_kernel_clock_gettime+0x104>
 6e0:	7d 4d 42 e6 	mftbu   r10
 6e4:	7d 6c 42 e6 	mftb    r11
 6e8:	7c ed 42 e6 	mftbu   r7
 6ec:	7c 0a 38 40 	cmplw   r10,r7
 6f0:	40 82 ff f0 	bne     6e0 <__c_kernel_clock_gettime+0x34>
 6f4:	80 e5 00 0c 	lwz     r7,12(r5)
 6f8:	80 65 00 08 	lwz     r3,8(r5)
 6fc:	7c e7 58 10 	subfc   r7,r7,r11
 700:	81 65 00 18 	lwz     r11,24(r5)
 704:	7d 43 51 10 	subfe   r10,r3,r10
 708:	7f e7 58 16 	mulhwu  r31,r7,r11
 70c:	7d 4a 59 d6 	mullw   r10,r10,r11
 710:	7c e7 59 d6 	mullw   r7,r7,r11
 714:	80 6c 00 2c 	lwz     r3,44(r12)
 718:	81 6c 00 28 	lwz     r11,40(r12)
 71c:	7c e7 18 14 	addc    r7,r7,r3
 720:	7d 4a fa 14 	add     r10,r10,r31
 724:	80 65 00 1c 	lwz     r3,28(r5)
 728:	7d 4a 59 14 	adde    r10,r10,r11
 72c:	7c e7 1c 30 	srw     r7,r7,r3
 730:	21 63 00 20 	subfic  r11,r3,32
 734:	7d 43 1c 31 	srw.    r3,r10,r3
 738:	7d 4a 58 30 	slw     r10,r10,r11
 73c:	7d 49 3b 78 	or      r9,r10,r7
 740:	39 00 00 00 	li      r8,0
 744:	40 82 00 84 	bne     7c8 <__c_kernel_clock_gettime+0x11c>
 748:	80 6c 00 24 	lwz     r3,36(r12)
 74c:	81 45 00 00 	lwz     r10,0(r5)
 750:	7c 00 50 40 	cmplw   r0,r10
 754:	40 a2 ff 80 	bne     6d4 <__c_kernel_clock_gettime+0x28>
 758:	2c 08 00 00 	cmpwi   r8,0
 75c:	41 82 00 7c 	beq     7d8 <__c_kernel_clock_gettime+0x12c>
 760:	3c e0 c4 65 	lis     r7,-15259
 764:	3c 00 3b 9a 	lis     r0,15258
 768:	60 e7 36 00 	ori     r7,r7,13824
 76c:	60 00 c9 ff 	ori     r0,r0,51711
 770:	7c a9 38 14 	addc    r5,r9,r7
 774:	7d 48 01 d4 	addme   r10,r8
 778:	2c 0a 00 00 	cmpwi   r10,0
 77c:	7d 48 53 78 	mr      r8,r10
 780:	7c a9 2b 78 	mr      r9,r5
 784:	38 c6 00 01 	addi    r6,r6,1
 788:	40 82 ff e8 	bne     770 <__c_kernel_clock_gettime+0xc4>
 78c:	7c 05 00 40 	cmplw   r5,r0
 790:	41 81 ff e0 	bgt     770 <__c_kernel_clock_gettime+0xc4>
 794:	7c 66 18 14 	addc    r3,r6,r3
 798:	90 64 00 00 	stw     r3,0(r4)
 79c:	91 24 00 04 	stw     r9,4(r4)
 7a0:	38 60 00 00 	li      r3,0
 7a4:	83 e1 00 0c 	lwz     r31,12(r1)
 7a8:	38 21 00 10 	addi    r1,r1,16
 7ac:	4e 80 00 20 	blr
 7b0:	4b ff ff 24 	b       6d4 <__c_kernel_clock_gettime+0x28>
 7b4:	38 00 00 f6 	li      r0,246
 7b8:	44 00 00 02 	sc
 7bc:	40 a3 00 08 	bns     7c4 <__c_kernel_clock_gettime+0x118>
 7c0:	7c 63 00 d0 	neg     r3,r3
 7c4:	4e 80 00 20 	blr
 7c8:	7d 2a 4b 78 	mr      r10,r9
 7cc:	7c 68 1b 78 	mr      r8,r3
 7d0:	7d 49 53 78 	mr      r9,r10
 7d4:	4b ff ff 74 	b       748 <__c_kernel_clock_gettime+0x9c>
 7d8:	3d 40 3b 9a 	lis     r10,15258
 7dc:	61 4a c9 ff 	ori     r10,r10,51711
 7e0:	7c 09 50 40 	cmplw   r9,r10
 7e4:	41 81 ff 7c 	bgt     760 <__c_kernel_clock_gettime+0xb4>
 7e8:	4b ff ff b0 	b       798 <__c_kernel_clock_gettime+0xec>
 7ec:	71 47 00 60 	andi.   r7,r10,96
 7f0:	54 69 20 36 	rlwinm  r9,r3,4,0,27
 7f4:	7d 25 4a 14 	add     r9,r5,r9
 7f8:	40 82 00 14 	bne     80c <__c_kernel_clock_gettime+0x160>
 7fc:	71 4a 00 10 	andi.   r10,r10,16
 800:	41 a2 ff b4 	beq     7b4 <__c_kernel_clock_gettime+0x108>
 804:	38 a5 00 f0 	addi    r5,r5,240
 808:	4b ff fe bc 	b       6c4 <__c_kernel_clock_gettime+0x18>
 80c:	81 05 00 00 	lwz     r8,0(r5)
 810:	71 0a 00 01 	andi.   r10,r8,1
 814:	40 a2 ff f8 	bne     80c <__c_kernel_clock_gettime+0x160>
 818:	80 69 00 24 	lwz     r3,36(r9)
 81c:	81 49 00 2c 	lwz     r10,44(r9)
 820:	80 e5 00 00 	lwz     r7,0(r5)
 824:	7c 08 38 40 	cmplw   r8,r7
 828:	40 a2 ff e4 	bne     80c <__c_kernel_clock_gettime+0x160>
 82c:	90 64 00 00 	stw     r3,0(r4)
 830:	91 44 00 04 	stw     r10,4(r4)
 834:	38 60 00 00 	li      r3,0
 838:	4e 80 00 20 	blr


Without the two lines:

000006ac <__c_kernel_clock_gettime>:
 6ac:	28 03 00 0f 	cmplwi  r3,15
 6b0:	41 81 01 14 	bgt     7c4 <__c_kernel_clock_gettime+0x118>
 6b4:	39 20 00 01 	li      r9,1
 6b8:	7d 29 18 30 	slw     r9,r9,r3
 6bc:	71 2a 08 83 	andi.   r10,r9,2179
 6c0:	41 82 01 2c 	beq     7ec <__c_kernel_clock_gettime+0x140>
 6c4:	94 21 ff e0 	stwu    r1,-32(r1)
 6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
 6cc:	93 81 00 10 	stw     r28,16(r1)
 6d0:	93 a1 00 14 	stw     r29,20(r1)
 6d4:	93 c1 00 18 	stw     r30,24(r1)
 6d8:	93 e1 00 1c 	stw     r31,28(r1)
 6dc:	7c 65 1a 14 	add     r3,r5,r3
 6e0:	81 85 00 00 	lwz     r12,0(r5)
 6e4:	71 87 00 01 	andi.   r7,r12,1
 6e8:	40 82 00 d8 	bne     7c0 <__c_kernel_clock_gettime+0x114>
 6ec:	7d 2d 42 e6 	mftbu   r9
 6f0:	7c cc 42 e6 	mftb    r6
 6f4:	7d 4d 42 e6 	mftbu   r10
 6f8:	7c 09 50 40 	cmplw   r9,r10
 6fc:	40 82 ff f0 	bne     6ec <__c_kernel_clock_gettime+0x40>
 700:	83 83 00 28 	lwz     r28,40(r3)
 704:	83 a3 00 2c 	lwz     r29,44(r3)
 708:	81 65 00 08 	lwz     r11,8(r5)
 70c:	81 05 00 0c 	lwz     r8,12(r5)
 710:	83 c5 00 18 	lwz     r30,24(r5)
 714:	83 e5 00 1c 	lwz     r31,28(r5)
 718:	80 03 00 24 	lwz     r0,36(r3)
 71c:	81 45 00 00 	lwz     r10,0(r5)
 720:	7c 0c 50 40 	cmplw   r12,r10
 724:	40 a2 ff bc 	bne     6e0 <__c_kernel_clock_gettime+0x34>
 728:	7d 48 30 10 	subfc   r10,r8,r6
 72c:	7c cb 49 10 	subfe   r6,r11,r9
 730:	7c c6 f1 d6 	mullw   r6,r6,r30
 734:	7d 2a f0 16 	mulhwu  r9,r10,r30
 738:	7d 4a f1 d6 	mullw   r10,r10,r30
 73c:	7c c6 4a 14 	add     r6,r6,r9
 740:	7d 4a e8 14 	addc    r10,r10,r29
 744:	7c c6 e1 14 	adde    r6,r6,r28
 748:	7c c8 fc 30 	srw     r8,r6,r31
 74c:	2c 08 00 00 	cmpwi   r8,0
 750:	20 bf 00 20 	subfic  r5,r31,32
 754:	7d 4a fc 30 	srw     r10,r10,r31
 758:	7c c5 28 30 	slw     r5,r6,r5
 75c:	7c a9 53 78 	or      r9,r5,r10
 760:	41 82 00 78 	beq     7d8 <__c_kernel_clock_gettime+0x12c>
 764:	3c c0 c4 65 	lis     r6,-15259
 768:	3c 60 3b 9a 	lis     r3,15258
 76c:	60 c6 36 00 	ori     r6,r6,13824
 770:	60 63 c9 ff 	ori     r3,r3,51711
 774:	7c a9 30 14 	addc    r5,r9,r6
 778:	7d 48 01 d4 	addme   r10,r8
 77c:	2c 0a 00 00 	cmpwi   r10,0
 780:	7d 48 53 78 	mr      r8,r10
 784:	7c a9 2b 78 	mr      r9,r5
 788:	38 e7 00 01 	addi    r7,r7,1
 78c:	40 82 ff e8 	bne     774 <__c_kernel_clock_gettime+0xc8>
 790:	7c 05 18 40 	cmplw   r5,r3
 794:	41 81 ff e0 	bgt     774 <__c_kernel_clock_gettime+0xc8>
 798:	7c 07 00 14 	addc    r0,r7,r0
 79c:	90 04 00 00 	stw     r0,0(r4)
 7a0:	91 24 00 04 	stw     r9,4(r4)
 7a4:	38 60 00 00 	li      r3,0
 7a8:	83 81 00 10 	lwz     r28,16(r1)
 7ac:	83 a1 00 14 	lwz     r29,20(r1)
 7b0:	83 c1 00 18 	lwz     r30,24(r1)
 7b4:	83 e1 00 1c 	lwz     r31,28(r1)
 7b8:	38 21 00 20 	addi    r1,r1,32
 7bc:	4e 80 00 20 	blr
 7c0:	4b ff ff 20 	b       6e0 <__c_kernel_clock_gettime+0x34>
 7c4:	38 00 00 f6 	li      r0,246
 7c8:	44 00 00 02 	sc
 7cc:	40 a3 00 08 	bns     7d4 <__c_kernel_clock_gettime+0x128>
 7d0:	7c 63 00 d0 	neg     r3,r3
 7d4:	4e 80 00 20 	blr
 7d8:	3d 40 3b 9a 	lis     r10,15258
 7dc:	61 4a c9 ff 	ori     r10,r10,51711
 7e0:	7c 09 50 40 	cmplw   r9,r10
 7e4:	41 81 ff 80 	bgt     764 <__c_kernel_clock_gettime+0xb8>
 7e8:	4b ff ff b4 	b       79c <__c_kernel_clock_gettime+0xf0>
 7ec:	71 2a 00 60 	andi.   r10,r9,96
 7f0:	40 82 00 14 	bne     804 <__c_kernel_clock_gettime+0x158>
 7f4:	71 29 00 10 	andi.   r9,r9,16
 7f8:	41 a2 ff cc 	beq     7c4 <__c_kernel_clock_gettime+0x118>
 7fc:	38 a5 00 f0 	addi    r5,r5,240
 800:	4b ff fe c4 	b       6c4 <__c_kernel_clock_gettime+0x18>
 804:	54 69 20 36 	rlwinm  r9,r3,4,0,27
 808:	7d 25 4a 14 	add     r9,r5,r9
 80c:	81 05 00 00 	lwz     r8,0(r5)
 810:	71 0a 00 01 	andi.   r10,r8,1
 814:	40 82 00 28 	bne     83c <__c_kernel_clock_gettime+0x190>
 818:	80 09 00 24 	lwz     r0,36(r9)
 81c:	81 49 00 2c 	lwz     r10,44(r9)
 820:	80 e5 00 00 	lwz     r7,0(r5)
 824:	7c 08 38 40 	cmplw   r8,r7
 828:	40 a2 ff e4 	bne     80c <__c_kernel_clock_gettime+0x160>
 82c:	90 04 00 00 	stw     r0,0(r4)
 830:	91 44 00 04 	stw     r10,4(r4)
 834:	38 60 00 00 	li      r3,0
 838:	4e 80 00 20 	blr
 83c:	4b ff ff d0 	b       80c <__c_kernel_clock_gettime+0x160>



+	return ((u64)hi << 32) | lo;
+}


What does the compiler do for just

static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
{
	return ns >> (shift & 31);
}


Worse:

000006ac <__c_kernel_clock_gettime>:
 6ac:	28 03 00 0f 	cmplwi  r3,15
 6b0:	41 81 01 30 	bgt     7e0 <__c_kernel_clock_gettime+0x134>
 6b4:	39 20 00 01 	li      r9,1
 6b8:	7d 29 18 30 	slw     r9,r9,r3
 6bc:	71 2a 08 83 	andi.   r10,r9,2179
 6c0:	41 82 01 48 	beq     808 <__c_kernel_clock_gettime+0x15c>
 6c4:	94 21 ff e0 	stwu    r1,-32(r1)
 6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
 6cc:	93 81 00 10 	stw     r28,16(r1)
 6d0:	93 a1 00 14 	stw     r29,20(r1)
 6d4:	93 c1 00 18 	stw     r30,24(r1)
 6d8:	93 e1 00 1c 	stw     r31,28(r1)
 6dc:	7c 65 1a 14 	add     r3,r5,r3
 6e0:	80 c5 00 00 	lwz     r6,0(r5)
 6e4:	70 c7 00 01 	andi.   r7,r6,1
 6e8:	40 82 00 f4 	bne     7dc <__c_kernel_clock_gettime+0x130>
 6ec:	7d 2d 42 e6 	mftbu   r9
 6f0:	7d 0c 42 e6 	mftb    r8
 6f4:	7d 4d 42 e6 	mftbu   r10
 6f8:	7c 09 50 40 	cmplw   r9,r10
 6fc:	40 82 ff f0 	bne     6ec <__c_kernel_clock_gettime+0x40>
 700:	83 83 00 28 	lwz     r28,40(r3)
 704:	83 c3 00 2c 	lwz     r30,44(r3)
 708:	81 65 00 08 	lwz     r11,8(r5)
 70c:	81 45 00 0c 	lwz     r10,12(r5)
 710:	83 e5 00 18 	lwz     r31,24(r5)
 714:	81 85 00 1c 	lwz     r12,28(r5)
 718:	80 03 00 24 	lwz     r0,36(r3)
 71c:	83 a5 00 00 	lwz     r29,0(r5)
 720:	7c 06 e8 40 	cmplw   r6,r29
 724:	40 a2 ff bc 	bne     6e0 <__c_kernel_clock_gettime+0x34>
 728:	7d 0a 40 10 	subfc   r8,r10,r8
 72c:	7c cb 49 10 	subfe   r6,r11,r9
 730:	7c c6 f9 d6 	mullw   r6,r6,r31
 734:	7d 28 f8 16 	mulhwu  r9,r8,r31
 738:	7d 08 f9 d6 	mullw   r8,r8,r31
 73c:	55 8c 06 fe 	clrlwi  r12,r12,27
 740:	7f c8 f0 14 	addc    r30,r8,r30
 744:	7c c6 4a 14 	add     r6,r6,r9
 748:	7c c6 e1 14 	adde    r6,r6,r28
 74c:	34 6c ff e0 	addic.  r3,r12,-32
 750:	41 80 00 70 	blt     7c0 <__c_kernel_clock_gettime+0x114>
 754:	7c c9 1c 30 	srw     r9,r6,r3
 758:	39 00 00 00 	li      r8,0
 75c:	2c 08 00 00 	cmpwi   r8,0
 760:	41 82 00 94 	beq     7f4 <__c_kernel_clock_gettime+0x148>
 764:	3c c0 c4 65 	lis     r6,-15259
 768:	3c 60 3b 9a 	lis     r3,15258
 76c:	60 c6 36 00 	ori     r6,r6,13824
 770:	60 63 c9 ff 	ori     r3,r3,51711
 774:	7c a9 30 14 	addc    r5,r9,r6
 778:	7d 48 01 d4 	addme   r10,r8
 77c:	2c 0a 00 00 	cmpwi   r10,0
 780:	7d 48 53 78 	mr      r8,r10
 784:	7c a9 2b 78 	mr      r9,r5
 788:	38 e7 00 01 	addi    r7,r7,1
 78c:	40 82 ff e8 	bne     774 <__c_kernel_clock_gettime+0xc8>
 790:	7c 05 18 40 	cmplw   r5,r3
 794:	41 81 ff e0 	bgt     774 <__c_kernel_clock_gettime+0xc8>
 798:	7c 07 00 14 	addc    r0,r7,r0
 79c:	90 04 00 00 	stw     r0,0(r4)
 7a0:	91 24 00 04 	stw     r9,4(r4)
 7a4:	38 60 00 00 	li      r3,0
 7a8:	83 81 00 10 	lwz     r28,16(r1)
 7ac:	83 a1 00 14 	lwz     r29,20(r1)
 7b0:	83 c1 00 18 	lwz     r30,24(r1)
 7b4:	83 e1 00 1c 	lwz     r31,28(r1)
 7b8:	38 21 00 20 	addi    r1,r1,32
 7bc:	4e 80 00 20 	blr
 7c0:	54 c3 08 3c 	rlwinm  r3,r6,1,0,30
 7c4:	21 6c 00 1f 	subfic  r11,r12,31
 7c8:	7c 63 58 30 	slw     r3,r3,r11
 7cc:	7f c9 64 30 	srw     r9,r30,r12
 7d0:	7c 69 4b 78 	or      r9,r3,r9
 7d4:	7c c8 64 30 	srw     r8,r6,r12
 7d8:	4b ff ff 84 	b       75c <__c_kernel_clock_gettime+0xb0>
 7dc:	4b ff ff 04 	b       6e0 <__c_kernel_clock_gettime+0x34>
 7e0:	38 00 00 f6 	li      r0,246
 7e4:	44 00 00 02 	sc
 7e8:	40 a3 00 08 	bns     7f0 <__c_kernel_clock_gettime+0x144>
 7ec:	7c 63 00 d0 	neg     r3,r3
 7f0:	4e 80 00 20 	blr
 7f4:	3d 40 3b 9a 	lis     r10,15258
 7f8:	61 4a c9 ff 	ori     r10,r10,51711
 7fc:	7c 09 50 40 	cmplw   r9,r10
 800:	41 81 ff 64 	bgt     764 <__c_kernel_clock_gettime+0xb8>
 804:	4b ff ff 98 	b       79c <__c_kernel_clock_gettime+0xf0>
 808:	71 2a 00 60 	andi.   r10,r9,96
 80c:	40 82 00 14 	bne     820 <__c_kernel_clock_gettime+0x174>
 810:	71 29 00 10 	andi.   r9,r9,16
 814:	41 a2 ff cc 	beq     7e0 <__c_kernel_clock_gettime+0x134>
 818:	38 a5 00 f0 	addi    r5,r5,240
 81c:	4b ff fe a8 	b       6c4 <__c_kernel_clock_gettime+0x18>
 820:	54 69 20 36 	rlwinm  r9,r3,4,0,27
 824:	7d 25 4a 14 	add     r9,r5,r9
 828:	81 05 00 00 	lwz     r8,0(r5)
 82c:	71 0a 00 01 	andi.   r10,r8,1
 830:	40 82 00 28 	bne     858 <__c_kernel_clock_gettime+0x1ac>
 834:	80 09 00 24 	lwz     r0,36(r9)
 838:	81 49 00 2c 	lwz     r10,44(r9)
 83c:	80 e5 00 00 	lwz     r7,0(r5)
 840:	7c 08 38 40 	cmplw   r8,r7
 844:	40 a2 ff e4 	bne     828 <__c_kernel_clock_gettime+0x17c>
 848:	90 04 00 00 	stw     r0,0(r4)
 84c:	91 44 00 04 	stw     r10,4(r4)
 850:	38 60 00 00 	li      r3,0
 854:	4e 80 00 20 	blr
 858:	4b ff ff d0 	b       828 <__c_kernel_clock_gettime+0x17c>

Christophe



[Index of Archives]     [Linux Kernel]     [Kernel Newbies]     [x86 Platform Driver]     [Netdev]     [Linux Wireless]     [Netfilter]     [Bugtraq]     [Linux Filesystems]     [Yosemite Discussion]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]

  Powered by Linux