D1 Performance Testing 1: test_memcpy_std, test_memcpy_rvv



  • Thanks @BruceHoult

    Test of standard glibc memcpy() vs RISC-V Vector extension version on the
    Allwinner D1 at 1.0 GHz.

    Sadly, this chip implements RVV draft version 0.7.1, which is incompatible with the current 1.0 draft at both the assembly-language and binary levels.

    However, it shows the promise of this ISA extension.

    [email protected]:~$ gcc -O test_memcpy.c rvv_lib.o -o test_memcpy_rvv
    [email protected]:~$ gcc -O test_memcpy.c -o test_memcpy_std
    [email protected]:~$ ./test_memcpy_std
    Byte size : ns Speed
    0 : 50.3 0.0 MB/s
    1 : 54.8 17.4 MB/s
    2 : 61.6 31.0 MB/s
    4 : 71.6 53.3 MB/s
    8 : 91.6 83.3 MB/s
    16 : 93.7 162.9 MB/s
    32 : 99.7 306.2 MB/s
    64 : 111.6 546.8 MB/s
    128 : 140.5 868.5 MB/s
    256 : 198.4 1230.6 MB/s
    512 : 314.0 1554.9 MB/s
    1024 : 551.7 1770.0 MB/s
    2048 : 1011.4 1931.1 MB/s
    4096 : 1937.8 2015.8 MB/s
    8192 : 3795.8 2058.2 MB/s
    16384 : 8336.3 1874.3 MB/s
    32768 : 20937.3 1492.5 MB/s
    65536 : 58882.3 1061.4 MB/s
    131072 : 113748.5 1098.9 MB/s
    262144 : 225554.1 1108.4 MB/s
    524288 : 446150.4 1120.7 MB/s
    1048576 : 927754.9 1077.9 MB/s
    2097152 : 1849499.0 1081.4 MB/s
    4194304 : 3666302.7 1091.0 MB/s
    8388608 : 7309773.4 1094.4 MB/s
    16777216 : 14528070.3 1101.3 MB/s
    33554432 : 28922562.5 1106.4 MB/s
    67108864 : 57848562.5 1106.3 MB/s

    [email protected]:~$ ./test_memcpy_rvv
    Byte size : ns Speed
    0 : 30.9 0.0 MB/s
    1 : 30.9 30.9 MB/s
    2 : 30.9 61.8 MB/s
    4 : 30.9 123.5 MB/s
    8 : 30.9 247.1 MB/s
    16 : 30.9 494.1 MB/s
    32 : 30.9 988.2 MB/s
    64 : 30.9 1976.2 MB/s
    128 : 46.8 2606.9 MB/s
    256 : 78.8 3100.0 MB/s
    512 : 142.5 3426.4 MB/s
    1024 : 271.1 3601.9 MB/s
    2048 : 526.4 3710.5 MB/s
    4096 : 1041.7 3749.8 MB/s
    8192 : 2071.6 3771.2 MB/s
    16384 : 5127.2 3047.4 MB/s
    32768 : 18495.6 1689.6 MB/s
    65536 : 56765.8 1101.0 MB/s
    131072 : 112453.9 1111.6 MB/s
    262144 : 223401.5 1119.1 MB/s
    524288 : 445860.8 1121.4 MB/s
    1048576 : 921998.0 1084.6 MB/s
    2097152 : 1844350.6 1084.4 MB/s
    4194304 : 3664435.5 1091.6 MB/s
    8388608 : 7282343.8 1098.5 MB/s
    16777216 : 14460382.8 1106.5 MB/s
    33554432 : 28772437.5 1112.2 MB/s
    67108864 : 57382062.5 1115.3 MB/s

    [email protected]:~$ objdump -d rvv_lib.o

    rvv_lib.o: file format elf64-littleriscv

    Disassembly of section .text:

    0000000000000000 <memcpy>:
    0: 86aa mv a3,a0

    0000000000000002 <.L1^B1>:
    2: 00267757 vsetvli a4,a2,e8,m4,d1
    6: 12058007 vlb.v v0,(a1)
    a: 95ba add a1,a1,a4
    c: 8e19 sub a2,a2,a4
    e: 02068027 vsb.v v0,(a3)
    12: 96ba add a3,a3,a4
    14: f67d bnez a2,2 <.L1^B1>
    16: 8082 ret

    The following is the standard glibc memcpy() code. Its length and complexity are
    quite a contrast with the RISC-V Vector code above: 622 bytes (memcpy plus its two
    _wordcopy helper routines) vs 24 bytes.

    0000000000025fde <memcpy>:
    25fde: 7179 addi sp,sp,-48
    25fe0: f022 sd s0,32(sp)
    25fe2: e84a sd s2,16(sp)
    25fe4: f406 sd ra,40(sp)
    25fe6: ec26 sd s1,24(sp)
    25fe8: e44e sd s3,8(sp)
    25fea: 47bd li a5,15
    25fec: 892a mv s2,a0
    25fee: 872a mv a4,a0
    25ff0: 842e mv s0,a1
    25ff2: 04c7f463 bgeu a5,a2,2603a <memcpy+0x5c>
    25ff6: 40a00533 neg a0,a0
    25ffa: 891d andi a0,a0,7
    25ffc: 40a604b3 sub s1,a2,a0
    26000: c53d beqz a0,2606e <memcpy+0x90>
    26002: 012509b3 add s3,a0,s2
    26006: 87ae mv a5,a1
    26008: 0007c683 lbu a3,0(a5)
    2600c: 0705 addi a4,a4,1
    2600e: 0785 addi a5,a5,1
    26010: fed70fa3 sb a3,-1(a4)
    26014: fee99ae3 bne s3,a4,26008 <memcpy+0x2a>
    26018: 00a58433 add s0,a1,a0
    2601c: 00747793 andi a5,s0,7
    26020: 0034d613 srli a2,s1,0x3
    26024: 85a2 mv a1,s0
    26026: 854e mv a0,s3
    26028: e3a1 bnez a5,26068 <memcpy+0x8a>
    2602a: 048000ef jal ra,26072 <_wordcopy_fwd_aligned>
    2602e: ff84f713 andi a4,s1,-8
    26032: 943a add s0,s0,a4
    26034: 0074f613 andi a2,s1,7
    26038: 974e add a4,a4,s3
    2603a: 177d addi a4,a4,-1
    2603c: 87a2 mv a5,s0
    2603e: 008605b3 add a1,a2,s0
    26042: 8f01 sub a4,a4,s0
    26044: ca11 beqz a2,26058 <memcpy+0x7a>
    26046: 0007c603 lbu a2,0(a5)
    2604a: 0785 addi a5,a5,1
    2604c: 00f706b3 add a3,a4,a5
    26050: 00c68023 sb a2,0(a3)
    26054: fef599e3 bne a1,a5,26046 <memcpy+0x68>
    26058: 70a2 ld ra,40(sp)
    2605a: 7402 ld s0,32(sp)
    2605c: 64e2 ld s1,24(sp)
    2605e: 69a2 ld s3,8(sp)
    26060: 854a mv a0,s2
    26062: 6942 ld s2,16(sp)
    26064: 6145 addi sp,sp,48
    26066: 8082 ret
    26068: 0f0000ef jal ra,26158 <_wordcopy_fwd_dest_aligned>
    2606c: b7c9 j 2602e <memcpy+0x50>
    2606e: 89ca mv s3,s2
    26070: b775 j 2601c <memcpy+0x3e>

    0000000000026072 <_wordcopy_fwd_aligned>:
    26072: 00767793 andi a5,a2,7
    26076: 0002c717 auipc a4,0x2c
    2607a: 0f270713 addi a4,a4,242 # 52168 <PRETTY_FUNCTION.0+0x70>
    2607e: 078a slli a5,a5,0x2
    26080: 97ba add a5,a5,a4
    26082: 439c lw a5,0(a5)
    26084: 97ba add a5,a5,a4
    26086: 8782 jr a5
    26088: 6198 ld a4,0(a1)
    2608a: ff050793 addi a5,a0,-16
    2608e: 15e1 addi a1,a1,-8
    26090: 0605 addi a2,a2,1
    26092: 0105b803 ld a6,16(a1)
    26096: 01878693 addi a3,a5,24
    2609a: e118 sd a4,0(a0)
    2609c: 6d98 ld a4,24(a1)
    2609e: 02078513 addi a0,a5,32
    260a2: 0106b023 sd a6,0(a3)
    260a6: 0205b803 ld a6,32(a1)
    260aa: 02878693 addi a3,a5,40
    260ae: e118 sd a4,0(a0)
    260b0: 7598 ld a4,40(a1)
    260b2: 03078513 addi a0,a5,48
    260b6: 0106b023 sd a6,0(a3)
    260ba: 0305b803 ld a6,48(a1)
    260be: 03878693 addi a3,a5,56
    260c2: e118 sd a4,0(a0)
    260c4: 7d98 ld a4,56(a1)
    260c6: 04078793 addi a5,a5,64
    260ca: 0106b023 sd a6,0(a3)
    260ce: 1661 addi a2,a2,-8
    260d0: 853e mv a0,a5
    260d2: 04058593 addi a1,a1,64
    260d6: ce01 beqz a2,260ee <_wordcopy_fwd_aligned+0x7c>
    260d8: 0005b803 ld a6,0(a1)
    260dc: 00878693 addi a3,a5,8
    260e0: e118 sd a4,0(a0)
    260e2: 6598 ld a4,8(a1)
    260e4: 01078513 addi a0,a5,16
    260e8: 0106b023 sd a6,0(a3)
    260ec: b75d j 26092 <_wordcopy_fwd_aligned+0x20>
    260ee: e118 sd a4,0(a0)
    260f0: 8082 ret
    260f2: 167d addi a2,a2,-1
    260f4: 6198 ld a4,0(a1)
    260f6: de65 beqz a2,260ee <_wordcopy_fwd_aligned+0x7c>
    260f8: 05a1 addi a1,a1,8
    260fa: 87aa mv a5,a0
    260fc: bff1 j 260d8 <_wordcopy_fwd_aligned+0x66>
    260fe: 0005b803 ld a6,0(a1)
    26102: 86aa mv a3,a0
    26104: fd058593 addi a1,a1,-48
    26108: fc850793 addi a5,a0,-56
    2610c: 0619 addi a2,a2,6
    2610e: bf5d j 260c4 <_wordcopy_fwd_aligned+0x52>
    26110: 6198 ld a4,0(a1)
    26112: fd050793 addi a5,a0,-48
    26116: fd858593 addi a1,a1,-40
    2611a: 0615 addi a2,a2,5
    2611c: bf79 j 260ba <_wordcopy_fwd_aligned+0x48>
    2611e: 0005b803 ld a6,0(a1)
    26122: 86aa mv a3,a0
    26124: 1581 addi a1,a1,-32
    26126: fd850793 addi a5,a0,-40
    2612a: 0611 addi a2,a2,4
    2612c: b751 j 260b0 <_wordcopy_fwd_aligned+0x3e>
    2612e: 6198 ld a4,0(a1)
    26130: fe050793 addi a5,a0,-32
    26134: 15a1 addi a1,a1,-24
    26136: 060d addi a2,a2,3
    26138: b7bd j 260a6 <_wordcopy_fwd_aligned+0x34>
    2613a: 0005b803 ld a6,0(a1)
    2613e: 86aa mv a3,a0
    26140: 15c1 addi a1,a1,-16
    26142: fe850793 addi a5,a0,-24
    26146: 0609 addi a2,a2,2
    26148: bf91 j 2609c <_wordcopy_fwd_aligned+0x2a>
    2614a: d25d beqz a2,260f0 <_wordcopy_fwd_aligned+0x7e>
    2614c: 0005b803 ld a6,0(a1)
    26150: 86aa mv a3,a0
    26152: ff850793 addi a5,a0,-8
    26156: b771 j 260e2 <_wordcopy_fwd_aligned+0x70>

    0000000000026158 <_wordcopy_fwd_dest_aligned>:
    26158: 0075f713 andi a4,a1,7
    2615c: 0037179b slliw a5,a4,0x3
    26160: 00371313 slli t1,a4,0x3
    26164: 00367693 andi a3,a2,3
    26168: 04000713 li a4,64
    2616c: 4809 li a6,2
    2616e: 40f707bb subw a5,a4,a5
    26172: 99e1 andi a1,a1,-8
    26174: 0b068663 beq a3,a6,26220 <_wordcopy_fwd_dest_aligned+0xc8>
    26178: 470d li a4,3
    2617a: 02e68363 beq a3,a4,261a0 <_wordcopy_fwd_dest_aligned+0x48>
    2617e: 4705 li a4,1
    26180: 00e68463 beq a3,a4,26188 <_wordcopy_fwd_dest_aligned+0x30>
    26184: ea5d bnez a2,2623a <_wordcopy_fwd_dest_aligned+0xe2>
    26186: 8082 ret
    26188: 167d addi a2,a2,-1
    2618a: 6198 ld a4,0(a1)
    2618c: 0085b803 ld a6,8(a1)
    26190: e24d bnez a2,26232 <_wordcopy_fwd_dest_aligned+0xda>
    26192: 00675733 srl a4,a4,t1
    26196: 00f817b3 sll a5,a6,a5
    2619a: 8f5d or a4,a4,a5
    2619c: e118 sd a4,0(a0)
    2619e: 8082 ret
    261a0: 6198 ld a4,0(a1)
    261a2: 6594 ld a3,8(a1)
    261a4: 88ae mv a7,a1
    261a6: 0605 addi a2,a2,1
    261a8: ff050593 addi a1,a0,-16
    261ac: a03d j 261da <_wordcopy_fwd_dest_aligned+0x82>
    261ae: 006756b3 srl a3,a4,t1
    261b2: 00f81733 sll a4,a6,a5
    261b6: 8ed9 or a3,a3,a4
    261b8: 00858e13 addi t3,a1,8
    261bc: 0008b703 ld a4,0(a7) # fffffffffffff000 <BSS_END+0xfffffffff7f8a680>
    261c0: e114 sd a3,0(a0)
    261c2: 00f716b3 sll a3,a4,a5
    261c6: 00685833 srl a6,a6,t1
    261ca: 00d86833 or a6,a6,a3
    261ce: 01058513 addi a0,a1,16
    261d2: 0088b683 ld a3,8(a7)
    261d6: 010e3023 sd a6,0(t3)
    261da: 00675733 srl a4,a4,t1
    261de: 00f69833 sll a6,a3,a5
    261e2: 01076833 or a6,a4,a6
    261e6: 01858e13 addi t3,a1,24
    261ea: 0108b703 ld a4,16(a7)
    261ee: 01053023 sd a6,0(a0)
    261f2: 00f71533 sll a0,a4,a5
    261f6: 0066d6b3 srl a3,a3,t1
    261fa: 8ec9 or a3,a3,a0
    261fc: 0188b803 ld a6,24(a7)
    26200: 02058593 addi a1,a1,32
    26204: 00de3023 sd a3,0(t3)
    26208: 1671 addi a2,a2,-4
    2620a: 852e mv a0,a1
    2620c: 02088893 addi a7,a7,32
    26210: fe59 bnez a2,261ae <_wordcopy_fwd_dest_aligned+0x56>
    26212: 00675733 srl a4,a4,t1
    26216: 00f817b3 sll a5,a6,a5
    2621a: 8f5d or a4,a4,a5
    2621c: e118 sd a4,0(a0)
    2621e: b741 j 2619e <_wordcopy_fwd_dest_aligned+0x46>
    26220: 6194 ld a3,0(a1)
    26222: 6598 ld a4,8(a1)
    26224: ff858893 addi a7,a1,-8
    26228: 8e2a mv t3,a0
    2622a: fe850593 addi a1,a0,-24
    2622e: 0609 addi a2,a2,2
    26230: b7c9 j 261f2 <_wordcopy_fwd_dest_aligned+0x9a>
    26232: 01058893 addi a7,a1,16
    26236: 85aa mv a1,a0
    26238: bf9d j 261ae <_wordcopy_fwd_dest_aligned+0x56>
    2623a: 0005b803 ld a6,0(a1)
    2623e: 6598 ld a4,8(a1)
    26240: 00858893 addi a7,a1,8
    26244: 8e2a mv t3,a0
    26246: ff850593 addi a1,a0,-8
    2624a: bfa5 j 261c2 <_wordcopy_fwd_dest_aligned+0x6a>



  • @traits 你是推特上那个 @YunhaiShang ?:grin:



  • 哈哈,我们国外用户的测试,还测试了RVV的访存性能


Log in to reply