超线程是如何工作的 - 实验

2024-11-03 约 2898 字预计阅读 6 分钟

文章简介：通过一个实验理解超线程是如何工作的

背景

学习 CPU 超线程的时候，实践一下 plantegg 大佬的实验。实验大概是这样的：

写一个能把 IPC（instructions per cycle）跑到最高的程序（可以试试跑一段死循环的 IPC 能到多少）；再写一个能把 IPC 跑到最低的程序。然后用 perf 去看它们的 IPC，用 top 去看它们的 CPU 使用率。进一步，同时把这样的程序跑两份，并将它们绑到一对超线程上，然后再看它们的 IPC 以及 top。

nop.c

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133


/*
 * nop.c - micro-benchmark that tries to drive IPC as high as possible.
 *
 * Spins forever executing blocks of 128 `nop` instructions, so that the
 * loop's own jump is only a small fraction of the retired instructions.
 * The program never exits on its own; run it under `timeout` (for
 * example `perf stat timeout 10 ./nop`).
 */
int main(void) {
    for (;;) {
        /*
         * .rept/.endr ask the assembler to emit the enclosed instruction
         * 128 times, which yields the same instruction stream as writing
         * "nop" 128 times by hand; change the count here to experiment.
         * An asm statement without operands is already treated as
         * volatile by GCC; the qualifier only makes that explicit.
         */
        __asm__ __volatile__(
            ".rept 128\n\t"
            "nop\n\t"
            ".endr");
    }
}

pause.c

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133


/*
 * pause.c - micro-benchmark that tries to drive IPC as low as possible.
 *
 * Spins forever executing blocks of 128 x86 `pause` instructions, so that
 * the loop's own jump is only a small fraction of the retired
 * instructions. The program never exits on its own; run it under
 * `timeout` (for example `perf stat timeout 10 ./pause`).
 */
int main(void) {
    for (;;) {
        /*
         * .rept/.endr ask the assembler to emit the enclosed instruction
         * 128 times, which yields the same instruction stream as writing
         * "pause" 128 times by hand; change the count here to experiment.
         * An asm statement without operands is already treated as
         * volatile by GCC; the qualifier only makes that explicit.
         */
        __asm__ __volatile__(
            ".rept 128\n\t"
            "pause\n\t"
            ".endr");
    }
}

实验环境信息

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


root@localhost# cat /etc/redhat-release 
Rocky Linux release 8.10 (Green Obsidian)
root@localhost# uname -a
Linux localhost.localdomain 3.10.0-1160.el7.x86_64 #1 SMP Mon Oct 19 16:18:59 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
root@localhost# lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              80
On-line CPU(s) list: 0-79
Thread(s) per core:  2
Core(s) per socket:  20
Socket(s):           2
NUMA node(s):        2
Vendor ID:           GenuineIntel
BIOS Vendor ID:      Intel(R) Corporation
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz
BIOS Model name:     Intel(R) Xeon(R) Gold 5218R CPU @ 2.10GHz
Stepping:            7
CPU MHz:             799.932
CPU max MHz:         4000.0000
CPU min MHz:         800.0000
BogoMIPS:            4200.00
Virtualization:      VT-x
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            28160K
NUMA node0 CPU(s):   0-19,40-59
NUMA node1 CPU(s):   20-39,60-79
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch epb cat_l3 cdp_l3 invpcid_single intel_ppin intel_pt ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req pku ospke avx512_vnni md_clear spec_ctrl intel_stibp flush_l1d arch_capabilities

numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
node 0 size: 128410 MB
node 0 free: 52082 MB
node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
node 1 size: 128985 MB
node 1 free: 120673 MB
node distances:
node   0   1 
  0:  10  21 
  1:  21  10

lstopo-no-graphics --output-format svg > IntelRXeonGold5218R.svg

我的环境比较奇怪，宿主机的系统是 centos7.9，重装系统会相对麻烦一些。centos 已经 EOL，直接宿主机安装各种需要的软件、搭建编译环境相对麻烦，且我的电脑是 M1 arm，搭建交叉编译环境会涉及修改编译命令，入门门槛相对较高。所以尝试在 docker 中搭建运行、编译环境。环境如下：

编译运行环境

Dockerfile 如下，文件保存到 tools/Dockerfile.labs：

1
2
3
4


# Build/run environment for the experiments: Rocky Linux 8 image,
# explicitly requested for the linux/amd64 platform.
FROM --platform=linux/amd64 rockylinux/rockylinux:8

# Install the "Development Tools" package group, then the extra packages
# used by the experiments (among them numactl, hwloc, perf and tmux),
# with the "devel" repository enabled for that second install.
RUN dnf groupinstall -y 'Development Tools' \
    && dnf --enablerepo=devel -y install bison byacc flex patch glibc-static git libtirpc libtirpc-devel numactl numa* hwloc perf tmux

编译 docker 镜像

1

docker build -f tools/Dockerfile.labs -t cpulabs:dev .

只使用镜像中的 linux 发行版 rootfs，其他所有的隔离手段不使用，尽可能匹配宿主机裸机执行的环境：

1

docker run --name casestudy --rm -it --privileged --userns=host --network=host --pid=host -v $(pwd):/host -w /host cpulabs:dev bash

实验主体

测试 nop 指令

理论上 intel 4 条流水线并行，nop 执行需要一个时钟周期，cpu 完全跑 nop 指令的 IPC 大约是 4。接下来验证一下。

1

gcc ./nop.c -o nop

执行测试

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


perf stat timeout 10 ./nop

 Performance counter stats for 'timeout 10 ./nop':

           9977.71 msec task-clock                #    0.997 CPUs utilized          
                 8      context-switches          #    0.802 /sec                   
                 2      cpu-migrations            #    0.200 /sec                   
               511      page-faults               #   51.214 /sec                   
       38351130635      cycles                    #    3.844 GHz                    
      149695103720      instructions              #    3.90  insn per cycle         
        1168639909      branches                  #  117.125 M/sec                  
            465623      branch-misses             #    0.04% of all branches        

      10.002747608 seconds time elapsed

       9.977095000 seconds user
       0.001750000 seconds sys

现象：

IPC 跑到 3.90
CPUs util 0.997

问题：

IPC 跟 perf 执行多久基本无关，但是执行时间过短时，IPC 计数会小一点点。猜测可能是时间太短时，while 循环本身也需要执行，其跳转指令让指令流水线受阻；长时间的执行令分支预测更加准确，IPC 趋于稳定
- perf stat timeout 0.1 ./nop IPC 在 3.84
- perf stat timeout 1 ./nop IPC 在 3.90
IPC 是否可以接近 4?
- 据说 intel 流水线可达 4 条并行，理论值是 4；while 循环也会被编译成其他指令，会稀释一部分，导致 IPC 降低，可以考虑增加 asm 中 nop 的数量

测试 pause 指令

理论上 intel 共 4 条流水线并行，一个 pause 指令大概 140 时钟周期，4/140=0.02857142857142857，接下来验证一下。

1

gcc ./pause.c -o pause

执行测试

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


perf stat timeout 10 ./pause

 Performance counter stats for 'timeout 10 ./pause':

           9980.35 msec task-clock                #    0.997 CPUs utilized          
                 6      context-switches          #    0.601 /sec                   
                 3      cpu-migrations            #    0.301 /sec                   
               510      page-faults               #   51.100 /sec                   
       35500797866      cycles                    #    3.557 GHz                    
         929191714      instructions              #    0.03  insn per cycle         
          15430712      branches                  #    1.546 M/sec                  
            373753      branch-misses             #    2.42% of all branches        

      10.005685906 seconds time elapsed

       9.979564000 seconds user
       0.002129000 seconds sys

现象：

IPC 大约 0.03，估计是精度问题，与计算出的理论值接近。

测试 nop 指令跑两份各自绑定到同一物理核

先在 cat /proc/cpuinfo 的输出中找到属于同一物理核心的两个超线程，即寻找 physical id 相同且 core id 也相同的两个 processor 即可。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


perf stat timeout 3 taskset -c 0 ./nop &
perf stat timeout 3 taskset -c 40 ./nop

 Performance counter stats for 'taskset -c 40 ./nop':

           6668.00 msec task-clock                #    0.997 CPUs utilized          
                 2      context-switches          #    0.300 /sec                   
                 1      cpu-migrations            #    0.150 /sec                   
               343      page-faults               #   51.440 /sec                   
       21645830628      cycles                    #    3.246 GHz                    
       32415527643      instructions              #    1.50  insn per cycle         
         256383559      branches                  #   38.450 M/sec                  
            273069      branch-misses             #    0.11% of all branches        

       6.684805024 seconds time elapsed

       6.666732000 seconds user
       0.001996000 seconds sys

 Performance counter stats for 'timeout 3 taskset -c 40 ./nop':

           3000.46 msec task-clock                #    0.996 CPUs utilized          
                 6      context-switches          #    2.000 /sec                   
                 1      cpu-migrations            #    0.333 /sec                   
               740      page-faults               #  246.629 /sec                   
       10749571656      cycles                    #    3.583 GHz                    
       19580067188      instructions              #    1.82  insn per cycle         
         155065143      branches                  #   51.680 M/sec                  
            159560      branch-misses             #    0.10% of all branches        

       3.013625515 seconds time elapsed

       2.995919000 seconds user
       0.008434000 seconds sys

可以看到两个 nop 跑在相同物理核的两个超线程下，IPC 下降到原来的一半以下了。

(1.50+1.82)/(3.90*2)=0.42

测试 pause 指令跑两份各自绑定到同一物理核

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


perf stat timeout 3 taskset -c 0 ./pause &
perf stat timeout 3 taskset -c 40 ./pause

 Performance counter stats for 'timeout 3 taskset -c 40 ./pause':

           3000.71 msec task-clock                #    0.997 CPUs utilized          
                 5      context-switches          #    1.666 /sec                   
                 1      cpu-migrations            #    0.333 /sec                   
               740      page-faults               #  246.608 /sec                   
        9127896305      cycles                    #    3.042 GHz                    
         232869301      instructions              #    0.03  insn per cycle         
           5153047      branches                  #    1.717 M/sec                  
            149048      branch-misses             #    2.89% of all branches        

       3.008989678 seconds time elapsed

       2.996138000 seconds user
       0.005710000 seconds sys

 Performance counter stats for 'timeout 3 taskset -c 0 ./pause':

           3002.08 msec task-clock                #    0.997 CPUs utilized          
                 8      context-switches          #    2.665 /sec                   
                 1      cpu-migrations            #    0.333 /sec                   
               740      page-faults               #  246.496 /sec                   
        9036205346      cycles                    #    3.010 GHz                    
         228681050      instructions              #    0.03  insn per cycle         
           4910490      branches                  #    1.636 M/sec                  
            151013      branch-misses             #    3.08% of all branches        

       3.010698825 seconds time elapsed

       3.000847000 seconds user
       0.002973000 seconds sys

现象：

绑定在 0 号 CPU 上的 IPC 为 0.03，绑定到 40 号 CPU 上的 IPC 为 0.03，与原始跑一个 pause 时一样

测试 pause/nop 指令各自绑定到同一物理核

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


perf stat timeout 3 taskset -c 0 ./nop &
perf stat timeout 3 taskset -c 40 ./pause

 Performance counter stats for 'timeout 3 taskset -c 0 ./nop':

           3004.09 msec task-clock                #    0.997 CPUs utilized          
                 9      context-switches          #    2.996 /sec                   
                 1      cpu-migrations            #    0.333 /sec                   
               741      page-faults               #  246.663 /sec                   
        8734459274      cycles                    #    2.908 GHz                    
       27382699175      instructions              #    3.14  insn per cycle         
         215882285      branches                  #   71.863 M/sec                  
            170322      branch-misses             #    0.08% of all branches        

       3.013204769 seconds time elapsed

       2.999945000 seconds user
       0.006140000 seconds sys

 Performance counter stats for 'timeout 3 taskset -c 40 ./pause':

           2997.37 msec task-clock                #    0.997 CPUs utilized          
                 5      context-switches          #    1.668 /sec                   
                 1      cpu-migrations            #    0.334 /sec                   
               738      page-faults               #  246.216 /sec                   
        8791793294      cycles                    #    2.933 GHz                    
         219616224      instructions              #    0.02  insn per cycle         
           5261702      branches                  #    1.755 M/sec                  
            144141      branch-misses             #    2.74% of all branches        

       3.005700376 seconds time elapsed

       2.992557000 seconds user
       0.006074000 seconds sys

现象：

计算当前两个超线程的 IPC 之和，与它们各自独占一个物理核心时的 IPC 之和相比：(3.14+0.02)/(3.90+0.03) = 0.80，比同时跑两个 nop 时的 0.42 提高明显。

待解答问题

为什么死循环里要写这么多 pause/nop？写少了 IPC 会降低，这是什么因素导致的呢？

有什么方式可以查看当前 CPU 的流水线长度吗？

TODO:

总结

学习了 perf 命令监控程序性能
学习了 taskset 命令控制绑核
学习了超线程相关信息

上面学到的这些内容都还需要后续深入研究。

目录