本文通过libbpf-bootstrap中一个实例来展示如何在BPF中使用全局变量来控制程序流以及全局变量的实现原理。
先看一下大神对BPF全局变量的概括
| BPF global variables look and behave exactly like a user-space variables: they can be used in expressions, updated (the non-const ones), you can even take their address and pass around into helper functions. But that is only true for the BPF code side. From user-space, they can be read and updated only through BPF skeleton: · skel->rodata for read-only variables; · skel->bss for mutable zero-initialized variables; · skel->data for non-zero-initialized mutable variables. You can still read/update them from user-space and those updates will be immediately reflected on the BPF side. But they are not global variables on the user-space side, they are just members of BPF skeleton’s rodata, bss, or data members, which are initialized during the skeleton load phase. | 
Example
通过libbpf-bootstrap中minimal application来解释BPF是如何使用全局变量。
my_pid是在BPF prog minimal.bpf.c中声明的一个全局变量;编译时放置在.bss段中;
BPF prog挂载在tracepoint sys_enter_write上,截获进入write()的系统调用,并获取当前上下文的pid,若pid与my_pid相同,则输出日志信息。
Application运行流程是:
open BPF application -> config my_pid -> load & verify BPF -> attach tracepoint -> trigger BPF prog
Code
| User space | 
| int main(int argc, char **argv) { struct minimal_bpf *skel; int err; 
 /* Open BPF application */ skel = minimal_bpf__open(); if (!skel) { fprintf(stderr, "Failed to open BPF skeleton\n"); return 1; } 
 /* ensure BPF program only handles write() syscalls from our process */ skel->bss->my_pid = getpid(); 
 /* Load & verify BPF programs */ err = minimal_bpf__load(skel); if (err) { fprintf(stderr, "Failed to load and verify BPF skeleton\n"); goto cleanup; } 
 /* Attach tracepoint handler */ err = minimal_bpf__attach(skel); if (err) { fprintf(stderr, "Failed to attach BPF skeleton\n"); goto cleanup; } 
 printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` " "to see output of the BPF programs.\n"); 
 for (;;) { /* trigger our BPF program */ fprintf(stderr, "."); sleep(1); } | 
| BPF prog | 
| // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Copyright (c) 2020 Facebook */ 
					#include <linux/bpf.h>
					#include <bpf/bpf_helpers.h>
 char LICENSE[] SEC("license") = "Dual BSD/GPL"; 
 int my_pid = 0; 
 SEC("tp/syscalls/sys_enter_write") int handle_tp(void *ctx) { int pid = bpf_get_current_pid_tgid() >> 32; 
 if (pid != my_pid) return 0; 
 bpf_printk("BPF triggered from PID %d.\n", pid); 
 return 0; } | 
Relocatable symbols in ELF
	参照c 代码来解读汇编指令, 
						minimal.bpf.o:                 file format elf64-bpf
					 
						 
					 
						Disassembly of section tp/syscalls/sys_enter_write:
					 
						 
					 
						0000000000000000 <handle_tp>:
						       0:           85 00 00 00 0e 00 00 00     call 14    /* bpf_get_current_pid_tgid */
					 
						       1:           77 00 00 00 20 00 00 00     r0 >>= 32    /* store returned pid value */
					 
						       2:           18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00         r1 = 0 ll
					 
						                         0000000000000010:  R_BPF_64_64       my_pid
					 
						       4:           61 11 00 00 00 00 00 00     r1 = *(u32 *)(r1 + 0)
					 
						       5:           5d 01 05 00 00 00 00 00     if r1 != r0 goto +5 <LBB0_2>
						       6:           18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00         r1 = 0 ll
					 
						                         0000000000000030:  R_BPF_64_64       .rodata
					 
						       8:           b7 02 00 00 1c 00 00 00     r2 = 28
					 
						       9:           bf 03 00 00 00 00 00 00     r3 = r0
					 
						      10:          85 00 00 00 06 00 00 00     call 6
					 
						 
					 
						0000000000000058 <LBB0_2>:
						      11:          b7 00 00 00 00 00 00 00     r0 = 0
					 
						      12:          95 00 00 00 00 00 00 00     exit
					 
						int handle_tp(void *ctx)
					 
						{
					 
						                  int pid = bpf_get_current_pid_tgid() >> 32;
					 
						 
					 
						                  if (pid != my_pid)
					 
						                                    return 0;
					 
						 
					 
						                  bpf_printk("BPF triggered from PID %d.\n", pid);
					 
						 
					 
						                  return 0;
					 
						}
					 
		insn #0:  call bpf_get_current_pid_tgid();
	 
		insn #1: 返回值右移32bit, 结果存入r0;
	 
		insn #5: 比较pid 和 my_pid;
	 
		 
	 
		那么可以推测 insn #2和insn #4与 全局变量my_pid有关;
	 
		insn#2是两个BPF指令的长度;它是一条LD_imm64指令,BPF loader在load之前会修改 insn #2指令;
	 
		 
	 
		 要修改insn#2指令需要借助tp/syscalls/sys_enter_write的重定位表
	 
						typedef struct
					 
						{
					 
						  Elf64_Addr    r_offset;  // Offset from the beginning of section.
					 
						  Elf64_Xword   r_info;    // Relocation type and symbol index.
					 
						} Elf64_Rel;
					 
		可以看到有两条重定位信息,OFFSET表示在段内的偏移;
	 
		my_pid重定位信息显示,段内偏移(r_offset)是0x10,即第3条指令(insn #2), relocation type是 1(R_BPF_64_64),对应符号表的index是 6 (符号my_pid,位于.bss段); my_pid的初始值为0;
	 
		 
	 
		BPF prog如何引用Global variable?
	 
		 
	 
		全局变量是通过bpf map加载到kernel,map类型为BPF_MAP_TYPE_ARRAY
	 
		·      open phase,BPF loader(libbpf)分析 BPF object file,会发现global variables 并初始化一个map结构,但这时并不创建map;
	 
		·      Load phase,创建map,接着会处理所有包含ELF relocation的prog section:修改对应的BPF insn(ld_imm64),将map fd 以立即数的方式编码到指令中,并由该指令load到reg。这样内核就可以通过fd找到内核态的map指针。
	 
		创建对应 .data/.rodata/.bss section的map结构,填充初始化信息;
	 
		bpf_object__init_maps
	 
		->bpf_object__init_global_data_maps
	 
		->bpf_object__init_internal_map()
	 
		 
	 
		收集bpf prog中重定位信息,本文例子中对应的是 解析.reltp/syscalls/sys_enter_write section的内容,并记录在prog->reloc_desc;
	 
		 
	 
		bpf_object__collect_relos
	 
		-> bpf_object__collect_prog_relos
	 
		            -> bpf_program__record_reloc  //  记录重定位描述信息
	 
		 
	 
		创建/更新 bss map;更新bss段内的变量的实现后面章节有介绍。
	 
		 
	 
		bpf_object__create_maps
	 
		-> bpf_object__create_map
	 
		-> bpf_object__populate_internal_map()
	 
		            -> bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
	 
		在加载bpf prog到内核之前,loader将 map fd编码在指令中;
	 
		 
	 
		bpf_object__relocate()
	 
		-> bpf_object__relocate_data()
	 
		            -> case RELO_DATA:
	 
		                        insn[1].imm = insn[0].imm + relo->sym_off;
	 
		                        insn[0].src_reg = BPF_PSEUDO_MAP_VALUE;
	 
		                        insn[0].imm = obj->maps[relo->map_idx].fd;   // map fd
	 
		 
	 
		原有指令
	 
		       2:   18 01 00 00 00 00 00 00 | 00 00 00 00 00 00 00 00     r1 = 0 ll
	 
		insn[0].code = 0x18
	 
		insn[0].dst_reg = r1
	 
		 
	 
		patch后指令:
	 
		insn[0].code = 0x18
	 
		insn[0].dst_reg = r1
	 
		insn[0].src_reg = BPF_PSEUDO_MAP_VALUE
	 
		insn[0]. imm = obj->maps[relo->map_idx].fd
	 
		insn[1].imm = 原insn[0].imm + relo->sym_off;   //  变量在段(map value)内的偏移; 这里的insn[0].imm是被map fd覆盖之前的值
	 
		 
	 
		可以阅读array_map_alloc()代码;
	 
		需要注意的是,实际的数据是通过array->value访问,对于设置了BPF_F_MMAPABLE flag的map,要确保array->value 页面对齐。
	 
		copy 用户数据到array->value
	 
		在bpf verify阶段会再次修改ld_imm64指令,通过map fd找到实际访问的内存地址;并将该地址编码在指令中。
	 
		 
	 
		resolve_pseudo_ldimm64
	 
		-> map = __bpf_map_get(f);  // f.file->private_data;
	 
		-> map->ops->map_direct_value_addr(map, &addr, off);    //
	 
		            -> array_map_direct_value_addr()
	 
		                        -> struct bpf_array *array = container_of(map, struct bpf_array, map);
	 
		                        -> *imm = (unsigned long)array->value;  // 内核中全局变量所在的地址;
	 
		-> addr += off;
	 
		insn[0].imm = (u32)addr;
	 
		insn[1].imm = addr >> 32;
	 
		此时insn内容为
	 
		insn[0].code = 0x18
	 
		insn[0].dst_reg = r1
	 
		insn[0].src_reg = BPF_PSEUDO_MAP_VALUE
	 
		insn[0]. imm = (u32)addr;
	 
		insn[1].imm = addr >> 32;
	 
		 
	 
		map的有效地址被load到r1
	 
		 
	 
		这样BPF prog访问全局变量就不存在因查找带来的overhead;
	 
		直接访问内存。
	 
			其本质是用户空间与内核空间共享/访问同一块内存;
		 
			通过BPF skeleton可以实现在用户空间对全局变量的操作。
		 
							struct minimal_bpf {
						 
							                  ……
						 
							                  struct minimal_bpf__bss {
						 
							                                    int my_pid;
						 
							                  } *bss;
						 
							                  struct minimal_bpf__rodata {
						 
							                  } *rodata;
						 
							};
						 
			 
		 
			在open 阶段, skeleton和 loader之间共享内存来初始化 bss内的全局变量;
		 
			 
		 
			BPF loader在初始化bpf map时,会通过mmap()申请一片内存,
		 
		struct minimal_bpf 的成员 bss(即 skel->bss)最终指向了 mmap() 返回的地址;
	 
		 
	 
						s->maps[0].mmaped = (void **)&obj->bss; 
					 
						void **mmaped = s->maps[i].mmaped;
					 
						*mmaped = (*map)->mmaped;
					 
		 
	 
		此时用户空间可以重写 skel->bss->my_pid 来更新 bss段内的变量值。
	 
		注意这时还没有创建 .bss map;仅仅在对 .bss内变量做初始化。
	 
		在load bpf成功后, bss map已经被create,可以得到其map fd;
	 
		再次执行mmap();这次入参fd被设定为新创建map的fd,
	 
						void **mmaped = s->maps[i].mmaped;
					 
						*mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0);
					 
						 
					 
		 
	 
		struct minimal_bpf 的成员 bss(即 skel->bss)再次被赋值;由于使用了MAP_FIXED,其地址值保持不变,但该地址现在映射的是kernel存放全局变量的内存。
	 
		 
	 
		由于设置了map_fd,mmap()触发的内核态函数是array_map_mmap(), 该函数映射内核态array->value处的内存到用户空间,至此用户可以自由修改bpf prog的全局变量。
	
	
		
			
	
				 
		
	
					 
				
					 
			
	
		
			
	
				 
		
	
					 
			
	
		Global Variable Map Relocation
	
	
		open phase
	
	
		Load phase
	
	
		user space
	
	
		create & update map
	
	
		patch bytecode
	
	
	
		kernel space
	
	
		create map
	
	
	
		update map
	
	
	
		kernel patch ld_imm64 instructions
	
	
		ld_imm64指令变换
	
	
		
			Sharing Global Variable Between User Space and Kernel Space
		
		
			
				
		
					 
			
		
						 
				
			Open Phase
		
		
	
		
			
	
				 
			
					 
			
				 
		
	
					 
			
		Load Phase
	
	
		
			
	
				 
			
					 
			
				 
		
	
					 
			
