Performance Optimization¶
Learn how to build high-performance eBPF tools that can handle production workloads with minimal overhead. This guide covers optimization techniques for both eBPF programs and userspace applications.
🎯 Performance Goals¶
Well-optimized eBPF tools should achieve:
- < 1% CPU overhead even under heavy load
- < 10MB memory usage for userspace components
- < 100ns latency added to monitored operations
- > 1M events/second processing capability
🔧 eBPF Program Optimizations¶
1. Minimize Program Complexity¶
Reduce Instructions¶
```c
// ❌ Less efficient: does the work before knowing the event can be submitted
int trace_exec(void *ctx) {
    u64 pid_tgid = bpf_get_current_pid_tgid();
    u32 pid = pid_tgid & 0xFFFFFFFF;
    u32 tgid = pid_tgid >> 32;

    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data)
        return 0;

    data->pid = pid;
    data->tgid = tgid;
    // ...
}

// ✅ Efficient: reserve first, call helpers only if the reservation succeeds
int trace_exec(void *ctx) {
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data)
        return 0;

    u64 pid_tgid = bpf_get_current_pid_tgid();
    data->pid = pid_tgid & 0xFFFFFFFF;
    data->tgid = pid_tgid >> 32;
    // ...
}
```
Optimize Data Structures¶
```c
// ❌ Inefficient: large, sparse structure
struct inefficient_data {
    u32 pid;
    char padding1[60];   // Wasted space
    u64 timestamp;
    char padding2[120];  // More waste
    char comm[16];
};

// ✅ Efficient: packed, minimal structure
struct efficient_data {
    u32 pid;
    u32 tgid;            // Use all 32 bits
    u64 timestamp;
    char comm[16];
} __attribute__((packed));
```
2. Early Filtering¶
Filter events in kernel space to reduce userspace processing:
```c
struct filter_config {
    u32 target_pid;      // 0 = no filter
    u8 target_comm[16];  // Empty = no filter
    u8 enable_filter;    // 0/1
};

// Configuration map for filters (written by userspace, read in-kernel)
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, u32);
    __type(value, struct filter_config);
    __uint(max_entries, 1);
} config SEC(".maps");

SEC("tracepoint/sched/sched_process_exec")
int trace_exec_filtered(void *ctx) {
    // Get filter configuration
    u32 key = 0;
    struct filter_config *cfg = bpf_map_lookup_elem(&config, &key);
    if (!cfg || !cfg->enable_filter) {
        goto process_event; // No filtering
    }

    // PID filtering
    if (cfg->target_pid != 0) {
        u32 current_pid = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
        if (current_pid != cfg->target_pid) {
            return 0; // Skip this event
        }
    }

    // Command filtering
    if (cfg->target_comm[0] != '\0') {
        char current_comm[16];
        bpf_get_current_comm(&current_comm, sizeof(current_comm));

        // Simple, fixed-bound string comparison
        bool match = true;
        for (int i = 0; i < 16; i++) {
            if (current_comm[i] != cfg->target_comm[i]) {
                match = false;
                break;
            }
            if (current_comm[i] == '\0')
                break;
        }
        if (!match)
            return 0; // Skip this event
    }

process_event:
    // Process the event normally
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    // ... rest of processing
    return 0;
}
```
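The program only reads the config map; userspace has to populate it. A minimal sketch of that side using cilium/ebpf (the Go struct mirrors `struct filter_config` above, while the function and variable names are illustrative assumptions about your loader):

```go
import "github.com/cilium/ebpf"

// FilterConfig mirrors struct filter_config; field order and sizes must
// match the C definition exactly.
type FilterConfig struct {
	TargetPid    uint32
	TargetComm   [16]byte
	EnableFilter uint8
	_            [3]byte // explicit padding so the encoded size matches the 4-byte-aligned C struct
}

// applyFilter writes the filter into index 0 of the "config" array map.
func applyFilter(configMap *ebpf.Map, pid uint32, comm string) error {
	var cfg FilterConfig
	cfg.TargetPid = pid
	copy(cfg.TargetComm[:], comm)
	cfg.EnableFilter = 1

	var key uint32 = 0
	return configMap.Update(&key, &cfg, ebpf.UpdateAny)
}
```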
3. Efficient Memory Access¶
Use Appropriate Helper Functions¶
```c
// ❌ Slow: multiple memory accesses through task_struct
int get_process_info(struct data_t *data) {
    struct task_struct *task = (struct task_struct *)bpf_get_current_task();

    // Each field requires its own probe read (use bpf_probe_read_kernel()
    // for kernel pointers on modern kernels)
    bpf_probe_read_kernel(&data->pid, sizeof(data->pid), &task->pid);
    bpf_probe_read_kernel(&data->tgid, sizeof(data->tgid), &task->tgid);
    bpf_probe_read_kernel(&data->comm, sizeof(data->comm), &task->comm);
    return 0;
}

// ✅ Fast: use dedicated helper functions when available
int get_process_info_fast(struct data_t *data) {
    u64 pid_tgid = bpf_get_current_pid_tgid();              // Single helper call
    data->pid = pid_tgid & 0xFFFFFFFF;
    data->tgid = pid_tgid >> 32;
    bpf_get_current_comm(&data->comm, sizeof(data->comm));  // Optimized helper
    return 0;
}
```
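When you do have to walk kernel structures, libbpf's CO-RE read macros collapse the pointer chase into one expression and stay portable across kernel versions. A brief sketch (assumes `vmlinux.h` and the libbpf headers are available; reading the parent TGID is just an illustrative field choice):

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

static __always_inline u32 get_parent_tgid(void)
{
    struct task_struct *task = (struct task_struct *)bpf_get_current_task();

    // BPF_CORE_READ() expands to the required probe reads and records CO-RE
    // relocations, so field offsets are fixed up for the running kernel.
    return BPF_CORE_READ(task, real_parent, tgid);
}
```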
Minimize String Operations¶
```c
// ❌ Expensive: string operations in eBPF
int expensive_string_ops(char *filename) {
    char prefix[] = "/tmp/";
    // Avoid complex string operations
    if (my_strncmp(filename, prefix, 5) == 0) {
        // This kind of loop-based comparison is expensive in eBPF
    }
    return 0;
}

// ✅ Efficient: simple byte comparisons
int efficient_path_check(char *filename) {
    // Direct byte comparison
    if (filename[0] == '/' &&
        filename[1] == 't' &&
        filename[2] == 'm' &&
        filename[3] == 'p' &&
        filename[4] == '/') {
        // Match found
    }
    return 0;
}
```
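On recent kernels (roughly 5.17 and later) there is also a dedicated `bpf_strncmp()` helper that compares a buffer against a constant string without a hand-rolled loop. A hedged sketch, assuming the helper is available on your target kernel:

```c
// Sketch: prefix check via the bpf_strncmp() helper (Linux ~5.17+).
// filename must point at memory the verifier knows is readable, e.g. a
// buffer already filled by bpf_probe_read_user_str().
static __always_inline bool starts_with_tmp(const char *filename)
{
    return bpf_strncmp(filename, 5, "/tmp/") == 0;
}
```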
4. Optimize Map Operations¶
Choose Right Map Type¶
```c
// For frequent lookups with known, dense integer keys
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);        // O(1) lookup
    __uint(max_entries, 1024);
} fast_array SEC(".maps");

// For dynamic keys with good distribution
struct {
    __uint(type, BPF_MAP_TYPE_HASH);         // O(1) average
    __uint(max_entries, 10000);
} dynamic_hash SEC(".maps");

// For hot per-CPU counters and statistics
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); // Per-CPU, no lock contention
    __uint(max_entries, 256);
} stats_array SEC(".maps");
```
Batch Map Operations¶
Batch operations are a userspace-side API (one `bpf()` syscall covers many elements); inside an eBPF program you still update one element at a time.

```c
// ❌ From userspace: one syscall per element
for (int i = 0; i < 10; i++) {
    bpf_map_update_elem(map_fd, &keys[i], &values[i], BPF_ANY);
}

// ✅ From userspace: a single batched syscall (libbpf, kernel 5.6+,
// for map types that support batch operations)
LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = BPF_ANY);
bpf_map_update_batch(map_fd, keys, values, &count, &opts);
```
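If your loader is written in Go with cilium/ebpf, the equivalent call is `Map.BatchUpdate`. A minimal sketch (map name and key/value types are assumptions; batch operations need kernel support and are not available for every map type):

```go
import "github.com/cilium/ebpf"

// seedCounters writes many entries with one syscall instead of looping
// over Map.Update.
func seedCounters(counters *ebpf.Map) error {
	keys := []uint32{0, 1, 2, 3}
	values := []uint64{0, 0, 0, 0}

	_, err := counters.BatchUpdate(keys, values, &ebpf.BatchOptions{
		ElemFlags: uint64(ebpf.UpdateAny),
	})
	return err
}
```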
📊 Ring Buffer Optimizations¶
1. Right-Size Your Buffers¶
```c
// Buffer sizing guidelines (ring buffer size must be a power of two
// and a multiple of the page size)
struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 24); // 16MB - high-frequency events
} high_freq_events SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 20); // 1MB - medium-frequency events
} medium_freq_events SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 16); // 64KB - low-frequency events
} low_freq_events SEC(".maps");
```
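To check whether a buffer is actually sized well, the program itself can look at how much data is sitting unconsumed and shed low-value events under pressure. A sketch using the `bpf_ringbuf_query()` helper (the 50% threshold is an arbitrary choice for illustration):

```c
// Returns true when more than half of the ring buffer is still unconsumed,
// in which case the caller may choose to drop optional events.
static __always_inline bool ringbuf_under_pressure(void *ringbuf)
{
    u64 avail = bpf_ringbuf_query(ringbuf, BPF_RB_AVAIL_DATA);
    u64 size  = bpf_ringbuf_query(ringbuf, BPF_RB_RING_SIZE);

    return avail > size / 2;
}
```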
2. Minimize Event Size¶
```c
// ❌ Large events (slow to reserve, copy, and consume)
struct bloated_event {
    u64 timestamp;
    u32 pid;
    u32 tid;
    u32 uid;
    u32 gid;
    char comm[16];
    char filename[4096]; // Often mostly empty
    u8 padding[512];     // Waste
};

// ✅ Compact events (fast)
struct compact_event {
    u32 pid;             // Most important data first
    u32 timestamp_delta; // Delta from a base time
    u16 filename_len;    // Actual length
    char comm[16];
    char filename[];     // Variable length
};
```
3. Efficient Ring Buffer Usage¶
```c
// ✅ Efficient event submission: size the record to the data it carries.
// Note: bpf_ringbuf_reserve() needs a size the verifier can prove at load
// time, so in practice you reserve a bounded maximum, or use
// bpf_ringbuf_output()/dynptr APIs for truly variable-length records.
int submit_compact_event(char *filename, u16 filename_len) {
    // Calculate the size actually needed
    u32 event_size = sizeof(struct compact_event) + filename_len;

    struct compact_event *event = bpf_ringbuf_reserve(&events, event_size, 0);
    if (!event)
        return 0;

    // Fill data efficiently
    event->pid = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
    event->timestamp_delta = get_time_delta();
    event->filename_len = filename_len;
    bpf_get_current_comm(&event->comm, sizeof(event->comm));

    // Copy only the bytes that are needed
    bpf_probe_read_user_str(&event->filename, filename_len, filename);

    bpf_ringbuf_submit(event, 0);
    return 0;
}
```
⚡ Userspace Optimizations¶
1. Efficient Event Processing¶
```go
// ✅ Optimized event reader with batching
type OptimizedEventProcessor struct {
	reader        *ringbuf.Reader
	eventBuffer   []RawEvent
	processBuffer []ProcessedEvent
	batchSize     int
}

func (p *OptimizedEventProcessor) ProcessEvents(ctx context.Context) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		// Read events in batches
		events, err := p.readEventBatch()
		if err != nil {
			return err
		}
		if len(events) == 0 {
			continue
		}

		// Process the batch efficiently
		processed := p.processBatch(events)

		// Output the batch
		p.outputBatch(processed)
	}
}

func (p *OptimizedEventProcessor) readEventBatch() ([]RawEvent, error) {
	p.eventBuffer = p.eventBuffer[:0] // Reuse the slice

	for len(p.eventBuffer) < p.batchSize {
		record, err := p.reader.Read()
		if err != nil {
			if errors.Is(err, ringbuf.ErrClosed) {
				break
			}
			return nil, err
		}

		var event RawEvent
		if err := binary.Read(bytes.NewReader(record.RawSample),
			binary.LittleEndian, &event); err != nil {
			continue // Skip malformed events
		}

		p.eventBuffer = append(p.eventBuffer, event)
	}

	return p.eventBuffer, nil
}
```
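One caveat with filling a fixed-size batch: `Reader.Read` blocks until the next record arrives, so a quiet system can hold a partial batch indefinitely. Recent versions of cilium/ebpf let you bound that latency with a read deadline; a sketch (the 100ms budget is an arbitrary choice, and it additionally needs the `os` and `time` imports):

```go
// Sketch: cap how long a batch may wait before being flushed.
// ringbuf.Reader.SetDeadline makes a blocked Read return os.ErrDeadlineExceeded.
func (p *OptimizedEventProcessor) readEventBatchWithDeadline() ([]RawEvent, error) {
	p.eventBuffer = p.eventBuffer[:0]
	p.reader.SetDeadline(time.Now().Add(100 * time.Millisecond))

	for len(p.eventBuffer) < p.batchSize {
		record, err := p.reader.Read()
		if err != nil {
			if errors.Is(err, os.ErrDeadlineExceeded) || errors.Is(err, ringbuf.ErrClosed) {
				break // Flush whatever we have so far
			}
			return nil, err
		}

		var event RawEvent
		if err := binary.Read(bytes.NewReader(record.RawSample),
			binary.LittleEndian, &event); err != nil {
			continue
		}
		p.eventBuffer = append(p.eventBuffer, event)
	}
	return p.eventBuffer, nil
}
```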
2. Memory Pool for Events¶
```go
// Event pool to reduce GC pressure
var eventPool = sync.Pool{
	New: func() interface{} {
		return &ProcessedEvent{}
	},
}

type ProcessedEvent struct {
	PID       uint32
	Comm      string
	Filename  string
	Timestamp time.Time
}

func (p *OptimizedEventProcessor) processBatch(rawEvents []RawEvent) []*ProcessedEvent {
	processed := make([]*ProcessedEvent, 0, len(rawEvents))

	for _, raw := range rawEvents {
		// Get from the pool instead of allocating
		event := eventPool.Get().(*ProcessedEvent)

		// Reset and populate
		*event = ProcessedEvent{
			PID:       raw.PID,
			Comm:      nullTerminatedString(raw.Comm[:]),
			Filename:  nullTerminatedString(raw.Filename[:]),
			Timestamp: time.Unix(0, int64(raw.Timestamp)),
		}

		processed = append(processed, event)
	}

	return processed
}

// Return events to the pool when done
func (p *OptimizedEventProcessor) cleanup(events []*ProcessedEvent) {
	for _, event := range events {
		eventPool.Put(event)
	}
}
```
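Tying the two halves together, the hot loop borrows pooled events for exactly one batch and hands them back once the batch has been emitted; a small usage sketch (`outputBatch` is the same placeholder used above, and it must not retain the events):

```go
// One iteration of the hot loop: pooled events are only valid until
// cleanup() runs, so nothing may keep a reference past outputBatch().
func (p *OptimizedEventProcessor) handleBatch(raw []RawEvent) {
	processed := p.processBatch(raw)
	p.outputBatch(processed)
	p.cleanup(processed) // hand the events back to eventPool
}
```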
3. Efficient String Handling¶
```go
// ✅ Optimized string conversion
func nullTerminatedString(b []byte) string {
	// Find the null terminator without extra allocation
	for i, c := range b {
		if c == 0 {
			return string(b[:i])
		}
	}
	return string(b)
}

// ✅ String interning for repeated values
type StringInterner struct {
	mu      sync.RWMutex
	strings map[string]string
}

func NewStringInterner() *StringInterner {
	return &StringInterner{strings: make(map[string]string)}
}

func (s *StringInterner) Intern(str string) string {
	s.mu.RLock()
	if interned, exists := s.strings[str]; exists {
		s.mu.RUnlock()
		return interned
	}
	s.mu.RUnlock()

	s.mu.Lock()
	defer s.mu.Unlock()

	// Double-check after acquiring the write lock
	if interned, exists := s.strings[str]; exists {
		return interned
	}

	s.strings[str] = str
	return str
}
```
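Interning pays off where the same short strings recur constantly, such as process names. A usage sketch that plugs it into the batch processing above:

```go
// comm values repeat heavily ("bash", "sshd", ...), so events with the same
// name end up sharing one backing string instead of each holding its own copy.
var commInterner = NewStringInterner()

func internComm(raw []byte) string {
	return commInterner.Intern(nullTerminatedString(raw))
}
```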
📈 Performance Monitoring¶
1. Measure eBPF Program Performance¶
```c
// Add performance counters
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
    __type(key, u32);
    __type(value, u64);
    __uint(max_entries, 16);
} perf_stats SEC(".maps");

enum {
    STAT_EVENTS_PROCESSED = 0,
    STAT_EVENTS_DROPPED,
    STAT_TOTAL_PROCESSING_TIME,
    STAT_MAX_PROCESSING_TIME,
};

SEC("tracepoint/sched/sched_process_exec")
int trace_exec_with_stats(void *ctx) {
    u64 start_time = bpf_ktime_get_ns();

    // Increment the event counter
    u32 key = STAT_EVENTS_PROCESSED;
    u64 *counter = bpf_map_lookup_elem(&perf_stats, &key);
    if (counter) {
        (*counter)++;
    }

    // Your event processing here
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data) {
        // Increment the drop counter
        key = STAT_EVENTS_DROPPED;
        counter = bpf_map_lookup_elem(&perf_stats, &key);
        if (counter) {
            (*counter)++;
        }
        return 0;
    }

    // Process event...
    bpf_ringbuf_submit(data, 0);

    // Record processing time
    u64 processing_time = bpf_ktime_get_ns() - start_time;
    key = STAT_TOTAL_PROCESSING_TIME;
    counter = bpf_map_lookup_elem(&perf_stats, &key);
    if (counter) {
        (*counter) += processing_time;
    }

    return 0;
}
```
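Because `perf_stats` is a per-CPU array, userspace receives one value per CPU for each key and has to sum them. A sketch with cilium/ebpf (the stat index mirrors the enum above; the function name is illustrative):

```go
import "github.com/cilium/ebpf"

// readStat sums a per-CPU counter across all CPUs.
func readStat(perfStats *ebpf.Map, statIndex uint32) (uint64, error) {
	var perCPU []uint64 // one element per possible CPU
	if err := perfStats.Lookup(&statIndex, &perCPU); err != nil {
		return 0, err
	}

	var total uint64
	for _, v := range perCPU {
		total += v
	}
	return total, nil
}
```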
2. Monitor Userspace Performance¶
```go
type PerformanceMetrics struct {
	EventsProcessed  uint64
	EventsDropped    uint64
	ProcessingTimeNs uint64
	MemoryUsage      uint64
	GCCycles         uint64
}

func (p *OptimizedEventProcessor) GetMetrics() PerformanceMetrics {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)

	return PerformanceMetrics{
		EventsProcessed:  atomic.LoadUint64(&p.eventsProcessed),
		EventsDropped:    atomic.LoadUint64(&p.eventsDropped),
		ProcessingTimeNs: atomic.LoadUint64(&p.processingTimeNs),
		MemoryUsage:      m.Alloc,
		GCCycles:         uint64(m.NumGC), // cumulative GC count, not pause time
	}
}
```
🎯 Benchmarking¶
1. Create Performance Tests¶
```go
func BenchmarkEventProcessing(b *testing.B) {
	processor := NewOptimizedEventProcessor()

	// Generate test events
	events := generateTestEvents(1000)

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		processor.processBatch(events)
	}
}

func BenchmarkStringConversion(b *testing.B) {
	testData := []byte("test_process_name\x00\x00\x00")

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = nullTerminatedString(testData)
	}
}
```
2. Production Monitoring¶
```bash
#!/bin/bash
# Monitor eBPF program performance

echo "eBPF Program Statistics:"
bpftool prog show | grep your_program

echo "Ring Buffer Usage:"
bpftool map show | grep events

echo "System Impact:"
top -b -n 1 -p "$(pgrep -d, your_tool)"

echo "Memory Usage:"
ps -o pid,vsz,rss,comm -p "$(pgrep -d, your_tool)"
```
🔧 Optimization Checklist¶
eBPF Program¶
- Minimize program instructions
- Use efficient data structures
- Implement early filtering
- Choose appropriate map types
- Minimize string operations
- Use helper functions when available
Ring Buffers¶
- Right-size buffers for workload
- Minimize event structure size
- Use variable-length events when appropriate
- Batch operations when possible
Userspace Application¶
- Process events in batches
- Use memory pools for frequent allocations
- Implement string interning for repeated values
- Monitor GC pressure and tune accordingly
Monitoring¶
- Add performance counters to eBPF programs
- Monitor ring buffer utilization
- Track processing latency and throughput
- Monitor system resource usage
📊 Performance Targets¶
| Metric | Target | Notes |
|---|---|---|
| CPU Overhead | < 1% | Under normal load |
| Memory Usage | < 10MB | Userspace component |
| Event Latency | < 100ns | Added to monitored operations |
| Throughput | > 1M events/sec | On modern hardware |
| Ring Buffer Utilization | < 80% | Prevent drops |
| GC Pauses | < 1ms | Go application |
Following these optimization techniques will help you build eBPF tools that can handle production workloads efficiently! 🚀