Performance Optimization

Learn how to build high-performance eBPF tools that can handle production workloads with minimal overhead. This guide covers optimization techniques for both eBPF programs and userspace applications.

🎯 Performance Goals

Well-optimized eBPF tools should achieve:

  • < 1% CPU overhead even under heavy load
  • < 10MB memory usage for userspace components
  • < 100ns latency added to monitored operations
  • > 1M events/second processing capability

🔧 eBPF Program Optimizations

1. Minimize Program Complexity

Reduce Instructions

// ❌ Less efficient: extra work and intermediate variables before the reservation
int trace_exec(void *ctx) {
    u64 pid_tgid = bpf_get_current_pid_tgid();
    u32 pid = pid_tgid & 0xFFFFFFFF;
    u32 tgid = pid_tgid >> 32;

    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data) return 0;

    data->pid = pid;
    data->tgid = tgid;
    // ...
}

// ✅ Efficient: reserve first, then assign fields directly
int trace_exec(void *ctx) {
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data) return 0;

    u64 pid_tgid = bpf_get_current_pid_tgid();
    data->pid = pid_tgid & 0xFFFFFFFF;
    data->tgid = pid_tgid >> 32;
    // ...
}

Optimize Data Structures

// ❌ Inefficient: Large, sparse structure
struct inefficient_data {
    u32 pid;
    char padding1[60];    // Wasted space
    u64 timestamp;
    char padding2[120];   // More waste
    char comm[16];
};

// ✅ Efficient: Packed, minimal structure
struct efficient_data {
    u32 pid;
    u32 tgid;            // Use all 32 bits
    u64 timestamp;
    char comm[16];
} __attribute__((packed));

2. Early Filtering

Filter events in kernel space to reduce userspace processing:

struct filter_config {
    u32 target_pid;      // 0 = no filter
    u8 target_comm[16];  // Empty = no filter
    u8 enable_filter;    // 0/1
};

// Configuration map for filters (value type defined above)
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __type(key, u32);
    __type(value, struct filter_config);
    __uint(max_entries, 1);
} config SEC(".maps");

SEC("tracepoint/sched/sched_process_exec")
int trace_exec_filtered(void *ctx) {
    // Get filter configuration
    u32 key = 0;
    struct filter_config *cfg = bpf_map_lookup_elem(&config, &key);
    if (!cfg || !cfg->enable_filter) {
        goto process_event;  // No filtering
    }

    // PID filtering
    if (cfg->target_pid != 0) {
        u32 current_pid = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
        if (current_pid != cfg->target_pid) {
            return 0;  // Skip this event
        }
    }

    // Command filtering
    if (cfg->target_comm[0] != '\0') {
        char current_comm[16];
        bpf_get_current_comm(&current_comm, sizeof(current_comm));

        // Simple string comparison
        bool match = true;
        for (int i = 0; i < 16; i++) {
            if (current_comm[i] != cfg->target_comm[i]) {
                match = false;
                break;
            }
            if (current_comm[i] == '\0') break;
        }

        if (!match) return 0;  // Skip this event
    }

process_event:
    // Process the event normally
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    // ... rest of processing
    return 0;
}
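Userspace is responsible for populating the config map before (or while) the program runs. Below is a minimal sketch using cilium/ebpf; the Go struct and the map handle are assumptions that must mirror struct filter_config and the config map above:

// Hypothetical Go-side filter setup. Field order and the trailing padding
// must match struct filter_config in the eBPF program (24 bytes total).
type filterConfig struct {
    TargetPid    uint32
    TargetComm   [16]byte
    EnableFilter uint8
    _            [3]byte // padding up to the C struct's size
}

func setPidFilter(configMap *ebpf.Map, pid uint32) error {
    cfg := filterConfig{TargetPid: pid, EnableFilter: 1}
    var key uint32 = 0
    return configMap.Put(key, cfg) // single-entry array map, key 0
}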

3. Efficient Memory Access

Use Appropriate Helper Functions

// ❌ Slow: Multiple memory accesses
int get_process_info(struct data_t *data) {
    struct task_struct *task = (struct task_struct *)bpf_get_current_task();

    // These require multiple probe_read calls
    bpf_probe_read(&data->pid, sizeof(data->pid), &task->pid);
    bpf_probe_read(&data->tgid, sizeof(data->tgid), &task->tgid);
    bpf_probe_read(&data->comm, sizeof(data->comm), &task->comm);
    return 0;
}

// ✅ Fast: Use helper functions when available
int get_process_info_fast(struct data_t *data) {
    u64 pid_tgid = bpf_get_current_pid_tgid();  // Single helper call
    data->pid = pid_tgid & 0xFFFFFFFF;
    data->tgid = pid_tgid >> 32;
    bpf_get_current_comm(&data->comm, sizeof(data->comm));  // Optimized helper
    return 0;
}

Minimize String Operations

// ❌ Expensive: String operations in eBPF
int expensive_string_ops(char *filename) {
    char prefix[] = "/tmp/";

    // Avoid complex string operations
    if (my_strncmp(filename, prefix, 5) == 0) {
        // This is expensive in eBPF
    }
    return 0;
}

// ✅ Efficient: Simple byte comparisons
// (assumes filename already points at memory the program may read directly,
// e.g. a local buffer filled earlier with bpf_probe_read_user_str)
int efficient_path_check(char *filename) {
    // Direct byte comparison
    if (filename[0] == '/' && 
        filename[1] == 't' && 
        filename[2] == 'm' && 
        filename[3] == 'p' && 
        filename[4] == '/') {
        // Match found
    }
    return 0;
}

4. Optimize Map Operations

Choose Right Map Type

// For frequent lookups with known keys
struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);        // O(1) lookup
    __type(key, u32);
    __type(value, u64);
    __uint(max_entries, 1024);
} fast_array SEC(".maps");

// For dynamic keys with good distribution
struct {
    __uint(type, BPF_MAP_TYPE_HASH);         // O(1) average
    __type(key, u64);
    __type(value, u64);
    __uint(max_entries, 10000);
} dynamic_hash SEC(".maps");

// For per-CPU counters and statistics
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); // Per-CPU, no lock contention
    __type(key, u32);
    __type(value, u64);
    __uint(max_entries, 256);
} stats_array SEC(".maps");

Batch Map Operations

Batch updates are issued from userspace (libbpf wraps the BPF_MAP_*_BATCH syscall commands); inside an eBPF program each bpf_map_update_elem() call remains a separate helper invocation.

// ❌ Userspace: one syscall per element
for (int i = 0; i < 10; i++) {
    bpf_map_update_elem(map_fd, &keys[i], &values[i], BPF_ANY);
}

// ✅ Userspace: single batched syscall (libbpf; requires kernel batch support)
LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = BPF_ANY);
__u32 count = 10;
bpf_map_update_batch(map_fd, keys, values, &count, &opts);
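From Go, cilium/ebpf exposes the same batch syscalls. A rough sketch (the map handle and data are placeholders; kernel support for batch operations is still required):

// Hypothetical batched update of a hash map from userspace via cilium/ebpf.
keys := []uint32{1, 2, 3}
values := []uint64{100, 200, 300}
count, err := countersMap.BatchUpdate(keys, values, &ebpf.BatchOptions{
    ElemFlags: 0, // 0 == BPF_ANY
})
if err != nil {
    log.Fatalf("batch update failed: %v", err)
}
log.Printf("updated %d entries in one syscall", count)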

📊 Ring Buffer Optimizations

1. Right-Size Your Buffers

// Buffer sizing guidelines
struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 24);    // 16MB - high-frequency events
} high_freq_events SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 20);    // 1MB - medium-frequency events
} medium_freq_events SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 16);    // 64KB - low-frequency events
} low_freq_events SEC(".maps");
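A quick way to sanity-check these sizes: buffer size ≈ peak event rate × average event size × worst-case consumer stall. For example, at 1M events/sec with 64-byte events and a 100ms stall, the buffer must absorb roughly 6.4MB, so the 16MB high-frequency buffer above leaves about 2.5x headroom before drops start.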

2. Minimize Event Size

// ❌ Large events (slow)
struct bloated_event {
    u64 timestamp;
    u32 pid;
    u32 tid;
    u32 uid;
    u32 gid;
    char comm[16];
    char filename[4096];    // Often mostly empty
    u8 padding[512];        // Waste
};

// ✅ Compact events (fast)
struct compact_event {
    u32 pid;               // Most important data first
    u32 timestamp_delta;   // Delta from base time
    u16 filename_len;      // Actual length
    char comm[16];
    char filename[];       // Variable length
};
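A variable-length record like this cannot be decoded with a single binary.Read into a fixed struct, so the userspace side has to slice the raw sample manually. A hypothetical decoder, assuming the field order above with no padding before comm and a little-endian kernel (nullTerminatedString is defined in the string-handling section below):

// Hypothetical decoder for struct compact_event.
// Layout: pid(4) + timestamp_delta(4) + filename_len(2) + comm(16) = 26-byte header,
// followed by up to filename_len bytes of path data.
func parseCompactEvent(raw []byte) (pid uint32, comm, filename string, ok bool) {
    const headerLen = 26
    if len(raw) < headerLen {
        return 0, "", "", false
    }
    pid = binary.LittleEndian.Uint32(raw[0:4])
    fnameLen := int(binary.LittleEndian.Uint16(raw[8:10]))
    comm = nullTerminatedString(raw[10:26])

    end := headerLen + fnameLen
    if end > len(raw) {
        end = len(raw) // clamp if the record was truncated
    }
    filename = nullTerminatedString(raw[headerLen:end])
    return pid, comm, filename, true
}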

3. Efficient Ring Buffer Usage

// ✅ Efficient event submission
// Note: bpf_ringbuf_reserve() requires a size the verifier can prove constant,
// so reserve the worst case here; for truly variable-length records, build the
// event in a scratch buffer and submit it with bpf_ringbuf_output() instead.
#define MAX_FILENAME_LEN 256

int submit_compact_event(const char *filename, u16 filename_len) {
    struct compact_event *event;

    event = bpf_ringbuf_reserve(&events, sizeof(*event) + MAX_FILENAME_LEN, 0);
    if (!event) return 0;

    // Fill data efficiently
    event->pid = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
    event->timestamp_delta = get_time_delta();
    event->filename_len = filename_len;
    bpf_get_current_comm(&event->comm, sizeof(event->comm));

    // Copy at most MAX_FILENAME_LEN bytes of the path
    bpf_probe_read_user_str(event->filename, MAX_FILENAME_LEN, filename);

    bpf_ringbuf_submit(event, 0);
    return 0;
}

⚡ Userspace Optimizations

1. Efficient Event Processing

// ✅ Optimized event reader with batching
type OptimizedEventProcessor struct {
    reader        *ringbuf.Reader
    eventBuffer   []RawEvent
    processBuffer []ProcessedEvent
    batchSize     int
}

func (p *OptimizedEventProcessor) ProcessEvents(ctx context.Context) error {
    for {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        // Read events in batches
        events, err := p.readEventBatch()
        if err != nil && !errors.Is(err, ringbuf.ErrClosed) {
            return err
        }

        if len(events) == 0 {
            if err != nil {
                return nil // reader closed and fully drained
            }
            continue
        }

        // Process batch efficiently
        processed := p.processBatch(events)

        // Output batch
        p.outputBatch(processed)
    }
}

func (p *OptimizedEventProcessor) readEventBatch() ([]RawEvent, error) {
    p.eventBuffer = p.eventBuffer[:0] // Reuse slice

    for len(p.eventBuffer) < p.batchSize {
        record, err := p.reader.Read()
        if err != nil {
            if errors.Is(err, ringbuf.ErrClosed) {
                // Reader closed: hand back whatever was drained so far
                return p.eventBuffer, err
            }
            return nil, err
        }

        var event RawEvent
        if err := binary.Read(bytes.NewReader(record.RawSample), 
                             binary.LittleEndian, &event); err != nil {
            continue // Skip malformed events
        }

        p.eventBuffer = append(p.eventBuffer, event)
    }

    return p.eventBuffer, nil
}

2. Memory Pool for Events

// Event pool to reduce GC pressure
var eventPool = sync.Pool{
    New: func() interface{} {
        return &ProcessedEvent{}
    },
}

type ProcessedEvent struct {
    PID       uint32
    Comm      string
    Filename  string
    Timestamp time.Time
}

func (p *OptimizedEventProcessor) processBatch(rawEvents []RawEvent) []*ProcessedEvent {
    processed := make([]*ProcessedEvent, 0, len(rawEvents))

    for _, raw := range rawEvents {
        // Get from pool instead of allocating
        event := eventPool.Get().(*ProcessedEvent)

        // Reset and populate
        *event = ProcessedEvent{
            PID:       raw.PID,
            Comm:      nullTerminatedString(raw.Comm[:]),
            Filename:  nullTerminatedString(raw.Filename[:]),
            Timestamp: time.Unix(0, int64(raw.Timestamp)),
        }

        processed = append(processed, event)
    }

    return processed
}

// Return events to pool when done
func (p *OptimizedEventProcessor) cleanup(events []*ProcessedEvent) {
    for _, event := range events {
        eventPool.Put(event)
    }
}
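The pool only pays off if events are actually returned after use; in the ProcessEvents loop above, that means recycling each batch right after it has been emitted, for example:

// Inside the processing loop: recycle the batch once output is done.
processed := p.processBatch(events)
p.outputBatch(processed)
p.cleanup(processed) // events must not be referenced after this point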

3. Efficient String Handling

// ✅ Optimized string conversion
func nullTerminatedString(b []byte) string {
    // Find null terminator without allocation
    for i, c := range b {
        if c == 0 {
            return string(b[:i])
        }
    }
    return string(b)
}

// ✅ String interning for repeated values
type StringInterner struct {
    mu      sync.RWMutex
    strings map[string]string
}

func NewStringInterner() *StringInterner {
    return &StringInterner{strings: make(map[string]string)}
}

func (s *StringInterner) Intern(str string) string {
    s.mu.RLock()
    if interned, exists := s.strings[str]; exists {
        s.mu.RUnlock()
        return interned
    }
    s.mu.RUnlock()

    s.mu.Lock()
    defer s.mu.Unlock()

    // Double-check after acquiring write lock
    if interned, exists := s.strings[str]; exists {
        return interned
    }

    s.strings[str] = str
    return str
}
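Interning is most useful for fields that repeat constantly, such as process names. A small usage sketch:

interner := NewStringInterner()

// During event processing: identical comm values now share a single string.
event.Comm = interner.Intern(nullTerminatedString(raw.Comm[:]))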

📈 Performance Monitoring

1. Measure eBPF Program Performance

// Add performance counters
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
    __type(key, u32);
    __type(value, u64);
    __uint(max_entries, 16);
} perf_stats SEC(".maps");

enum {
    STAT_EVENTS_PROCESSED = 0,
    STAT_EVENTS_DROPPED,
    STAT_TOTAL_PROCESSING_TIME,
    STAT_MAX_PROCESSING_TIME,
};

SEC("tracepoint/sched/sched_process_exec")
int trace_exec_with_stats(void *ctx) {
    u64 start_time = bpf_ktime_get_ns();

    // Increment event counter
    u32 key = STAT_EVENTS_PROCESSED;
    u64 *counter = bpf_map_lookup_elem(&perf_stats, &key);
    if (counter) {
        (*counter)++;
    }

    // Your event processing here
    struct data_t *data = bpf_ringbuf_reserve(&events, sizeof(*data), 0);
    if (!data) {
        // Increment drop counter
        key = STAT_EVENTS_DROPPED;
        counter = bpf_map_lookup_elem(&perf_stats, &key);
        if (counter) {
            (*counter)++;
        }
        return 0;
    }

    // Process event...
    bpf_ringbuf_submit(data, 0);

    // Record processing time
    u64 processing_time = bpf_ktime_get_ns() - start_time;
    key = STAT_TOTAL_PROCESSING_TIME;
    counter = bpf_map_lookup_elem(&perf_stats, &key);
    if (counter) {
        (*counter) += processing_time;
    }

    return 0;
}
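To read these counters from userspace, look up each key and sum the per-CPU slots. A sketch using cilium/ebpf, which unmarshals per-CPU values into a slice (the map handle and key constants mirror the program above):

// Hypothetical reader for the perf_stats PERCPU_ARRAY map.
// Key 0 = STAT_EVENTS_PROCESSED, key 1 = STAT_EVENTS_DROPPED, etc.
func readStat(statsMap *ebpf.Map, key uint32) (uint64, error) {
    var perCPU []uint64
    if err := statsMap.Lookup(key, &perCPU); err != nil {
        return 0, err
    }
    var total uint64
    for _, v := range perCPU {
        total += v // one slot per possible CPU
    }
    return total, nil
}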

2. Monitor Userspace Performance

type PerformanceMetrics struct {
    EventsProcessed  uint64
    EventsDropped    uint64
    ProcessingTimeNs uint64
    MemoryUsage      uint64
    GCPauseTotalNs   uint64
}

func (p *OptimizedEventProcessor) GetMetrics() PerformanceMetrics {
    var m runtime.MemStats
    runtime.ReadMemStats(&m)

    // eventsProcessed, eventsDropped and processingTimeNs are atomic counters
    // maintained by the processor (omitted from the struct definition above).
    return PerformanceMetrics{
        EventsProcessed:  atomic.LoadUint64(&p.eventsProcessed),
        EventsDropped:    atomic.LoadUint64(&p.eventsDropped),
        ProcessingTimeNs: atomic.LoadUint64(&p.processingTimeNs),
        MemoryUsage:      m.Alloc,
        GCPauseTotalNs:   m.PauseTotalNs,
    }
}

🎯 Benchmarking

1. Create Performance Tests

func BenchmarkEventProcessing(b *testing.B) {
    processor := NewOptimizedEventProcessor()

    // Generate test events
    events := generateTestEvents(1000)

    b.ResetTimer()
    b.ReportAllocs()

    for i := 0; i < b.N; i++ {
        processor.processBatch(events)
    }
}

func BenchmarkStringConversion(b *testing.B) {
    testData := []byte("test_process_name\x00\x00\x00")

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        _ = nullTerminatedString(testData)
    }
}

2. Production Monitoring

#!/bin/bash
# Monitor eBPF program performance

echo "eBPF Program Statistics:"
bpftool prog show | grep your_program

echo "Ring Buffer Usage:"
bpftool map show | grep events

echo "System Impact:"
top -p $(pgrep your_tool) -n 1

echo "Memory Usage:"
ps -o pid,vsz,rss,comm -p $(pgrep your_tool)

🔧 Optimization Checklist

eBPF Program

  • Minimize program instructions
  • Use efficient data structures
  • Implement early filtering
  • Choose appropriate map types
  • Minimize string operations
  • Use helper functions when available

Ring Buffers

  • Right-size buffers for workload
  • Minimize event structure size
  • Use variable-length events when appropriate
  • Batch operations when possible

Userspace Application

  • Process events in batches
  • Use memory pools for frequent allocations
  • Implement string interning for repeated values
  • Monitor GC pressure and tune accordingly

Monitoring

  • Add performance counters to eBPF programs
  • Monitor ring buffer utilization
  • Track processing latency and throughput
  • Monitor system resource usage

📊 Performance Targets

Metric                    Target            Notes
CPU Overhead              < 1%              Under normal load
Memory Usage              < 10MB            Userspace component
Event Latency             < 100ns           Added to monitored operations
Throughput                > 1M events/sec   On modern hardware
Ring Buffer Utilization   < 80%             Prevent drops
GC Pauses                 < 1ms             Go application

Following these optimization techniques will help you build eBPF tools that can handle production workloads efficiently! 🚀