edge Expert 25 min read

Writing Custom C Kernels for Pyvorin Edge

Learn the C ABI contract, write a hand-optimized NEON kernel, build a shared object, and register it with CompilerBridge for edge-native execution.

Published Jun 2, 2026

Introduction

Pyvorin Edge allows you to drop below the Python layer and execute hand-written C kernels on the edge device. This is useful when the compiler cannot auto-vectorize a hot path, or when you need explicit control over SIMD registers on ARM64.

In this article you will:

  • Understand the ABIContract class from module_host/abi.py.
  • Write a simple C kernel that adds two float arrays.
  • Rewrite it with ARM NEON intrinsics (vaddq_f32, vld1q_f32, vst1q_f32).
  • Build a .so with GCC and load it through ModuleLoader.
  • Register the kernel with CompilerBridge so the pipeline can call it.

The ABI Contract

Every shared object that Pyvorin Edge loads must declare an ABIContract. This dataclass lives in edge_runtime/pyv_edge_agent/module_host/abi.py and tells the runtime:

  • Which symbol to call (function_name)
  • The ctypes signature (arg_types and return_type)
  • The calling convention (always "cdecl" on Linux)

For the vector-add kernel built in this article, the contract is declared as follows:

from pyv_edge_agent.module_host.abi import ABIContract
import ctypes

# All three array parameters of the kernel share the same pointer-to-float type.
float_ptr = ctypes.POINTER(ctypes.c_float)

# Contract for the exported symbol; the argument order follows the
# C prototype: (a, b, out, n).
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[float_ptr, float_ptr, float_ptr, ctypes.c_size_t],
    return_type=None,
    calling_convention="cdecl",
)

Writing a Simple C Kernel

Start with a scalar fallback kernel so you have a reference implementation to validate against.

Scalar Reference (vec_add_scalar.c)

#include <stddef.h>

/*
 * Scalar reference kernel: stores a[k] + b[k] into out[k] for every k in
 * [0, n). The restrict qualifiers promise the compiler that the three
 * buffers do not overlap. Calling with n == 0 touches no memory.
 */
void vec_add_f32(
    const float *restrict a,
    const float *restrict b,
    float *restrict out,
    size_t n
) {
    size_t idx = 0;

    while (idx < n) {
        out[idx] = a[idx] + b[idx];
        ++idx;
    }
}

NEON-Optimized Version (vec_add_neon.c)

#include <arm_neon.h>
#include <stddef.h>

/*
 * NEON kernel: stores a[k] + b[k] into out[k] for every k in [0, n).
 *
 * The main loop consumes 16 floats per pass as four independent 128-bit
 * vector additions; whatever is left (0 to 15 elements) is handled by a
 * scalar loop. The restrict qualifiers promise that the buffers do not
 * overlap.
 */
void vec_add_f32(
    const float *restrict a,
    const float *restrict b,
    float *restrict out,
    size_t n
) {
    /* Largest multiple of 16 not exceeding n: the vector loop covers [0, bulk). */
    const size_t bulk = n - (n % 16);
    size_t pos = 0;

    while (pos < bulk) {
        const float *src_a = a + pos;
        const float *src_b = b + pos;
        float *dst = out + pos;

        /* Load and add four 4-float lanes before any store is issued. */
        const float32x4_t sum0 = vaddq_f32(vld1q_f32(src_a + 0),  vld1q_f32(src_b + 0));
        const float32x4_t sum1 = vaddq_f32(vld1q_f32(src_a + 4),  vld1q_f32(src_b + 4));
        const float32x4_t sum2 = vaddq_f32(vld1q_f32(src_a + 8),  vld1q_f32(src_b + 8));
        const float32x4_t sum3 = vaddq_f32(vld1q_f32(src_a + 12), vld1q_f32(src_b + 12));

        vst1q_f32(dst + 0,  sum0);
        vst1q_f32(dst + 4,  sum1);
        vst1q_f32(dst + 8,  sum2);
        vst1q_f32(dst + 12, sum3);

        pos += 16;
    }

    /* Scalar tail for the elements that do not fill a 16-float pass. */
    for (; pos < n; ++pos) {
        out[pos] = a[pos] + b[pos];
    }
}

Building the Shared Object

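Compile for the target ARM64 edge device. The flags below enable position-independent code, -O3 optimization, and the ARMv8-A architecture level with floating-point and SIMD support. Run the command on the device itself; when building on a non-ARM host, substitute an AArch64 cross compiler (for example aarch64-linux-gnu-gcc) for gcc.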

# -shared -fPIC : emit a position-independent shared object
# -O3           : highest standard optimization level
# -march=...    : target ARMv8-A with floating-point and SIMD (NEON) enabled
gcc -shared -fPIC -O3 -march=armv8-a+fp+simd \
    -o libvecadd.so vec_add_neon.c

Verify the symbol is exported:

# List the library's dynamic symbols and keep only the kernel's entry point.
nm -D libvecadd.so | grep vec_add_f32

Loading and Testing with ModuleLoader

ModuleLoader in module_host/loader.py is thread-safe and caches symbol lookups after the first call.

import ctypes
import numpy as np
from pyv_edge_agent.module_host.abi import ABIContract
from pyv_edge_agent.module_host.loader import ModuleLoader

# Contract for the exported symbol; the argument order follows the
# C prototype: (a, b, out, n).
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[
        ctypes.POINTER(ctypes.c_float),   # a
        ctypes.POINTER(ctypes.c_float),   # b
        ctypes.POINTER(ctypes.c_float),   # out
        ctypes.c_size_t,                  # n
    ],
    return_type=None,
)

loader = ModuleLoader()
loader.load("./libvecadd.so", abi)

try:
    # Prepare float32 inputs and an output buffer of the same length.
    n = 1024
    a = np.arange(n, dtype=np.float32)
    b = np.arange(n, dtype=np.float32) * 2.0
    out = np.empty(n, dtype=np.float32)

    # Get ctypes pointers to the NumPy buffers; the arrays must stay alive
    # for as long as the native call uses these pointers.
    pa = a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    pb = b.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    po = out.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

    loader.call("vec_add_f32", pa, pb, po, n)

    # Compare the kernel's output against NumPy's own element-wise sum.
    np.testing.assert_allclose(out, a + b)
finally:
    # Release the shared object even when the call or the comparison fails.
    loader.unload()
print("Kernel test passed.")

Registering with CompilerBridge

For pipeline integration, register the kernel through CompilerBridge so it appears in the deployment manifest. The bridge also provides validate_compilation() to compare the native output against a Python reference.

from pyvorin_edge.compiler_bridge import CompilerBridge

def python_reference(a, b):
    """Return the element-wise sums of ``a`` and ``b`` as a new list.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    sums = []
    for left, right in zip(a, b):
        sums.append(left + right)
    return sums

bridge = CompilerBridge()

# A hand-written .so has no generated contract yet, so ask the bridge
# to produce the ABI for the exported symbol.
abi = bridge.generate_abi("./libvecadd.so", "vec_add_f32")

# Each case is one (a, b) argument pair handed to both the kernel and
# the Python reference.
cases = [
    (list(range(64)), list(range(64))),
    ([1.5] * 128, [2.5] * 128),
]

# Validate against Python reference using ModuleLoader under the hood
report = bridge.validate_compilation(
    "./libvecadd.so",
    python_reference,
    test_inputs=cases,
)
print(report)
# {'passed': 2, 'failed': 0, 'mismatches': []}

Complete Test Script

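Copy the script below into the same directory as libvecadd.so (it resolves the library relative to its own location) and run it to validate your kernel end-to-end.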

#!/usr/bin/env python3
"""End-to-end test for a custom C kernel on Pyvorin Edge."""
import ctypes
import numpy as np
from pathlib import Path
from pyv_edge_agent.module_host.abi import ABIContract
from pyv_edge_agent.module_host.loader import ModuleLoader
from pyvorin_edge.compiler_bridge import CompilerBridge

# The shared object is expected to sit in the same directory as this script.
SO_PATH = Path(__file__).parent / "libvecadd.so"

# All three array parameters of the kernel share the same pointer-to-float type.
float_ptr = ctypes.POINTER(ctypes.c_float)

# Contract for the exported symbol; the argument order is (a, b, out, n).
abi = ABIContract(
    function_name="vec_add_f32",
    arg_types=[float_ptr, float_ptr, float_ptr, ctypes.c_size_t],
    return_type=None,
)

def python_reference(a, b):
    """Return the element-wise sums of ``a`` and ``b`` as a new list.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    sums = []
    for left, right in zip(a, b):
        sums.append(left + right)
    return sums

def main() -> None:
    """Validate the kernel through CompilerBridge, then call it directly.

    Two checks run in order: the bridge compares the shared object against
    ``python_reference`` on a 256-element input, then the kernel is loaded
    with ModuleLoader and its output on 4096 random float32 values is
    compared against NumPy's element-wise sum.

    Raises:
        AssertionError: if the bridge reports any failed case, or if the
            kernel output differs from the NumPy result.
    """
    # Pass the path as a string to both APIs, as the loader call already did.
    so_path = str(SO_PATH)

    bridge = CompilerBridge()
    report = bridge.validate_compilation(
        so_path, python_reference,
        test_inputs=[
            (list(map(float, range(256))), list(map(float, range(256)))),
        ],
    )
    # Explicit raise instead of `assert`, so the gate still runs under `python -O`.
    if report["failed"] != 0:
        raise AssertionError(report["mismatches"])

    loader = ModuleLoader()
    loader.load(so_path, abi)
    try:
        n = 4096
        a = np.random.rand(n).astype(np.float32)
        b = np.random.rand(n).astype(np.float32)
        out = np.empty(n, dtype=np.float32)

        loader.call(
            "vec_add_f32",
            a.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            b.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            n,
        )
        np.testing.assert_allclose(out, a + b, rtol=1e-6)
    finally:
        # Release the shared object even when the call or the comparison fails.
        loader.unload()
    print("All checks passed.")

# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Summary

You now know how to write a C kernel, vectorize it with NEON, build a .so, describe its ABI with ABIContract, load it through ModuleLoader, and validate it with CompilerBridge. In the next article we cover the plugin architecture that lets you wire custom kernels into live pipelines.