[we]blog

From Tokens to AST

Last time I wrote a scanner for ABNF that turned grammar source into a flat stream of tokens. That post ended on a cliffhanger: next step, build a parser on top of these tokens. Here it is.

The parser turns a []Token into a tree, a list of Rules, each pointing at a Node that describes the rule’s body. And when the input is malformed, it tells you where and why, with a caret pointing at the offending byte.

All in roughly 250 lines across three files.

The AST

Before the parser, the shape of what it produces. A Rule is just a name plus a body:

/// A named grammar rule: `rulename = elements`.
pub const Rule = struct {
    name: []const u8,
    node: Node,
};
Ast.zig:7-11

The body is a tagged union with seven variants, one per ABNF construct:

/// A single node in the ABNF syntax tree.
pub const Node = union(enum) {
    /// `a / b / c` — one of the alternatives.
    alternation: []const Node,
    /// `a b c` — all elements in sequence.
    concatenation: []const Node,
    /// `[min]*[max] element` — bounded repetition.
    repetition: Repetition,
    /// `"quoted string"` — case-insensitive literal.
    char_val: []const u8,
    /// `%x41`, `%x41-5A`, or `%x41.42.43`.
    num_val: NumVal,
    /// `<prose description>` — free-form text.
    prose_val: []const u8,
    /// Reference to another rule by name.
    rulename: []const u8,
};
Ast.zig:13-29

Repetition wraps another node and gets its own struct since it carries bounds:

/// Repetition operator: `*element`, `3*5element`, `1*element`, etc.
pub const Repetition = struct {
    /// Minimum number of occurrences (default 0).
    min: usize,
    /// Maximum number of occurrences, or null for unbounded.
    max: ?usize,
    /// The repeated element.
    element: *const Node,
};
Ast.zig:31-39

Numeric values (ABNF’s %x41, %x41-5A, %x41.42.43) come in three flavors, also a tagged union:

/// A numeric value literal in one of three forms.
pub const NumVal = union(enum) {
    /// Single value, e.g. `%x41`.
    single: u8,
    /// Inclusive range, e.g. `%x41-5A`.
    range: struct { lo: u8, hi: u8 },
    /// Concatenated values, e.g. `%x41.42.43`.
    concat: []const u8,
};
Ast.zig:41-49

That’s the entire AST. 49 lines.

The Parser

The parser is a single struct holding everything needed for one pass over the tokens:

const std = @import("std");
const Token = @import("Token.zig");
const Ast = @import("Ast.zig");
const Diagnostic = @import("Diagnostic.zig");

const Parser = @This();

pub const ParseError = error{ SyntaxError, OutOfMemory, Overflow, InvalidCharacter };

tokens: []const Token,
source: []const u8,
allocator: std.mem.Allocator,
pos: usize,
diagnostic: ?Diagnostic = null,
Parser.zig:1-14

tokens is the slice produced by the scanner. source is the original text; we need it to recover the spelling of each lexeme. pos is the cursor into the token stream. The diagnostic field stays null until something goes wrong; we’ll come back to it.

init is unsurprising:

pub fn init(allocator: std.mem.Allocator, tokens: []const Token, source: []const u8) Parser {
    return .{
        .tokens = tokens,
        .source = source,
        .allocator = allocator,
        .pos = 0,
    };
}
Parser.zig:16-23

Recursive Descent

ABNF’s grammar lines up neatly with the precedence ladder of a recursive-descent parser. Each grammar production becomes a function; lower-precedence productions call higher-precedence ones:

parse           = *rule
rule            = rulename "=" alternation
alternation     = concatenation *("/" concatenation)
concatenation   = repetition *(repetition)
repetition      = [repeat] element
element         = rulename / group / option / char-val / num-val / prose-val

Top-down. Let’s walk it.

The Rule List

parse drives the whole thing. It reads rules until EOF, with one twist for incremental alternation¹ (=/):

/// Parse all rules from the token stream.
pub fn parse(self: *Parser) ![]const Ast.Rule {
    var rules: std.ArrayList(Ast.Rule) = .empty;

    self.skipTrivia();
    while (self.peek().tag != .eof) {
        const rule = try self.parseRule();

        // Handle incremental alternation (=/): merge with existing rule.
        var merged = false;
        for (rules.items) |*existing| {
            if (std.mem.eql(u8, existing.name, rule.name)) {
                existing.node = try self.mergeAlternation(existing.node, rule.node);
                merged = true;
                break;
            }
        }
        if (!merged) {
            try rules.append(self.allocator, rule);
        }

        self.skipTrivia();
    }

    return try rules.toOwnedSlice(self.allocator);
}
Parser.zig:25-50

skipTrivia consumes comments and newlines so the individual parse functions don’t have to. When we see a redefinition of a rule name, mergeAlternation flattens both sides into one alternation node:

fn mergeAlternation(self: *Parser, a: Ast.Node, b: Ast.Node) !Ast.Node {
    var alts: std.ArrayList(Ast.Node) = .empty;

    switch (a) {
        .alternation => |items| for (items) |item| try alts.append(self.allocator, item),
        else => try alts.append(self.allocator, a),
    }
    switch (b) {
        .alternation => |items| for (items) |item| try alts.append(self.allocator, item),
        else => try alts.append(self.allocator, b),
    }

    return .{ .alternation = try alts.toOwnedSlice(self.allocator) };
}
Parser.zig:52-65

Rule, Alternation, Concatenation

A rule is rulename "=" body. With skipTrivia doing the heavy lifting, this is almost trivial:

/// rule = rulename ("=" / "=/") alternation
fn parseRule(self: *Parser) !Ast.Rule {
    const name = self.advance().lexeme(self.source);
    self.skipTrivia();
    if (self.peek().tag == .equals or self.peek().tag == .equals_slash) _ = self.advance();
    self.skipTrivia();
    return .{ .name = name, .node = try self.parseAlternation() };
}
Parser.zig:69-76

The or accepts both = and =/. The parsing machinery is the same; parse already handles the merging.

Alternation is the lowest precedence: parse a concatenation, and if you see a /, parse another, and so on:

/// alternation = concatenation *("/" concatenation)
fn parseAlternation(self: *Parser) !Ast.Node {
    var alts: std.ArrayList(Ast.Node) = .empty;
    try alts.append(self.allocator, try self.parseConcatenation());

    while (true) {
        self.skipTrivia();
        if (self.peek().tag != .slash) break;
        _ = self.advance();
        self.skipTrivia();
        try alts.append(self.allocator, try self.parseConcatenation());
    }

    if (alts.items.len == 1) {
        const single = alts.items[0];
        alts.deinit(self.allocator);
        return single;
    }
    return .{ .alternation = try alts.toOwnedSlice(self.allocator) };
}
Parser.zig:78-97

A small but worthwhile flourish: if there’s only one alternative, return it directly instead of wrapping it in a single-element alternation node. Same for concatenation below. This keeps the AST clean: foo = a produces a bare rulename node, not alternation([rulename]).

Concatenation has no separator; elements just sit next to each other. So we keep parsing repetitions as long as we’re “at” something that could start one:

/// concatenation = repetition *(repetition)
fn parseConcatenation(self: *Parser) !Ast.Node {
    var elems: std.ArrayList(Ast.Node) = .empty;

    while (self.isAtRepetition()) {
        try elems.append(self.allocator, try self.parseRepetition());
        self.skipTrivia();
    }

    if (elems.items.len == 1) {
        const single = elems.items[0];
        elems.deinit(self.allocator);
        return single;
    }
    return .{ .concatenation = try elems.toOwnedSlice(self.allocator) };
}
Parser.zig:99-114

The interesting bit is isAtRepetition, the lookahead that decides when concatenation has run out of elements. It works by FIRST set²: the set of token tags that can begin a repetition.

/// Can the current position start a repetition/element?
fn isAtRepetition(self: *Parser) bool {
    return switch (self.peek().tag) {
        .star,
        .number,
        .left_paren,
        .left_bracket,
        .char_val,
        .bin_val,
        .dec_val,
        .hex_val,
        .prose_val,
        => true,
        // A rulename followed by = or =/ starts a new rule, not an element.
        .rulename => {
            const next = self.peekNextMeaningful();
            return next != .equals and next != .equals_slash;
        },
        else => false,
    };
}
Parser.zig:290-310

The interesting case is .rulename. A bare rulename can start an element: a rule body is allowed to reference another rule. But a rulename followed by = or =/ is the start of the next rule, not part of the current one. Without that disambiguation, parseConcatenation would happily eat the next rule’s name as part of the current rule’s body.

peekNextMeaningful does the lookahead, skipping comments and newlines:

/// Next non-trivia token tag after the current position.
fn peekNextMeaningful(self: *Parser) Token.Tag {
    var i = self.pos + 1;
    while (i < self.tokens.len) : (i += 1) {
        const tag = self.tokens[i].tag;
        if (tag != .comment and tag != .newline) return tag;
    }
    return .eof;
}
Parser.zig:312-320

This is the parser’s only “infinite” lookahead, but in practice it’s bounded by the rule separator, and rules are short.

Repetition

Repetition is [repeat] element. The repeat prefix has four forms: N, N*, N*M, and *M (and bare * for unbounded). They all start with either a number or a *:

/// repetition = [repeat] element
/// repeat     = 1*DIGIT / (*DIGIT "*" *DIGIT)
fn parseRepetition(self: *Parser) !Ast.Node {
    var min: usize = 0;
    var max: ?usize = null;
    var has_repeat = false;

    switch (self.peek().tag) {
        .number => {
            if (self.peekAt(1).tag == .star) {
                // number "*" [number]  — e.g. 3*5
                min = try self.parseNumber();
                _ = self.advance(); // consume *
                if (self.peek().tag == .number) max = try self.parseNumber();
            } else {
                // bare number — exactly N
                const n = try self.parseNumber();
                min = n;
                max = n;
            }
            has_repeat = true;
        },
        .star => {
            // "*" [number]  — e.g. *5
            _ = self.advance();
            if (self.peek().tag == .number) max = try self.parseNumber();
            has_repeat = true;
        },
        else => {},
    }

    const element = try self.parseElement();
    if (!has_repeat) return element;

    const ptr = try self.allocator.create(Ast.Node);
    ptr.* = element;
    return .{ .repetition = .{ .min = min, .max = max, .element = ptr } };
}
Parser.zig:116-153

This is the only place the parser needs two-token lookahead. peekAt(1) checks whether a leading number is part of an N*M repeat or just a bare element count. After the repeat is parsed (or skipped), we consume one element and either return it bare or wrap it.

Elements, Groups, Options

The element rule is the leaf of the grammar: one token decides the shape.

/// element = rulename / group / option / char-val / num-val / prose-val
fn parseElement(self: *Parser) ParseError!Ast.Node {
    return switch (self.peek().tag) {
        .rulename => .{ .rulename = self.advance().lexeme(self.source) },
        .char_val => {
            const lex = self.advance().lexeme(self.source);
            return .{ .char_val = lex[1 .. lex.len - 1] };
        },
        .bin_val, .dec_val, .hex_val => .{ .num_val = try self.parseNumVal() },
        .prose_val => {
            const lex = self.advance().lexeme(self.source);
            return .{ .prose_val = lex[1 .. lex.len - 1] };
        },
        .left_paren => try self.parseGroup(),
        .left_bracket => try self.parseOption(),
        else => {
            self.fail(.element, self.peek());
            return error.SyntaxError;
        },
    };
}
Parser.zig:155-175

The else branch is where errors are born. We’ll get to that in a moment.

( ... ) and [ ... ] both contain an alternation; the difference is what they wrap it in. A group is transparent; it just disambiguates precedence:

/// group = "(" alternation ")"
fn parseGroup(self: *Parser) !Ast.Node {
    const open = self.advance();
    if (open.tag != .left_paren) {
        self.fail(.left_paren, open);
        return error.SyntaxError;
    }
    self.skipTrivia();
    const node = try self.parseAlternation();
    self.skipTrivia();
    const close = self.advance();
    if (close.tag != .right_paren) {
        self.fail(.right_paren, close);
        return error.SyntaxError;
    }
    return node;
}
Parser.zig:177-193

An option desugars to a *1( alternation ) repetition. There’s no separate option node in the AST: [ x ] and *1 x mean the same thing, so we collapse them at parse time.

/// option = "[" alternation "]"  →  *1( alternation )
fn parseOption(self: *Parser) !Ast.Node {
    const open = self.advance();
    if (open.tag != .left_bracket) {
        self.fail(.left_bracket, open);
        return error.SyntaxError;
    }
    self.skipTrivia();
    const inner = try self.parseAlternation();
    self.skipTrivia();
    const close = self.advance();
    if (close.tag != .right_bracket) {
        self.fail(.right_bracket, close);
        return error.SyntaxError;
    }

    const ptr = try self.allocator.create(Ast.Node);
    ptr.* = inner;
    return .{ .repetition = .{ .min = 0, .max = 1, .element = ptr } };
}
Parser.zig:195-214

Fewer node types, simpler downstream consumers. The grammar’s choice of two surface syntaxes for the same semantics is a spelling difference, not a structural one. The AST should reflect structure.

Numeric Values

parseNumVal consumes a single token like %x41-5A or %x41.42.43 and decomposes the lexeme. The base is b/d/x after the %; the digits follow:

fn parseNumVal(self: *Parser) !Ast.NumVal {
    const lex = self.advance().lexeme(self.source);
    const base: u8 = switch (lex[1]) {
        'b' => 2,
        'd' => 10,
        'x' => 16,
        else => unreachable,
    };
    const digits = lex[2..];

    if (std.mem.indexOfScalar(u8, digits, '-')) |dash| {
        return .{ .range = .{
            .lo = try std.fmt.parseInt(u8, digits[0..dash], base),
            .hi = try std.fmt.parseInt(u8, digits[dash + 1 ..], base),
        } };
    }

    if (std.mem.indexOfScalar(u8, digits, '.')) |_| {
        var vals: std.ArrayList(u8) = .empty;
        var iter = std.mem.splitScalar(u8, digits, '.');
        while (iter.next()) |part| {
            try vals.append(self.allocator, try std.fmt.parseInt(u8, part, base));
        }
        return .{ .concat = try vals.toOwnedSlice(self.allocator) };
    }

    return .{ .single = try std.fmt.parseInt(u8, digits, base) };
}
Parser.zig:218-245

Range first, then concat, then a single value as the fallthrough; mutually exclusive forms enforced by the scanner.

Errors That Point At The Problem

So far the parser is happy on well-formed input. On malformed input it would just return error.UnexpectedToken and call it a day, which tells you something is wrong, but not where, and not what was expected.

A real diagnostic is a small struct:

const Diagnostic = @This();

/// What the parser expected at this position.
expected: Expected,
/// The token that was actually found.
found_tag: Token.Tag,
/// Byte offset into source where the unexpected token starts.
found_start: usize,
/// Length of the found token's lexeme.
found_len: usize,
/// Line number (1-based).
line: usize,
Diagnostic.zig:4-15

expected is one of a handful of “things the parser was looking for”:

pub const Expected = enum {
    element,
    left_paren,
    right_paren,
    left_bracket,
    right_bracket,
};
Diagnostic.zig:17-23

Five categories cover every spot in the parser where input can go wrong. Specific enough to be useful, coarse enough to stay manageable.

Recording

The parser records a diagnostic before failing. A tiny helper:

fn fail(self: *Parser, expected: Diagnostic.Expected, token: Token) void {
    self.diagnostic = .{
        .expected = expected,
        .found_tag = token.tag,
        .found_start = token.start,
        .found_len = token.len,
        .line = token.line,
    };
}
Parser.zig:249-257

And the call sites use it instead of throwing a bare error. From parseElement:

        else => {
            self.fail(.element, self.peek());
            return error.SyntaxError;
        },
Parser.zig:170-173

Same shape in parseGroup and parseOption for unmatched parens and brackets. The parser’s control flow is unchanged; we just decorate the failure path with context before unwinding.

Rendering

Recording the diagnostic is half the job; printing something readable is the other half. Two helpers locate the offending line:

/// Compute the 1-based column number by scanning backward from `found_start`
/// to find the beginning of the line.
pub fn column(self: Diagnostic, source: []const u8) usize {
    var line_start: usize = self.found_start;
    while (line_start > 0 and source[line_start - 1] != '\n') {
        line_start -= 1;
    }
    return self.found_start - line_start + 1;
}
Diagnostic.zig:25-33

column walks backward to the previous \n; byte offsets within the line are 1-based. sourceLine does the same and then walks forward to extract the line itself:

/// Extract the full source line containing the error.
pub fn sourceLine(self: Diagnostic, source: []const u8) []const u8 {
    var line_start: usize = self.found_start;
    while (line_start > 0 and source[line_start - 1] != '\n') {
        line_start -= 1;
    }
    var line_end: usize = self.found_start;
    while (line_end < source.len and source[line_end] != '\n' and source[line_end] != '\r') {
        line_end += 1;
    }
    return source[line_start..line_end];
}
Diagnostic.zig:35-46

Then the format function assembles a Rust/Clang-style message with a caret:

/// Format the full diagnostic message to a writer.
pub fn format(self: Diagnostic, source: []const u8, filename: []const u8, writer: anytype) !void {
    const col = self.column(source);
    const line = self.sourceLine(source);
    const found_lexeme = if (self.found_len > 0)
        source[self.found_start .. self.found_start + self.found_len]
    else
        "eof";

    try writer.print("{s}:{d}:{d}: error: expected {s}, found '{s}'\n", .{
        filename,
        self.line,
        col,
        @tagName(self.expected),
        found_lexeme,
    });
    try writer.print("   {s}\n", .{line});
    for (0..col - 1 + 3) |_| try writer.writeByte(' ');
    try writer.print("^\n", .{});
}
Diagnostic.zig:48-67

Note found_lexeme: if the offending token has zero length (i.e. EOF), we substitute the literal string "eof" so the message reads naturally instead of pointing at an empty quote.

In Action

The CLI handles the new error variant explicitly: print the diagnostic to stderr, exit with status 1:

    var parser = zpars.Parser.init(aa, tokens, source);
    const rules = parser.parse() catch |err| switch (err) {
        error.SyntaxError => {
            if (parser.diagnostic) |diag| {
                var stderr_buffer: [4096]u8 = undefined;
                var stderr_writer = std.fs.File.stderr().writer(&stderr_buffer);
                const stderr = &stderr_writer.interface;
                diag.format(source, filename, stderr) catch {};
                stderr.flush() catch {};
            }
            std.process.exit(1);
        },
        else => return err,
    };
main.zig:28-41

Running it on foo = (a / ) produces:

foo.abnf:1:12: error: expected element, found ')'
   foo = (a / )
              ^

Which is the kind of error message you actually want.

Takeaways

Writing a Simple Scanner in Zig
All the Errors at Once