const std = @import("std");
const expect = std.testing.expect;

/// Container that `tokenizeCMacro` appends tokens to.
pub const TokenList = std.SegmentedList(CToken, 32);

/// One token produced while scanning the text of a C macro.
pub const CToken = struct {
    /// Kind of token.
    id: Id,
    /// Text of the token. For identifiers and numeric literals this is the
    /// matching slice of the input, with any numeric suffix excluded. For
    /// string and character literals it includes the surrounding quotes.
    /// Punctuation, `Fn` and `Eof` tokens carry an empty slice.
    bytes: []const u8,
    /// Suffix that followed a numeric literal, `.None` otherwise.
    num_lit_suffix: NumLitSuffix = .None,

    pub const Id = enum {
        CharLit,
        StrLit,
        NumLitInt,
        NumLitFloat,
        Identifier,
        Minus,
        Slash,
        LParen,
        RParen,
        Eof,
        Dot,
        Asterisk,
        Bang,
        Tilde,
        Shl,
        Lt,
        Comma,
        Fn,
    };

    pub const NumLitSuffix = enum {
        None,
        F,
        L,
        U,
        LU,
        LL,
        LLU,
    };
};

/// Tokenizes the null-terminated macro text `chars` and appends every token,
/// including the final `Eof`, to `tl`.
///
/// If the first token is immediately followed by `(` (no whitespace in
/// between), an extra `Fn` token is inserted right after it so that callers
/// can tell `NAME(ARGS)` apart from `NAME (EXPR)`.
///
/// String and character literals containing a backslash are rewritten by
/// `zigifyEscapeSequences`; the rewritten bytes are allocated from
/// `tl.allocator` and are not freed by this function.
///
/// Returns `error.TokenizingFailed` on malformed input and propagates
/// allocation failures.
pub fn tokenizeCMacro(tl: *TokenList, chars: [*:0]const u8) !void {
    var index: usize = 0;
    var first = true;
    while (true) {
        const tok = try next(chars, &index);
        if (tok.id == .StrLit or tok.id == .CharLit)
            try tl.push(try zigifyEscapeSequences(tl.allocator, tok))
        else
            try tl.push(tok);
        if (tok.id == .Eof)
            return;
        if (first) {
            // distinguish NAME (EXPR) from NAME(ARGS)
            first = false;
            if (chars[index] == '(') {
                try tl.push(.{
                    .id = .Fn,
                    .bytes = "",
                });
            }
        }
    }
}

/// Rewrites the C escape sequences in a string/char literal token into a
/// spelling Zig accepts.
///
/// Tokens without any backslash are returned unchanged (no allocation).
/// Otherwise a new buffer is allocated from `allocator` and the returned
/// token's `bytes` is a prefix of that buffer.
///
/// `\a`, `\b`, `\f` and `\v` become `\x07`, `\x08`, `\x0C` and `\x0B`; `\?`
/// becomes a plain `?`; `\n \r \t \\ \' \" \x` are copied verbatim. Unicode
/// (`\u`, `\U`), octal and unknown escapes yield `error.TokenizingFailed`.
fn zigifyEscapeSequences(allocator: *std.mem.Allocator, tok: CToken) !CToken {
    for (tok.bytes) |c| {
        if (c == '\\') {
            break;
        }
    } else return tok;

    // The largest expansion is a 2-byte escape such as `\a` turning into the
    // 4-byte `\x07`, so twice the input length is always enough.
    var bytes = try allocator.alloc(u8, tok.bytes.len * 2);
    // Do not leak the buffer when an unsupported escape aborts the rewrite.
    errdefer allocator.free(bytes);

    var escape = false;
    var i: usize = 0;
    for (tok.bytes) |c| {
        if (escape) {
            // The backslash itself has already been copied to bytes[i - 1].
            switch (c) {
                'n', 'r', 't', '\\', '\'', '\"', 'x' => {
                    bytes[i] = c;
                },
                'a' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = '7';
                },
                'b' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = '8';
                },
                'f' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = 'C';
                },
                'v' => {
                    bytes[i] = 'x';
                    i += 1;
                    bytes[i] = '0';
                    i += 1;
                    bytes[i] = 'B';
                },
                '?' => {
                    // `\?` is just `?`: overwrite the copied backslash.
                    i -= 1;
                    bytes[i] = '?';
                },
                'u', 'U' => {
                    // TODO unicode escape sequences
                    return error.TokenizingFailed;
                },
                '0'...'7' => {
                    // TODO octal escape sequences
                    return error.TokenizingFailed;
                },
                else => {
                    // unknown escape sequence
                    return error.TokenizingFailed;
                },
            }
            i += 1;
            escape = false;
        } else {
            if (c == '\\') {
                escape = true;
            }
            bytes[i] = c;
            i += 1;
        }
    }
    return CToken{
        .id = tok.id,
        .bytes = bytes[0..i],
    };
}

/// Scans one token from `chars` starting at `i.*` and advances `i.*` past it.
///
/// Leading blanks, `/* ... */` comments and backslash-newline continuations
/// are skipped. A newline, a `//` comment or the terminating 0 byte produce
/// an `Eof` token. Returns `error.TokenizingFailed` for characters that
/// cannot start a token, for malformed numeric literals and when the input
/// ends inside a literal, a comment or a continuation.
fn next(chars: [*:0]const u8, i: *usize) !CToken {
    var state: enum {
        Start,
        GotLt,
        CharLit,
        CharLitEscape,
        OpenComment,
        Comment,
        CommentStar,
        Backslash,
        String,
        StringEscape,
        Identifier,
        Decimal,
        Octal,
        GotZero,
        Hex,
        Bin,
        Float,
        ExpSign,
        FloatExp,
        FloatExpFirst,
        NumLitIntSuffixU,
        NumLitIntSuffixL,
        NumLitIntSuffixLL,
        NumLitIntSuffixUL,
    } = .Start;

    var result = CToken{
        .bytes = "",
        .id = .Eof,
    };
    var begin_index: usize = 0;

    while (true) {
        const c = chars[i.*];
        if (c == 0) {
            // End of input: finish whatever token is in progress. `i` is left
            // pointing at the terminator.
            switch (state) {
                .Start => {
                    return result;
                },
                .Identifier,
                .Decimal,
                .Hex,
                .Bin,
                .Octal,
                .GotZero,
                .Float,
                .FloatExp,
                => {
                    result.bytes = chars[begin_index..i.*];
                    return result;
                },
                .NumLitIntSuffixU,
                .NumLitIntSuffixL,
                .NumLitIntSuffixUL,
                .NumLitIntSuffixLL,
                .GotLt,
                => {
                    // `bytes` (if any) was already set when the state was entered.
                    return result;
                },
                .CharLit,
                .CharLitEscape,
                .OpenComment,
                .Comment,
                .CommentStar,
                .Backslash,
                .String,
                .StringEscape,
                .ExpSign,
                .FloatExpFirst,
                => return error.TokenizingFailed,
            }
        }
        // Consume `c`. States that only needed it as lookahead undo this
        // with `i.* -= 1` before returning.
        i.* += 1;
        switch (state) {
            .Start => {
                switch (c) {
                    ' ', '\t', '\x0B', '\x0C' => {},
                    '\'' => {
                        state = .CharLit;
                        result.id = .CharLit;
                        begin_index = i.* - 1;
                    },
                    '\"' => {
                        state = .String;
                        result.id = .StrLit;
                        begin_index = i.* - 1;
                    },
                    '/' => {
                        state = .OpenComment;
                    },
                    '\\' => {
                        state = .Backslash;
                    },
                    '\n', '\r' => {
                        // End of the macro line: report Eof.
                        return result;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .Identifier;
                        result.id = .Identifier;
                        begin_index = i.* - 1;
                    },
                    '1'...'9' => {
                        state = .Decimal;
                        result.id = .NumLitInt;
                        begin_index = i.* - 1;
                    },
                    '0' => {
                        state = .GotZero;
                        result.id = .NumLitInt;
                        begin_index = i.* - 1;
                    },
                    '.' => {
                        result.id = .Dot;
                        return result;
                    },
                    '<' => {
                        result.id = .Lt;
                        state = .GotLt;
                    },
                    '(' => {
                        result.id = .LParen;
                        return result;
                    },
                    ')' => {
                        result.id = .RParen;
                        return result;
                    },
                    '*' => {
                        result.id = .Asterisk;
                        return result;
                    },
                    '-' => {
                        result.id = .Minus;
                        return result;
                    },
                    '!' => {
                        result.id = .Bang;
                        return result;
                    },
                    '~' => {
                        result.id = .Tilde;
                        return result;
                    },
                    ',' => {
                        result.id = .Comma;
                        return result;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .GotLt => {
                switch (c) {
                    '<' => {
                        result.id = .Shl;
                        return result;
                    },
                    else => {
                        // Plain `<`: `c` belongs to the next token.
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .Float => {
                switch (c) {
                    '.', '0'...'9' => {},
                    'e', 'E' => {
                        state = .ExpSign;
                    },
                    'f', 'F' => {
                        // The suffix is consumed but kept out of `bytes`.
                        result.num_lit_suffix = .F;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    'l', 'L' => {
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .ExpSign => {
                switch (c) {
                    '+', '-' => {
                        state = .FloatExpFirst;
                    },
                    '0'...'9' => {
                        state = .FloatExp;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .FloatExpFirst => {
                switch (c) {
                    '0'...'9' => {
                        state = .FloatExp;
                    },
                    else => return error.TokenizingFailed,
                }
            },
            .FloatExp => {
                switch (c) {
                    '0'...'9' => {},
                    'f', 'F' => {
                        result.num_lit_suffix = .F;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    'l', 'L' => {
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Decimal => {
                switch (c) {
                    '0'...'9' => {},
                    '\'' => {},
                    'u', 'U' => {
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    '.' => {
                        result.id = .NumLitFloat;
                        state = .Float;
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .GotZero => {
                switch (c) {
                    'x', 'X' => {
                        state = .Hex;
                    },
                    'b', 'B' => {
                        state = .Bin;
                    },
                    '.' => {
                        state = .Float;
                        result.id = .NumLitFloat;
                    },
                    'u', 'U' => {
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        // Re-examine `c` as part of an octal literal.
                        i.* -= 1;
                        state = .Octal;
                    },
                }
            },
            .Octal => {
                switch (c) {
                    '0'...'7' => {},
                    '8', '9' => return error.TokenizingFailed,
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Hex => {
                switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    'u', 'U' => {
                        // marks the number literal as unsigned
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        // marks the number literal as long
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .Bin => {
                switch (c) {
                    '0'...'1' => {},
                    '2'...'9' => return error.TokenizingFailed,
                    'u', 'U' => {
                        // marks the number literal as unsigned
                        state = .NumLitIntSuffixU;
                        result.num_lit_suffix = .U;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    'l', 'L' => {
                        // marks the number literal as long
                        state = .NumLitIntSuffixL;
                        result.num_lit_suffix = .L;
                        result.bytes = chars[begin_index .. i.* - 1];
                    },
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .NumLitIntSuffixU => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LU;
                        state = .NumLitIntSuffixUL;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixL => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LL;
                        state = .NumLitIntSuffixLL;
                    },
                    'u', 'U' => {
                        result.num_lit_suffix = .LU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixLL => {
                switch (c) {
                    'u', 'U' => {
                        result.num_lit_suffix = .LLU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .NumLitIntSuffixUL => {
                switch (c) {
                    'l', 'L' => {
                        result.num_lit_suffix = .LLU;
                        return result;
                    },
                    else => {
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .Identifier => {
                switch (c) {
                    '_', 'a'...'z', 'A'...'Z', '0'...'9' => {},
                    else => {
                        i.* -= 1;
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                }
            },
            .String => {
                switch (c) {
                    '\\' => {
                        // The next character is escaped and must not end the literal.
                        state = .StringEscape;
                    },
                    '\"' => {
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                    else => {},
                }
            },
            .StringEscape => {
                state = .String;
            },
            .CharLit => {
                switch (c) {
                    '\\' => {
                        // The next character is escaped and must not end the literal.
                        state = .CharLitEscape;
                    },
                    '\'' => {
                        result.bytes = chars[begin_index..i.*];
                        return result;
                    },
                    else => {},
                }
            },
            .CharLitEscape => {
                state = .CharLit;
            },
            .OpenComment => {
                switch (c) {
                    '/' => {
                        // Line comment: the rest of the line is ignored; report Eof.
                        return result;
                    },
                    '*' => {
                        state = .Comment;
                    },
                    else => {
                        // Division operator: `c` belongs to the next token.
                        result.id = .Slash;
                        i.* -= 1;
                        return result;
                    },
                }
            },
            .Comment => {
                switch (c) {
                    '*' => {
                        state = .CommentStar;
                    },
                    else => {},
                }
            },
            .CommentStar => {
                switch (c) {
                    '/' => {
                        state = .Start;
                    },
                    // A run of stars may still be closed by the next `/`.
                    '*' => {},
                    else => {
                        state = .Comment;
                    },
                }
            },
            .Backslash => {
                switch (c) {
                    ' ', '\t', '\x0B', '\x0C' => {},
                    '\n', '\r' => {
                        state = .Start;
                    },
                    else => return error.TokenizingFailed,
                }
            },
        }
    }
    unreachable;
}

test "tokenize macro" {
    var tl = TokenList.init(std.heap.page_allocator);
    defer tl.deinit();

    const src = "TEST(0\n";
    try tokenizeCMacro(&tl, src);
    var it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(it.next().?.id == .Fn);
    expect(it.next().?.id == .LParen);
    expect(std.mem.eql(u8, it.next().?.bytes, "0"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    const src2 = "__FLT_MIN_10_EXP__ -37\n";
    try tokenizeCMacro(&tl, src2);
    it = tl.iterator(0);
    expect(std.mem.eql(u8, it.next().?.bytes, "__FLT_MIN_10_EXP__"));
    expect(it.next().?.id == .Minus);
    expect(std.mem.eql(u8, it.next().?.bytes, "37"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    const src3 = "__llvm__ 1\n#define";
    try tokenizeCMacro(&tl, src3);
    it = tl.iterator(0);
    expect(std.mem.eql(u8, it.next().?.bytes, "__llvm__"));
    expect(std.mem.eql(u8, it.next().?.bytes, "1"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    const src4 = "TEST 2";
    try tokenizeCMacro(&tl, src4);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(std.mem.eql(u8, it.next().?.bytes, "2"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    const src5 = "FOO 0l";
    try tokenizeCMacro(&tl, src5);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(std.mem.eql(u8, it.next().?.bytes, "0"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    // A float suffix is consumed together with the literal.
    const src6 = "FOO 1.5f";
    try tokenizeCMacro(&tl, src6);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    const float_tok = it.next().?;
    expect(float_tok.id == .NumLitFloat);
    expect(float_tok.num_lit_suffix == .F);
    expect(std.mem.eql(u8, float_tok.bytes, "1.5"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    // The operand following `/` must not be dropped.
    const src7 = "FOO 4/2";
    try tokenizeCMacro(&tl, src7);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(std.mem.eql(u8, it.next().?.bytes, "4"));
    expect(it.next().?.id == .Slash);
    expect(std.mem.eql(u8, it.next().?.bytes, "2"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    // The operand following `<` must not be dropped; `<<` is still one token.
    const src8 = "FOO 1<2<<3";
    try tokenizeCMacro(&tl, src8);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(std.mem.eql(u8, it.next().?.bytes, "1"));
    expect(it.next().?.id == .Lt);
    expect(std.mem.eql(u8, it.next().?.bytes, "2"));
    expect(it.next().?.id == .Shl);
    expect(std.mem.eql(u8, it.next().?.bytes, "3"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    // A block comment closed by a run of stars is skipped.
    const src9 = "FOO /***/ 7";
    try tokenizeCMacro(&tl, src9);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    expect(std.mem.eql(u8, it.next().?.bytes, "7"));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);

    // An escaped quote does not terminate a string literal.
    const src10 = "FOO \"a\\\"b\"";
    try tokenizeCMacro(&tl, src10);
    it = tl.iterator(0);
    expect(it.next().?.id == .Identifier);
    const str_tok = it.next().?;
    expect(str_tok.id == .StrLit);
    expect(std.mem.eql(u8, str_tok.bytes, "\"a\\\"b\""));
    expect(it.next().?.id == .Eof);
    expect(it.next() == null);
    tl.shrink(0);
}