From f36b8fd7b2c89d46dba95eca05b60487638dd2a0 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Wed, 11 Sep 2019 11:46:51 +0200 Subject: [PATCH] Recognize & skip the UTF-8 BOM --- src/tokenizer.cpp | 9 +++++++-- std/zig/tokenizer.zig | 11 ++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 465f65228..71a24fe72 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -407,9 +407,14 @@ void tokenize(Buf *buf, Tokenization *out) { t.buf = buf; out->line_offsets = allocate<ZigList<size_t>>(1); - out->line_offsets->append(0); - for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) { + + // Skip the UTF-8 BOM if present + if (buf_starts_with_mem(buf, "\xEF\xBB\xBF", 3)) { + t.pos += 3; + } + + for (; t.pos < buf_len(t.buf); t.pos += 1) { uint8_t c = buf_ptr(t.buf)[t.pos]; switch (t.state) { case TokenizeStateError: diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 19fb23356..f25da12a9 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -222,9 +222,11 @@ pub const Tokenizer = struct { }, }; } else { + // Skip the UTF-8 BOM if present + const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else usize(0); return Tokenizer{ .buffer = buffer, - .index = 0, + .index = src_start, .pending_invalid_token = null, }; } @@ -1455,6 +1457,13 @@ test "tokenizer - line comment followed by identifier" { }); } +test "tokenizer - UTF-8 BOM is recognized and skipped" { + testTokenize("\xEF\xBB\xBFa;\n", [_]Token.Id{ + Token.Id.Identifier, + Token.Id.Semicolon, + }); +} + fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void { var tokenizer = Tokenizer.init(source); for (expected_tokens) |expected_token_id| {