zig/src/c_tokenizer.cpp

814 lines
27 KiB
C++

/*
* Copyright (c) 2016 Andrew Kelley
*
* This file is part of zig, which is MIT licensed.
* See http://opensource.org/licenses/MIT
*/
#include "c_tokenizer.hpp"
#include <inttypes.h>
// Case-label lists for use inside a switch statement. Every macro leaves off
// the leading `case` and the trailing `:`, so a use site reads `case NAME:`.

// Space, tab, vertical tab and form feed ('\r' and '\n' are not included).
#define WHITESPACE_EXCEPT_N \
    ' ': case '\t': case '\v': case '\f'

// The decimal digits '1' through '9'.
#define DIGIT_NON_ZERO \
    '1': case '2': case '3': case '4': case '5': \
    case '6': case '7': case '8': case '9'

// The decimal digits '0' through '9'.
#define DIGIT \
    '0': case DIGIT_NON_ZERO

// The letters 'a'-'z' followed by 'A'-'Z'.
#define ALPHA \
    'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': \
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': \
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': \
    case 'v': case 'w': case 'x': case 'y': case 'z': \
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': \
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': \
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': \
    case 'V': case 'W': case 'X': case 'Y': case 'Z'

// Characters that may begin an identifier: a letter or an underscore.
#define IDENT_START \
    ALPHA: case '_'

// Characters that may continue an identifier: letter, underscore or digit.
#define IDENT \
    IDENT_START: case DIGIT

// Carriage return and line feed.
#define LINE_ENDING \
    '\r': case '\n'
// Opens a new token of kind `id`.
//
// A slot is appended to ctok->tokens, it becomes ctok->cur_tok, and its id is
// set. Kinds that carry a payload which must start from a known value get it
// reset here: string literals and symbols receive an empty Buf, integer
// literals start at zero with no suffix. No token may already be open.
static void begin_token(CTokenize *ctok, CTokId id) {
    assert(ctok->cur_tok == nullptr);

    ctok->tokens.add_one();
    auto *tok = &ctok->tokens.last();
    tok->id = id;
    ctok->cur_tok = tok;

    switch (id) {
        case CTokIdNumLitInt:
            tok->data.num_lit_int.x = 0;
            tok->data.num_lit_int.suffix = CNumLitSuffixNone;
            return;
        case CTokIdStrLit: {
            // Zero the Buf before the first resize so it starts out empty.
            Buf *text = &tok->data.str_lit;
            memset(text, 0, sizeof(Buf));
            buf_resize(text, 0);
            return;
        }
        case CTokIdSymbol: {
            Buf *name = &tok->data.symbol;
            memset(name, 0, sizeof(Buf));
            buf_resize(name, 0);
            return;
        }
        // Nothing to initialize for the remaining kinds.
        case CTokIdCharLit:
        case CTokIdNumLitFloat:
        case CTokIdMinus:
        case CTokIdLParen:
        case CTokIdRParen:
        case CTokIdEOF:
        case CTokIdDot:
        case CTokIdAsterisk:
        case CTokIdBang:
        case CTokIdTilde:
            return;
    }
}
// Closes the token under construction by clearing ctok->cur_tok.
// The token itself stays in ctok->tokens.
static void end_token(CTokenize *ctok) {
    ctok->cur_tok = nullptr;
}
// Records that tokenization failed by setting ctok->error.
static void mark_error(CTokenize *ctok) {
    ctok->error = true;
}
// Delivers one decoded character `c` to the open token and selects the state
// that follows it.
//
// A character literal stores the value and then expects its closing quote; a
// string literal appends the value and keeps reading string contents. Any
// other token kind is a programming error.
static void add_char(CTokenize *ctok, uint8_t c) {
    assert(ctok->cur_tok);

    switch (ctok->cur_tok->id) {
        case CTokIdCharLit:
            ctok->cur_tok->data.char_lit = c;
            ctok->state = CTokStateExpectEndQuot;
            return;
        case CTokIdStrLit:
            buf_append_char(&ctok->cur_tok->data.str_lit, c);
            ctok->state = CTokStateString;
            return;
        default:
            zig_unreachable();
    }
}
// Folds one hexadecimal digit into the integer literal being built.
//
// `value` is the digit's numeric value and is used directly as an index into a
// 16-entry digit table. The running value is shifted up by one hex place and
// the digit added; the digit's lowercase character is also appended to the
// scratch buffer ctok->buf.
static void hex_digit(CTokenize *ctok, uint8_t value) {
    static const uint8_t lowercase_digits[] = "0123456789abcdef";

    auto &accumulator = ctok->cur_tok->data.num_lit_int.x;
    // TODO @mul_with_overflow
    accumulator *= 16;
    // TODO @add_with_overflow
    accumulator += value;

    buf_append_char(&ctok->buf, lowercase_digits[value]);
}
// Finishes a floating-point literal: the text collected in ctok->buf is
// converted with strtod and stored in the open token, the token is closed and
// the tokenizer goes back to the start state.
static void end_float(CTokenize *ctok) {
    // TODO detect errors, overflow, and underflow
    ctok->cur_tok->data.num_lit_float = strtod(buf_ptr(&ctok->buf), nullptr);
    ctok->state = CTokStateStart;
    end_token(ctok);
}
// Tokenizes the replacement text of one C macro.
//
// Parameters:
//   ctok - tokenizer context. Its token list, state, error flag, current
//          token and scratch buffer are all reset at the start of the call.
//   c    - NUL-terminated input. Scanning stops at the terminator, at an
//          unescaped '\r' or '\n', or at the '\n' that ends a `//` comment.
//
// Result (reported through ctok):
//   On success ctok->error stays false and ctok->tokens holds the tokens
//   followed by a final CTokIdEOF token.
//   On failure ctok->error is set and the function returns immediately; no
//   EOF token is appended and a partially built token may remain in the list.
//
// The scanner is a character-driven state machine. A state that meets a
// character it does not own steps `c` back by one and uses `continue`, so the
// loop increment lands on the same character again and it is re-dispatched
// under the new state.
void tokenize_c_macro(CTokenize *ctok, const uint8_t *c) {
    ctok->tokens.resize(0);
    ctok->state = CTokStateStart;
    ctok->error = false;
    ctok->cur_tok = nullptr;

    buf_resize(&ctok->buf, 0);

    for (; *c; c += 1) {
        switch (ctok->state) {
            case CTokStateStart:
                switch (*c) {
                    case WHITESPACE_EXCEPT_N:
                        break;
                    case '\'':
                        ctok->state = CTokStateExpectChar;
                        begin_token(ctok, CTokIdCharLit);
                        break;
                    case '\"':
                        ctok->state = CTokStateString;
                        begin_token(ctok, CTokIdStrLit);
                        break;
                    case '/':
                        ctok->state = CTokStateOpenComment;
                        break;
                    case '\\':
                        ctok->state = CTokStateBackslash;
                        break;
                    case LINE_ENDING:
                        goto found_end_of_macro;
                    case IDENT_START:
                        ctok->state = CTokStateIdentifier;
                        begin_token(ctok, CTokIdSymbol);
                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
                        break;
                    case DIGIT_NON_ZERO:
                        ctok->state = CTokStateDecimal;
                        begin_token(ctok, CTokIdNumLitInt);
                        ctok->cur_tok->data.num_lit_int.x = *c - '0';
                        // ctok->buf mirrors the literal's text in case it
                        // turns out to be a float and must go through strtod.
                        buf_resize(&ctok->buf, 0);
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case '0':
                        ctok->state = CTokStateGotZero;
                        begin_token(ctok, CTokIdNumLitInt);
                        ctok->cur_tok->data.num_lit_int.x = 0;
                        buf_resize(&ctok->buf, 0);
                        buf_append_char(&ctok->buf, '0');
                        break;
                    case '.':
                        begin_token(ctok, CTokIdDot);
                        end_token(ctok);
                        break;
                    case '(':
                        begin_token(ctok, CTokIdLParen);
                        end_token(ctok);
                        break;
                    case ')':
                        begin_token(ctok, CTokIdRParen);
                        end_token(ctok);
                        break;
                    case '*':
                        begin_token(ctok, CTokIdAsterisk);
                        end_token(ctok);
                        break;
                    case '-':
                        begin_token(ctok, CTokIdMinus);
                        end_token(ctok);
                        break;
                    case '!':
                        begin_token(ctok, CTokIdBang);
                        end_token(ctok);
                        break;
                    case '~':
                        begin_token(ctok, CTokIdTilde);
                        end_token(ctok);
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateFloat:
                // Fractional part of a float; the '.' is already in ctok->buf.
                switch (*c) {
                    case '.':
                        break;
                    case 'e':
                    case 'E':
                        buf_append_char(&ctok->buf, 'e');
                        ctok->state = CTokStateExpSign;
                        break;
                    case 'f':
                    case 'F':
                    case 'l':
                    case 'L':
                        // The type suffix is consumed and ends the literal.
                        end_float(ctok);
                        break;
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        break;
                    default:
                        c -= 1;
                        end_float(ctok);
                        continue;
                }
                break;
            case CTokStateExpSign:
                // Directly after the exponent marker: a sign or a digit.
                switch (*c) {
                    case '+':
                    case '-':
                        ctok->state = CTokStateFloatExpFirst;
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case DIGIT:
                        ctok->state = CTokStateFloatExp;
                        buf_append_char(&ctok->buf, *c);
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateFloatExpFirst:
                // After an exponent sign at least one digit is required.
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        ctok->state = CTokStateFloatExp;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateFloatExp:
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case 'f':
                    case 'F':
                    case 'l':
                    case 'L':
                        end_float(ctok);
                        break;
                    default:
                        c -= 1;
                        end_float(ctok);
                        continue;
                }
                break;
            case CTokStateDecimal:
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        // TODO @mul_with_overflow
                        ctok->cur_tok->data.num_lit_int.x *= 10;
                        // TODO @add_with_overflow
                        ctok->cur_tok->data.num_lit_int.x += *c - '0';
                        break;
                    case '\'':
                        // Digit separator: skipped.
                        break;
                    case 'u':
                    case 'U':
                        ctok->state = CTokStateNumLitIntSuffixU;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixU;
                        break;
                    case 'l':
                    case 'L':
                        ctok->state = CTokStateNumLitIntSuffixL;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixL;
                        break;
                    case '.':
                        buf_append_char(&ctok->buf, '.');
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        ctok->state = CTokStateFloat;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateGotZero:
                switch (*c) {
                    case 'x':
                    case 'X':
                        // Keep the "0x" prefix in the scratch text so that a
                        // hex float (see the 'p' case of the hex state) reaches
                        // strtod in a form it parses as hexadecimal.
                        buf_append_char(&ctok->buf, 'x');
                        ctok->state = CTokStateHex;
                        break;
                    case '.':
                        ctok->state = CTokStateFloat;
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        buf_append_char(&ctok->buf, '.');
                        break;
                    default:
                        // Everything else, including a suffix directly after
                        // the zero, is handled by the octal state.
                        c -= 1;
                        ctok->state = CTokStateOctal;
                        continue;
                }
                break;
            case CTokStateOctal:
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        // TODO @mul_with_overflow
                        ctok->cur_tok->data.num_lit_int.x *= 8;
                        // TODO @add_with_overflow
                        ctok->cur_tok->data.num_lit_int.x += *c - '0';
                        break;
                    case '8':
                    case '9':
                        return mark_error(ctok);
                    case '\'':
                        break;
                    case 'u':
                    case 'U':
                        // Octal literals (and plain 0) take the same integer
                        // suffixes as decimal and hex ones, e.g. `0U`, `017u`.
                        ctok->state = CTokStateNumLitIntSuffixU;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixU;
                        break;
                    case 'l':
                    case 'L':
                        ctok->state = CTokStateNumLitIntSuffixL;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixL;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateHex:
                switch (*c) {
                    case '0':
                        hex_digit(ctok, 0);
                        break;
                    case '1':
                        hex_digit(ctok, 1);
                        break;
                    case '2':
                        hex_digit(ctok, 2);
                        break;
                    case '3':
                        hex_digit(ctok, 3);
                        break;
                    case '4':
                        hex_digit(ctok, 4);
                        break;
                    case '5':
                        hex_digit(ctok, 5);
                        break;
                    case '6':
                        hex_digit(ctok, 6);
                        break;
                    case '7':
                        hex_digit(ctok, 7);
                        break;
                    case '8':
                        hex_digit(ctok, 8);
                        break;
                    case '9':
                        hex_digit(ctok, 9);
                        break;
                    case 'a':
                    case 'A':
                        hex_digit(ctok, 10);
                        break;
                    case 'b':
                    case 'B':
                        hex_digit(ctok, 11);
                        break;
                    case 'c':
                    case 'C':
                        hex_digit(ctok, 12);
                        break;
                    case 'd':
                    case 'D':
                        hex_digit(ctok, 13);
                        break;
                    case 'e':
                    case 'E':
                        hex_digit(ctok, 14);
                        break;
                    case 'f':
                    case 'F':
                        hex_digit(ctok, 15);
                        break;
                    case 'p':
                    case 'P':
                        // Binary exponent of a hex float. The marker has to be
                        // part of the text handed to strtod; without it the
                        // mantissa and exponent digits run together.
                        buf_append_char(&ctok->buf, 'p');
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        ctok->state = CTokStateExpSign;
                        break;
                    case 'u':
                    case 'U':
                        // marks the number literal as unsigned
                        ctok->state = CTokStateNumLitIntSuffixU;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixU;
                        break;
                    case 'l':
                    case 'L':
                        // marks the number literal as long
                        ctok->state = CTokStateNumLitIntSuffixL;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixL;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateNumLitIntSuffixU:
                // Seen "U"; an "L" may follow.
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLU;
                        ctok->state = CTokStateNumLitIntSuffixUL;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateNumLitIntSuffixL:
                // Seen "L"; a second "L" or a "U" may follow.
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLL;
                        ctok->state = CTokStateNumLitIntSuffixLL;
                        break;
                    case 'u':
                    case 'U':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateNumLitIntSuffixLL:
                // Seen "LL"; a "U" may follow.
                switch (*c) {
                    case 'u':
                    case 'U':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateNumLitIntSuffixUL:
                // Seen "UL"; a second "L" may follow.
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateIdentifier:
                switch (*c) {
                    case IDENT:
                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateString:
                switch (*c) {
                    case '\\':
                        ctok->state = CTokStateCharEscape;
                        break;
                    case '\"':
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        buf_append_char(&ctok->cur_tok->data.str_lit, *c);
                }
                break;
            case CTokStateExpectChar:
                // Just after the opening quote of a character literal.
                switch (*c) {
                    case '\\':
                        ctok->state = CTokStateCharEscape;
                        break;
                    case '\'':
                        // Empty character literal.
                        return mark_error(ctok);
                    default:
                        ctok->cur_tok->data.char_lit = *c;
                        ctok->state = CTokStateExpectEndQuot;
                }
                break;
            case CTokStateCharEscape:
                // Character after a backslash inside a char or string literal.
                // add_char picks the follow-up state from the open token kind.
                switch (*c) {
                    case '\'':
                    case '"':
                    case '?':
                    case '\\':
                        add_char(ctok, *c);
                        break;
                    case 'a':
                        add_char(ctok, '\a');
                        break;
                    case 'b':
                        add_char(ctok, '\b');
                        break;
                    case 'f':
                        add_char(ctok, '\f');
                        break;
                    case 'n':
                        add_char(ctok, '\n');
                        break;
                    case 'r':
                        add_char(ctok, '\r');
                        break;
                    case 't':
                        add_char(ctok, '\t');
                        break;
                    case 'v':
                        add_char(ctok, '\v');
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        ctok->state = CTokStateStrOctal;
                        ctok->cur_char = (uint8_t)(*c - '0');
                        ctok->octal_index = 1;
                        break;
                    case 'x':
                        ctok->state = CTokStateStrHex;
                        ctok->cur_char = 0;
                        break;
                    case 'u':
                        zig_panic("TODO unicode");
                        break;
                    case 'U':
                        zig_panic("TODO Unicode");
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateStrHex: {
                // Digits of a \x escape; the first non-hex character ends it.
                uint8_t value = 0;
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        value = *c - '0';
                        break;
                    case 'a':
                    case 'b':
                    case 'c':
                    case 'd':
                    case 'e':
                    case 'f':
                        value = (*c - 'a') + 10;
                        break;
                    case 'A':
                    case 'B':
                    case 'C':
                    case 'D':
                    case 'E':
                    case 'F':
                        value = (*c - 'A') + 10;
                        break;
                    default:
                        c -= 1;
                        add_char(ctok, ctok->cur_char);
                        continue;
                }
                // TODO @mul_with_overflow
                if (((long)ctok->cur_char) * 16 >= 256) {
                    zig_panic("TODO str hex mul overflow");
                }
                ctok->cur_char = (uint8_t)(ctok->cur_char * (uint8_t)16);
                // TODO @add_with_overflow
                if (((long)ctok->cur_char) + (long)(value) >= 256) {
                    zig_panic("TODO str hex add overflow");
                }
                ctok->cur_char = (uint8_t)(ctok->cur_char + value);
                break;
            }
            case CTokStateStrOctal:
                // Second and third digit of an octal escape; the escape ends
                // after three digits or at the first non-octal character.
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        // TODO @mul_with_overflow
                        if (((long)ctok->cur_char) * 8 >= 256) {
                            zig_panic("TODO");
                        }
                        ctok->cur_char = (uint8_t)(ctok->cur_char * (uint8_t)8);
                        // TODO @add_with_overflow
                        if (((long)ctok->cur_char) + (long)(*c - '0') >= 256) {
                            zig_panic("TODO");
                        }
                        ctok->cur_char = (uint8_t)(ctok->cur_char + (uint8_t)(*c - '0'));
                        ctok->octal_index += 1;
                        if (ctok->octal_index == 3) {
                            add_char(ctok, ctok->cur_char);
                        }
                        break;
                    default:
                        c -= 1;
                        add_char(ctok, ctok->cur_char);
                        continue;
                }
                break;
            case CTokStateExpectEndQuot:
                switch (*c) {
                    case '\'':
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateOpenComment:
                // A '/' was seen; only "//" and "/*" are accepted.
                switch (*c) {
                    case '/':
                        ctok->state = CTokStateLineComment;
                        break;
                    case '*':
                        ctok->state = CTokStateComment;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            case CTokStateLineComment:
                // A line comment runs to the newline, which also ends the macro.
                if (*c == '\n') {
                    ctok->state = CTokStateStart;
                    goto found_end_of_macro;
                }
                break;
            case CTokStateComment:
                switch (*c) {
                    case '*':
                        ctok->state = CTokStateCommentStar;
                        break;
                    default:
                        break;
                }
                break;
            case CTokStateCommentStar:
                // Inside a block comment, directly after a '*'.
                switch (*c) {
                    case '/':
                        ctok->state = CTokStateStart;
                        break;
                    case '*':
                        break;
                    default:
                        ctok->state = CTokStateComment;
                        break;
                }
                break;
            case CTokStateBackslash:
                // Outside literals a backslash must be followed by '\n'
                // (line continuation).
                switch (*c) {
                    case '\n':
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
        }
    }
found_end_of_macro:

    // Close whatever was in progress when the input ended. States that still
    // need more input to form a valid token are errors.
    switch (ctok->state) {
        case CTokStateStart:
            break;
        case CTokStateIdentifier:
        case CTokStateDecimal:
        case CTokStateHex:
        case CTokStateOctal:
        case CTokStateGotZero:
        case CTokStateNumLitIntSuffixU:
        case CTokStateNumLitIntSuffixL:
        case CTokStateNumLitIntSuffixUL:
        case CTokStateNumLitIntSuffixLL:
            end_token(ctok);
            break;
        case CTokStateFloat:
        case CTokStateFloatExp:
            end_float(ctok);
            break;
        case CTokStateExpectChar:
        case CTokStateExpectEndQuot:
        case CTokStateOpenComment:
        case CTokStateLineComment:
        case CTokStateComment:
        case CTokStateCommentStar:
        case CTokStateCharEscape:
        case CTokStateBackslash:
        case CTokStateString:
        case CTokStateExpSign:
        case CTokStateFloatExpFirst:
        case CTokStateStrHex:
        case CTokStateStrOctal:
            return mark_error(ctok);
    }

    assert(ctok->cur_tok == nullptr);

    begin_token(ctok, CTokIdEOF);
    end_token(ctok);
}