zig/src/c_tokenizer.cpp

/*
 * Copyright (c) 2016 Andrew Kelley
 *
 * This file is part of zig, which is MIT licensed.
 * See http://opensource.org/licenses/MIT
 */

#include "c_tokenizer.hpp"
#include <inttypes.h>

// Case-label list for horizontal whitespace: space, tab, vertical tab and
// form feed, but no line-ending characters. The expansion deliberately omits
// the leading `case` and the trailing `:`, so it is written at the use site
// as `case WHITESPACE_EXCEPT_N:`.
#define WHITESPACE_EXCEPT_N \
         ' ': \
    case '\t': \
    case '\v': \
    case '\f'

// Case-label list for the decimal digits '1' through '9'. Like the other
// label macros in this file it omits the leading `case` and trailing `:`,
// so it is used as `case DIGIT_NON_ZERO:`.
#define DIGIT_NON_ZERO \
         '1': \
    case '2': \
    case '3': \
    case '4': \
    case '5': \
    case '6': \
    case '7': \
    case '8': \
    case '9'

// Case-label list for every decimal digit: '0' plus DIGIT_NON_ZERO.
// Used as `case DIGIT:`.
#define DIGIT \
         '0': \
    case DIGIT_NON_ZERO

// Case-label list for the ASCII letters 'a'-'z' and 'A'-'Z'.
// Used as `case ALPHA:`.
#define ALPHA \
         'a': \
    case 'b': \
    case 'c': \
    case 'd': \
    case 'e': \
    case 'f': \
    case 'g': \
    case 'h': \
    case 'i': \
    case 'j': \
    case 'k': \
    case 'l': \
    case 'm': \
    case 'n': \
    case 'o': \
    case 'p': \
    case 'q': \
    case 'r': \
    case 's': \
    case 't': \
    case 'u': \
    case 'v': \
    case 'w': \
    case 'x': \
    case 'y': \
    case 'z': \
    case 'A': \
    case 'B': \
    case 'C': \
    case 'D': \
    case 'E': \
    case 'F': \
    case 'G': \
    case 'H': \
    case 'I': \
    case 'J': \
    case 'K': \
    case 'L': \
    case 'M': \
    case 'N': \
    case 'O': \
    case 'P': \
    case 'Q': \
    case 'R': \
    case 'S': \
    case 'T': \
    case 'U': \
    case 'V': \
    case 'W': \
    case 'X': \
    case 'Y': \
    case 'Z'

// Case-label list for characters that may begin an identifier: any ALPHA
// letter or an underscore. Used as `case IDENT_START:`.
#define IDENT_START \
    ALPHA: \
    case '_'

// Case-label list for characters that may continue an identifier: anything
// in IDENT_START plus the decimal digits. Used as `case IDENT:`.
#define IDENT \
    IDENT_START: \
    case DIGIT

// Case-label list for the two line-ending characters, carriage return and
// line feed. Used as `case LINE_ENDING:`.
#define LINE_ENDING \
         '\r': \
    case '\n'

// Appends a new token of kind `id` to ctok->tokens and makes it the current
// token. Token kinds that accumulate a payload get that payload initialized:
// string literals and symbols start with a zeroed, empty Buf; integer
// literals start at value 0 with no suffix. All other kinds need no setup.
// Asserts that no token is currently in progress.
static void begin_token(CTokenize *ctok, CTokId id) {
    assert(ctok->cur_tok == nullptr);

    ctok->tokens.add_one();
    auto *tok = &ctok->tokens.last();
    tok->id = id;
    ctok->cur_tok = tok;

    // Set only for the token kinds whose payload is a text buffer.
    Buf *text = nullptr;

    switch (id) {
        case CTokIdStrLit:
            text = &tok->data.str_lit;
            break;
        case CTokIdSymbol:
            text = &tok->data.symbol;
            break;
        case CTokIdNumLitInt:
            tok->data.num_lit_int.suffix = CNumLitSuffixNone;
            tok->data.num_lit_int.x = 0;
            break;
        case CTokIdCharLit:
        case CTokIdNumLitFloat:
        case CTokIdMinus:
        case CTokIdLParen:
        case CTokIdRParen:
        case CTokIdEOF:
        case CTokIdDot:
        case CTokIdAsterisk:
        case CTokIdBang:
        case CTokIdTilde:
            break;
    }

    if (text != nullptr) {
        // Zero the Buf first, then size it to an empty string.
        memset(text, 0, sizeof(Buf));
        buf_resize(text, 0);
    }
}

// Marks the current token as finished by clearing ctok->cur_tok. The token
// itself stays in ctok->tokens; begin_token asserts that cur_tok is null
// before it starts the next one.
static void end_token(CTokenize *ctok) {
    ctok->cur_tok = nullptr;
}

// Records that tokenization failed by setting ctok->error. Nothing else in
// the tokenizer state is touched.
static void mark_error(CTokenize *ctok) {
    ctok->error = true;
}

// Delivers one decoded character `c` to the token in progress and selects
// the follow-up state:
//   - character literal: `c` becomes its value and the closing quote is
//     expected next;
//   - string literal: `c` is appended and string scanning resumes.
// Any other token kind is a programming error.
static void add_char(CTokenize *ctok, uint8_t c) {
    assert(ctok->cur_tok);

    switch (ctok->cur_tok->id) {
        case CTokIdCharLit:
            ctok->cur_tok->data.char_lit = c;
            ctok->state = CTokStateExpectEndQuot;
            return;
        case CTokIdStrLit:
            buf_append_char(&ctok->cur_tok->data.str_lit, c);
            ctok->state = CTokStateString;
            return;
        default:
            zig_unreachable();
    }
}

// Folds one hexadecimal digit (`value`, used as an index into a 16-entry
// table, so it must be 0..15) into the integer literal being built, and
// appends the matching lowercase digit character to the scratch buffer
// ctok->buf.
static void hex_digit(CTokenize *ctok, uint8_t value) {
    static const uint8_t digit_chars[] = "0123456789abcdef";

    // TODO @mul_with_overflow / @add_with_overflow
    ctok->cur_tok->data.num_lit_int.x = ctok->cur_tok->data.num_lit_int.x * 16 + value;

    buf_append_char(&ctok->buf, digit_chars[value]);
}

// Finishes a floating-point literal: converts the text collected in
// ctok->buf with strtod, stores the result in the current token, closes the
// token and returns the state machine to the start state.
static void end_float(CTokenize *ctok) {
    // TODO detect errors, overflow, and underflow
    const char *text = buf_ptr(&ctok->buf);
    ctok->cur_tok->data.num_lit_float = strtod(text, nullptr);

    ctok->state = CTokStateStart;
    end_token(ctok);
}

// Tokenizes the body of one C macro.
//
// `c` points at NUL-terminated text. Scanning stops at the NUL or at the
// first line ending seen outside of a token/comment/continuation, whichever
// comes first, so `c` may point into a larger buffer.
//
// On entry the tokenizer is fully reset (token list, state, error flag,
// current token, scratch buffer). On success ctok->tokens holds the tokens
// followed by a final CTokIdEOF token and ctok->error is false. On failure
// ctok->error is set and the function returns immediately; ctok->tokens may
// then hold a partial result.
//
// Implementation notes:
//   - The scanner is a character-at-a-time state machine keyed on
//     ctok->state.
//   - Where a state meets a character that does not belong to its token it
//     does `c -= 1; ...; continue;`. The loop's `c += 1` then lands on the
//     same character again so it is re-examined in the new state.
//   - ctok->buf collects the text of numeric literals; it is only read back
//     (by end_float) for floating-point literals.
void tokenize_c_macro(CTokenize *ctok, const uint8_t *c) {
    ctok->tokens.resize(0);
    ctok->state = CTokStateStart;
    ctok->error = false;
    ctok->cur_tok = nullptr;

    buf_resize(&ctok->buf, 0);

    for (; *c; c += 1) {
        switch (ctok->state) {
            // Between tokens: decide what the next token is.
            case CTokStateStart:
                switch (*c) {
                    case WHITESPACE_EXCEPT_N:
                        break;
                    case '\'':
                        ctok->state = CTokStateExpectChar;
                        begin_token(ctok, CTokIdCharLit);
                        break;
                    case '\"':
                        ctok->state = CTokStateString;
                        begin_token(ctok, CTokIdStrLit);
                        break;
                    case '/':
                        ctok->state = CTokStateOpenComment;
                        break;
                    case '\\':
                        ctok->state = CTokStateBackslash;
                        break;
                    case LINE_ENDING:
                        // An unescaped line ending terminates the macro.
                        goto found_end_of_macro;
                    case IDENT_START:
                        ctok->state = CTokStateIdentifier;
                        begin_token(ctok, CTokIdSymbol);
                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
                        break;
                    case DIGIT_NON_ZERO:
                        ctok->state = CTokStateDecimal;
                        begin_token(ctok, CTokIdNumLitInt);
                        ctok->cur_tok->data.num_lit_int.x = *c - '0';
                        buf_resize(&ctok->buf, 0);
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case '0':
                        // Could be octal, hex (0x...) or a float (0....).
                        ctok->state = CTokStateGotZero;
                        begin_token(ctok, CTokIdNumLitInt);
                        ctok->cur_tok->data.num_lit_int.x = 0;
                        buf_resize(&ctok->buf, 0);
                        buf_append_char(&ctok->buf, '0');
                        break;
                    case '.':
                        begin_token(ctok, CTokIdDot);
                        end_token(ctok);
                        break;
                    case '(':
                        begin_token(ctok, CTokIdLParen);
                        end_token(ctok);
                        break;
                    case ')':
                        begin_token(ctok, CTokIdRParen);
                        end_token(ctok);
                        break;
                    case '*':
                        begin_token(ctok, CTokIdAsterisk);
                        end_token(ctok);
                        break;
                    case '-':
                        begin_token(ctok, CTokIdMinus);
                        end_token(ctok);
                        break;
                    case '!':
                        begin_token(ctok, CTokIdBang);
                        end_token(ctok);
                        break;
                    case '~':
                        begin_token(ctok, CTokIdTilde);
                        end_token(ctok);
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // Fractional part of a float, after the '.' has been consumed.
            case CTokStateFloat:
                switch (*c) {
                    case '.':
                        break;
                    case 'e':
                    case 'E':
                        buf_append_char(&ctok->buf, 'e');
                        ctok->state = CTokStateExpSign;
                        break;
                    case 'f':
                    case 'F':
                    case 'l':
                    case 'L':
                        // Type suffix: consumed, and it ends the literal.
                        end_float(ctok);
                        break;
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        break;
                    default:
                        c -= 1;
                        end_float(ctok);
                        continue;
                }
                break;
            // Just after the exponent marker: optional sign or first digit.
            case CTokStateExpSign:
                switch (*c) {
                    case '+':
                    case '-':
                        ctok->state = CTokStateFloatExpFirst;
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case DIGIT:
                        ctok->state = CTokStateFloatExp;
                        buf_append_char(&ctok->buf, *c);
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // After an exponent sign: at least one digit is mandatory.
            case CTokStateFloatExpFirst:
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        ctok->state = CTokStateFloatExp;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // Remaining exponent digits and optional type suffix.
            case CTokStateFloatExp:
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);
                        break;
                    case 'f':
                    case 'F':
                    case 'l':
                    case 'L':
                        end_float(ctok);
                        break;
                    default:
                        c -= 1;
                        end_float(ctok);
                        continue;
                }
                break;
            // Decimal integer; becomes a float if a '.' shows up.
            case CTokStateDecimal:
                switch (*c) {
                    case DIGIT:
                        buf_append_char(&ctok->buf, *c);

                        // TODO @mul_with_overflow
                        ctok->cur_tok->data.num_lit_int.x *= 10;
                        // TODO @add_with_overflow
                        ctok->cur_tok->data.num_lit_int.x += *c - '0';
                        break;
                    case '\'':
                        // Digit separator: skipped.
                        break;
                    case 'u':
                    case 'U':
                        ctok->state = CTokStateNumLitIntSuffixU;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixU;
                        break;
                    case 'l':
                    case 'L':
                        ctok->state = CTokStateNumLitIntSuffixL;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixL;
                        break;
                    case '.':
                        buf_append_char(&ctok->buf, '.');
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        ctok->state = CTokStateFloat;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // A leading '0' was seen: pick hex, float, or octal.
            case CTokStateGotZero:
                switch (*c) {
                    case 'x':
                    case 'X':
                        ctok->state = CTokStateHex;
                        break;
                    case '.':
                        ctok->state = CTokStateFloat;
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        buf_append_char(&ctok->buf, '.');
                        break;
                    default:
                        // Anything else is handled by the octal state,
                        // including characters that simply end the literal.
                        c -= 1;
                        ctok->state = CTokStateOctal;
                        continue;
                }
                break;
            case CTokStateOctal:
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        // TODO @mul_with_overflow
                        ctok->cur_tok->data.num_lit_int.x *= 8;
                        // TODO @add_with_overflow
                        ctok->cur_tok->data.num_lit_int.x += *c - '0';
                        break;
                    case '8':
                    case '9':
                        // Not valid octal digits.
                        return mark_error(ctok);
                    case '\'':
                        // Digit separator: skipped.
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateHex:
                switch (*c) {
                    case '0':
                        hex_digit(ctok, 0);
                        break;
                    case '1':
                        hex_digit(ctok, 1);
                        break;
                    case '2':
                        hex_digit(ctok, 2);
                        break;
                    case '3':
                        hex_digit(ctok, 3);
                        break;
                    case '4':
                        hex_digit(ctok, 4);
                        break;
                    case '5':
                        hex_digit(ctok, 5);
                        break;
                    case '6':
                        hex_digit(ctok, 6);
                        break;
                    case '7':
                        hex_digit(ctok, 7);
                        break;
                    case '8':
                        hex_digit(ctok, 8);
                        break;
                    case '9':
                        hex_digit(ctok, 9);
                        break;
                    case 'a':
                    case 'A':
                        hex_digit(ctok, 10);
                        break;
                    case 'b':
                    case 'B':
                        hex_digit(ctok, 11);
                        break;
                    case 'c':
                    case 'C':
                        hex_digit(ctok, 12);
                        break;
                    case 'd':
                    case 'D':
                        hex_digit(ctok, 13);
                        break;
                    case 'e':
                    case 'E':
                        hex_digit(ctok, 14);
                        break;
                    case 'f':
                    case 'F':
                        hex_digit(ctok, 15);
                        break;
                    case 'p':
                    case 'P':
                        // Binary exponent: the literal is a hex float.
                        // NOTE(review): ctok->buf holds "0" plus the hex
                        // digits with no "0x" prefix and no 'p' is appended
                        // here, so the text later given to strtod does not
                        // look like a hex float - confirm intended behavior.
                        ctok->cur_tok->id = CTokIdNumLitFloat;
                        ctok->state = CTokStateExpSign;
                        break;
                    case 'u':
                    case 'U':
                        // marks the number literal as unsigned
                        ctok->state = CTokStateNumLitIntSuffixU;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixU;
                        break;
                    case 'l':
                    case 'L':
                        // marks the number literal as long
                        ctok->state = CTokStateNumLitIntSuffixL;
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixL;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // Integer suffix so far: "u".
            case CTokStateNumLitIntSuffixU:
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLU;
                        ctok->state = CTokStateNumLitIntSuffixUL;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // Integer suffix so far: "l".
            case CTokStateNumLitIntSuffixL:
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLL;
                        ctok->state = CTokStateNumLitIntSuffixLL;
                        break;
                    case 'u':
                    case 'U':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // Integer suffix so far: "ll".
            case CTokStateNumLitIntSuffixLL:
                switch (*c) {
                    case 'u':
                    case 'U':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // Integer suffix so far: "ul".
            case CTokStateNumLitIntSuffixUL:
                switch (*c) {
                    case 'l':
                    case 'L':
                        ctok->cur_tok->data.num_lit_int.suffix = CNumLitSuffixLLU;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            case CTokStateIdentifier:
                switch (*c) {
                    case IDENT:
                        buf_append_char(&ctok->cur_tok->data.symbol, *c);
                        break;
                    default:
                        c -= 1;
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        continue;
                }
                break;
            // Inside a string literal.
            case CTokStateString:
                switch (*c) {
                    case '\\':
                        ctok->state = CTokStateCharEscape;
                        break;
                    case '\"':
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        buf_append_char(&ctok->cur_tok->data.str_lit, *c);
                }
                break;
            // Just after the opening quote of a character literal.
            case CTokStateExpectChar:
                switch (*c) {
                    case '\\':
                        ctok->state = CTokStateCharEscape;
                        break;
                    case '\'':
                        // Empty character literal.
                        return mark_error(ctok);
                    default:
                        ctok->cur_tok->data.char_lit = *c;
                        ctok->state = CTokStateExpectEndQuot;
                }
                break;
            // Just after a backslash inside a string or character literal.
            // add_char routes the decoded character to whichever kind of
            // literal is in progress and picks the next state.
            case CTokStateCharEscape:
                switch (*c) {
                    case '\'':
                    case '"':
                    case '?':
                    case '\\':
                        add_char(ctok, *c);
                        break;
                    case 'a':
                        add_char(ctok, '\a');
                        break;
                    case 'b':
                        add_char(ctok, '\b');
                        break;
                    case 'f':
                        add_char(ctok, '\f');
                        break;
                    case 'n':
                        add_char(ctok, '\n');
                        break;
                    case 'r':
                        add_char(ctok, '\r');
                        break;
                    case 't':
                        add_char(ctok, '\t');
                        break;
                    case 'v':
                        add_char(ctok, '\v');
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        // Octal escape: this is its first digit.
                        ctok->state = CTokStateStrOctal;
                        ctok->cur_char = (uint8_t)(*c - '0');
                        ctok->octal_index = 1;
                        break;
                    case 'x':
                        ctok->state = CTokStateStrHex;
                        ctok->cur_char = 0;
                        break;
                    case 'u':
                        zig_panic("TODO unicode");
                        break;
                    case 'U':
                        zig_panic("TODO Unicode");
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // Digits of a \x escape; ends at the first non-hex character.
            case CTokStateStrHex: {
                uint8_t value = 0;
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    case '8':
                    case '9':
                        value = *c - '0';
                        break;
                    case 'a':
                    case 'b':
                    case 'c':
                    case 'd':
                    case 'e':
                    case 'f':
                        value = (*c - 'a') + 10;
                        break;
                    case 'A':
                    case 'B':
                    case 'C':
                    case 'D':
                    case 'E':
                    case 'F':
                        value = (*c - 'A') + 10;
                        break;
                    default:
                        c -= 1;
                        add_char(ctok, ctok->cur_char);
                        continue;
                }
                // The accumulated value must fit in one byte.
                // TODO @mul_with_overflow
                if (((long)ctok->cur_char) * 16 >= 256) {
                    zig_panic("TODO str hex mul overflow");
                }
                ctok->cur_char = (uint8_t)(ctok->cur_char * (uint8_t)16);
                // TODO @add_with_overflow
                if (((long)ctok->cur_char) + (long)(value) >= 256) {
                    zig_panic("TODO str hex add overflow");
                }
                ctok->cur_char = (uint8_t)(ctok->cur_char + value);
                break;
            }
            // Second and third digit of an octal escape (three digits max).
            case CTokStateStrOctal:
                switch (*c) {
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                        // TODO @mul_with_overflow
                        if (((long)ctok->cur_char) * 8 >= 256) {
                            zig_panic("TODO");
                        }
                        ctok->cur_char = (uint8_t)(ctok->cur_char * (uint8_t)8);
                        // TODO @add_with_overflow
                        if (((long)ctok->cur_char) + (long)(*c - '0') >= 256) {
                            zig_panic("TODO");
                        }
                        ctok->cur_char = (uint8_t)(ctok->cur_char + (uint8_t)(*c - '0'));
                        ctok->octal_index += 1;
                        if (ctok->octal_index == 3) {
                            add_char(ctok, ctok->cur_char);
                        }
                        break;
                    default:
                        c -= 1;
                        add_char(ctok, ctok->cur_char);
                        continue;
                }
                break;
            // A character literal has its value; only the closing quote
            // may follow.
            case CTokStateExpectEndQuot:
                switch (*c) {
                    case '\'':
                        end_token(ctok);
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // A '/' was seen: it must open a comment.
            case CTokStateOpenComment:
                switch (*c) {
                    case '/':
                        ctok->state = CTokStateLineComment;
                        break;
                    case '*':
                        ctok->state = CTokStateComment;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
            // Inside a `//` comment: it runs to the end of the line, which
            // is also the end of the macro.
            case CTokStateLineComment:
                if (*c == '\n') {
                    ctok->state = CTokStateStart;
                    goto found_end_of_macro;
                }
                break;
            // Inside a block comment.
            case CTokStateComment:
                switch (*c) {
                    case '*':
                        ctok->state = CTokStateCommentStar;
                        break;
                    default:
                        break;
                }
                break;
            // Inside a block comment, right after a '*'.
            case CTokStateCommentStar:
                switch (*c) {
                    case '/':
                        ctok->state = CTokStateStart;
                        break;
                    case '*':
                        break;
                    default:
                        ctok->state = CTokStateComment;
                        break;
                }
                break;
            // A backslash outside of a literal: only a line continuation
            // is accepted.
            case CTokStateBackslash:
                switch (*c) {
                    case '\r':
                        // CRLF line ending: stay here and wait for the
                        // '\n' that completes the continuation.
                        break;
                    case '\n':
                        ctok->state = CTokStateStart;
                        break;
                    default:
                        return mark_error(ctok);
                }
                break;
        }
    }
found_end_of_macro:

    // The input is exhausted (or the macro's line ended). Close whatever
    // token is still open, or fail if the input stopped mid-construct.
    switch (ctok->state) {
        case CTokStateStart:
            break;
        case CTokStateLineComment:
            // A `//` comment that reaches the end of the input without a
            // trailing newline is still a complete comment. No token is
            // open here because comments only start from the start state.
            break;
        case CTokStateIdentifier:
        case CTokStateDecimal:
        case CTokStateHex:
        case CTokStateOctal:
        case CTokStateGotZero:
        case CTokStateNumLitIntSuffixU:
        case CTokStateNumLitIntSuffixL:
        case CTokStateNumLitIntSuffixUL:
        case CTokStateNumLitIntSuffixLL:
            end_token(ctok);
            break;
        case CTokStateFloat:
        case CTokStateFloatExp:
            end_float(ctok);
            break;
        case CTokStateExpectChar:
        case CTokStateExpectEndQuot:
        case CTokStateOpenComment:
        case CTokStateComment:
        case CTokStateCommentStar:
        case CTokStateCharEscape:
        case CTokStateBackslash:
        case CTokStateString:
        case CTokStateExpSign:
        case CTokStateFloatExpFirst:
        case CTokStateStrHex:
        case CTokStateStrOctal:
            return mark_error(ctok);
    }

    assert(ctok->cur_tok == nullptr);

    begin_token(ctok, CTokIdEOF);
    end_token(ctok);
}