// speedcat/lexer.odin

package main
import "core:fmt"
import "core:c/libc"
import "core:math"
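
// Lexer state: `char` is the character currently being examined and `next`
// is a one-byte lookahead. `last_token_kind` drives the automatic-semicolon
// rules, and `should_return_semicolon` carries a pending synthetic semicolon
// across the whitespace skip.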
Lexer :: struct {
	data: ^[dynamic]u8,
	read_position: u64,
	position: TextPosition,
	char, next: u8,
	last_token_kind: TokenKind,
	should_return_semicolon: bool,
}
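
// Allocates a Lexer over `data` and advances twice so that both `char` and
// `next` are primed before the first call to lexer_next.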
lexer_create :: proc(data: ^[dynamic]u8) -> ^Lexer {
	lexer := new(Lexer)
	lexer^ = {
		data = data,
		read_position = 0,
		position = TextPosition {
			line = 1,
			column = 1,
		},
	}
	lexer_advance(lexer)
	lexer_advance(lexer)
	return lexer
}
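
// Shifts the lookahead into `char`, reads the next byte (0 at end of input),
// and updates the line/column bookkeeping. Carriage returns are skipped so
// CRLF input behaves like LF.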
@(private = "file")
lexer_advance :: proc(lexer: ^Lexer) {
	lexer.char = lexer.next
	if lexer.read_position < u64(len(lexer.data)) {
		lexer.next = lexer.data[lexer.read_position]
	} else {
		lexer.next = 0
	}
	lexer.read_position += 1

	if lexer.char == '\r' {
		// Skip the carriage return and stop here: the recursive call has
		// already done the position bookkeeping for the character after it,
		// so falling through would count that character twice.
		lexer_advance(lexer)
		return
	}

	if lexer.char == '\n' {
		lexer.position.line += 1
		lexer.position.column = 1
	} else {
		lexer.position.column += 1
	}
}
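
// Newlines only become semicolons after a token that can end a statement;
// after operators, open delimiters, or an existing semicolon the newline is
// plain whitespace (the same idea as Go's automatic semicolon insertion).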
@(private = "file")
lexer_should_not_emit_semicolon :: proc(lexer: ^Lexer) -> bool {
	#partial switch lexer.last_token_kind {
	case .CloseBrace, .Semicolon, .EOF, .Invalid,
	     .OpenParen, .OpenBrace, .OpenBracket,
	     .Add, .Subtract, .Multiply, .Divide, .Modulo, .Exponent,
	     .Assign, .Not,
	     .BitwiseAnd, .BitwiseOr, .BitwiseXOR, .BitwiseNot,
	     .LessThan, .GreaterThan, .BitwiseLeftShift, .BitwiseRightShift:
		return true
	}
	return false
}
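
// Consumes spaces, tabs, and newlines. The first newline after a
// statement-ending token stops the scan and flags a synthetic semicolon for
// lexer_next to return.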
@(private = "file")
lexer_skip_whitespace :: proc(lexer: ^Lexer) {
	// FIXME: Do the funny golang thing where newlines are semicolons based on some rules
	for lexer.char == ' ' || lexer.char == '\t' || lexer.char == '\r' || lexer.char == '\n' {
		if lexer.char == '\n' {
			if !lexer_should_not_emit_semicolon(lexer) {
				lexer.should_return_semicolon = true
				lexer_advance(lexer)
				return
			}
		}
		lexer_advance(lexer)
	}
}
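
// Returns the next token. Single-character tokens advance the lexer here;
// identifiers, numbers, and EOF leave `char` where their sub-lexer stopped.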
lexer_next :: proc(lexer: ^Lexer) -> (ret: Token) {
	// Record the returned kind for the semicolon rules. Registered before the
	// early return below so synthetic semicolons update it too; otherwise a
	// run of blank lines would emit one semicolon per newline.
	defer lexer.last_token_kind = ret.kind

	lexer_skip_whitespace(lexer)
	if lexer.should_return_semicolon {
		lexer.should_return_semicolon = false
		return token_create(.Semicolon, TextRange { start = lexer.position, end = lexer.position })
	}

	crange := TextRange {
		start = lexer.position,
		end = lexer.position,
	}
	ret = token_create(.Invalid, crange)
	should_advance := true

	switch lexer.char {
	case '+':
		ret = token_create(.Add, crange)
		if lexer.next == '+' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.Increment, crange)
		}
	case '-':
		ret = token_create(.Subtract, crange)
		if lexer.next == '-' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.Decrement, crange)
		} else if lexer.next == '>' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.Arrow, crange)
		}
	case '*': ret = token_create(.Multiply, crange)
	case '/': ret = token_create(.Divide, crange)
	case '%': ret = token_create(.Modulo, crange)
	case '`': ret = token_create(.Exponent, crange)
	case '=': ret = token_create(.Assign, crange)
	case '!':
		ret = token_create(.Not, crange)
		if lexer.next == '=' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.NotEquals, crange)
		}
	case '<':
		ret = token_create(.LessThan, crange)
		if lexer.next == '=' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.LessThanOrEqual, crange)
		} else if lexer.next == '<' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.BitwiseLeftShift, crange)
		}
	case '>':
		ret = token_create(.GreaterThan, crange)
		if lexer.next == '=' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.GreaterThanOrEqual, crange)
		} else if lexer.next == '>' {
			lexer_advance(lexer)
			crange.end = lexer.position
			ret = token_create(.BitwiseRightShift, crange)
		}
	case '&': ret = token_create(.BitwiseAnd, crange)
	case '|': ret = token_create(.BitwiseOr, crange)
	case '^': ret = token_create(.BitwiseXOR, crange)
	case '~': ret = token_create(.BitwiseNot, crange)
	case '(': ret = token_create(.OpenParen, crange)
	case ')': ret = token_create(.CloseParen, crange)
	case '[': ret = token_create(.OpenBracket, crange)
	case ']': ret = token_create(.CloseBracket, crange)
	case '{': ret = token_create(.OpenBrace, crange)
	case '}': ret = token_create(.CloseBrace, crange)
	case '?': ret = token_create(.Question, crange)
	case ':': ret = token_create(.Colon, crange)
	case '.': ret = token_create(.Dot, crange)
	case ';': ret = token_create(.Semicolon, crange)
	case '"': ret = lexer_read_string(lexer, .String, '\"')
	case '\'': ret = lexer_read_string(lexer, .Character, '\'')
	case 'a'..='z': fallthrough
	case 'A'..='Z': fallthrough
	case '_':
		ret = lexer_read_identifier(lexer)
		should_advance = false
	case '0'..='9':
		ret = lexer_read_number(lexer)
		should_advance = false
	case 0:
		ret = token_create(.EOF, crange)
		should_advance = false
	}

	if should_advance {
		lexer_advance(lexer)
	}
	return
}
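
// Reads a string or character literal delimited by `outer`, handling the
// usual backslash escapes. The closing delimiter is left in `char` for
// lexer_next to consume.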
@(private = "file")
lexer_read_string :: proc(lexer: ^Lexer, kind: TokenKind, outer: u8) -> Token {
	crange := TextRange {
		start = lexer.position,
		end = lexer.position,
	}
	lexer_advance(lexer)

	str : [dynamic]u8
	// Stop at the delimiter, or at end of input so an unterminated literal
	// cannot loop forever.
	for lexer.char != outer && lexer.char != 0 {
		if lexer.char == '\\' {
			range := TextRange { start = lexer.position }
			lexer_advance(lexer)
			switch lexer.char {
			case 'n': append(&str, '\n')
			case 't': append(&str, '\t')
			case 'b': append(&str, '\b')
			case 'r': append(&str, '\r')
			case '\\': append(&str, '\\')
			// Allow the quote characters themselves to be escaped instead of
			// warning and dropping them.
			case '\'', '"': append(&str, lexer.char)
			case:
				range.end = lexer.position
				append(&g_message_list,
					message_create(.Warning, fmt.aprintf("Invalid string/character escape: %c at %s", lexer.char, "TODO LOCATION"), range),
				)
			}
			lexer_advance(lexer)
			continue
		}
		append(&str, lexer.char)
		lexer_advance(lexer)
	}

	crange.end = lexer.position
	return token_create_u8(kind, str, crange)
}
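
// Reads an identifier ([A-Za-z0-9_]+) and promotes it to a keyword token
// when it matches one of the reserved words.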
@(private = "file")
lexer_read_identifier :: proc(lexer: ^Lexer) -> Token {
	crange := TextRange { start = lexer.position }
	str : [dynamic]u8
	for libc.isalnum(i32(lexer.char)) != 0 || lexer.char == '_' {
		append(&str, lexer.char)
		crange.end = lexer.position
		lexer_advance(lexer)
	}

	if compare_dyn_arr_string(&str, "fn") { return token_create(.Function, crange) }
	if compare_dyn_arr_string(&str, "struct") { return token_create(.Struct, crange) }
	if compare_dyn_arr_string(&str, "enum") { return token_create(.Enum, crange) }
	if compare_dyn_arr_string(&str, "union") { return token_create(.Union, crange) }
	if compare_dyn_arr_string(&str, "type") { return token_create(.Type, crange) }
	if compare_dyn_arr_string(&str, "use") { return token_create(.Use, crange) }
	if compare_dyn_arr_string(&str, "pub") { return token_create(.Pub, crange) }
	if compare_dyn_arr_string(&str, "let") { return token_create(.Let, crange) }
	if compare_dyn_arr_string(&str, "mut") { return token_create(.Mut, crange) }
	if compare_dyn_arr_string(&str, "as") { return token_create(.As, crange) }
	if compare_dyn_arr_string(&str, "in") { return token_create(.In, crange) }
	if compare_dyn_arr_string(&str, "else") { return token_create(.Else, crange) }
	if compare_dyn_arr_string(&str, "elif") { return token_create(.Elif, crange) }
	if compare_dyn_arr_string(&str, "for") { return token_create(.For, crange) }
	if compare_dyn_arr_string(&str, "break") { return token_create(.Break, crange) }
	if compare_dyn_arr_string(&str, "continue") { return token_create(.Continue, crange) }
	if compare_dyn_arr_string(&str, "switch") { return token_create(.Switch, crange) }
	if compare_dyn_arr_string(&str, "case") { return token_create(.Case, crange) }
	if compare_dyn_arr_string(&str, "ret") { return token_create(.Ret, crange) }
	if compare_dyn_arr_string(&str, "static") { return token_create(.Static, crange) }
	if compare_dyn_arr_string(&str, "defer") { return token_create(.Defer, crange) }
	if compare_dyn_arr_string(&str, "and") { return token_create(.And, crange) }
	if compare_dyn_arr_string(&str, "or") { return token_create(.Or, crange) }

	return token_create_u8(.Identifier, str, crange)
}
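
// Reads an integer or float literal, in decimal or (for integers only)
// 0x-prefixed hexadecimal.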
@(private = "file")
lexer_read_number :: proc(lexer: ^Lexer) -> Token {
	crange := TextRange {
		start = lexer.position,
		end = lexer.position,
	}

	// FIXME: Implement binary
	ReadMode :: enum {
		Normal,
		Hex,
	}
	read_mode := ReadMode.Normal
	if lexer.char == '0' && lexer.next == 'x' {
		read_mode = .Hex
		lexer_advance(lexer)
		crange.end = lexer.position
		lexer_advance(lexer)
	}

	whole_part : u64 = 0
	if read_mode == .Normal {
		for libc.isdigit(i32(lexer.char)) != 0 {
			whole_part = whole_part * 10 + u64(lexer.char) - '0'
			crange.end = lexer.position
			lexer_advance(lexer)
		}
	} else if read_mode == .Hex {
		for {
			// Lowercase the current character on every iteration; computing
			// it once before the loop would leave both the loop condition
			// and the digit stale after the first advance.
			lowered := libc.tolower(i32(lexer.char))
			if libc.isxdigit(lowered) == 0 { break }
			digit := lowered - '0'
			if libc.isdigit(lowered) == 0 {
				digit = lowered - 'a' + 10
			}
			whole_part = (whole_part << 4) | u64(digit)
			crange.end = lexer.position
			lexer_advance(lexer)
		}
	}

	if lexer.char == '.' {
		lexer_advance(lexer)
		// FIXME: Move this to another procedure because this is repeating
		fractional_part : u64 = 0
		fractional_digits := 0
		if read_mode == .Normal {
			for libc.isdigit(i32(lexer.char)) != 0 {
				fractional_part = fractional_part * 10 + u64(lexer.char) - '0'
				fractional_digits += 1
				crange.end = lexer.position
				lexer_advance(lexer)
			}
		} else if read_mode == .Hex {
			append(&g_message_list, message_create(.Error, "Hexadecimal floating point numbers are not supported yet", crange))
			// Consume and discard the hex digits so lexing can continue.
			for {
				lowered := libc.tolower(i32(lexer.char))
				if libc.isxdigit(lowered) == 0 { break }
				crange.end = lexer.position
				lexer_advance(lexer)
			}
		}

		// Scale by one power of ten per digit actually read, so "1.05"
		// keeps its leading zero instead of lexing as 1.5.
		floating := f64(fractional_part) / math.pow_f64(10, f64(fractional_digits)) + f64(whole_part)
		return token_create_f64(.Float, floating, crange)
	}

	return token_create_u64(.Integer, whole_part, crange)
}
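
// Sketch of a typical driver loop, assuming the Token/TokenKind definitions
// from the rest of this package; the surrounding setup is hypothetical:
//
//	source: [dynamic]u8 // filled from a file
//	lexer := lexer_create(&source)
//	for tok := lexer_next(lexer); tok.kind != .EOF; tok = lexer_next(lexer) {
//		// hand tok to the parser
//	}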