v/vlib/v2/scanner/scanner.v
2025-05-13 22:05:22 +03:00

592 lines
11 KiB
V

// Copyright (c) 2020-2024 Joe Conigliaro. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module scanner
import v2.token
import v2.pref
// Mode selects optional scanner behaviour. It is declared as a `@[flag]` enum
// and is queried with `.has()` (see `new_scanner` and `scan`).
@[flag]
pub enum Mode {
// NOTE(review): in a flag enum `normal` presumably occupies a bit of its own
// rather than meaning "no flags set" - confirm against callers.
normal
// return comments as `.comment` tokens instead of skipping them
scan_comments
// scan a string containing `${...}` as one `.string` token instead of
// stopping at each interpolation
skip_interpolation
}
// Scanner walks over `src` and hands out one `token.Token` per `scan()` call.
// Line start offsets are reported to `file` while scanning.
pub struct Scanner {
// NOTE(review): not read anywhere in the visible scanner code - presumably
// kept for callers or future use.
pref &pref.Preferences
// behaviour flags; `scan` checks `.scan_comments`
mode Mode
// cached result of `mode.has(.skip_interpolation)`, set once in `new_scanner`
skip_interpolation bool
mut:
// receives the offset of every line start through `add_line`
file &token.File = &token.File{}
// source text being scanned
src string
// true when the previous token may end a statement: the next newline (or EOF)
// is then returned as `.semicolon` instead of being skipped
insert_semi bool
pub mut:
offset int // current char offset
pos int // token offset (start of current token)
// text of the current token where the scanner sets it
// (names/keywords, numbers, strings, chars, comments)
lit string
// string literals & interpolation
// set when the `}` closing an interpolation is scanned; the next `scan` call
// then resumes scanning the remainder of the string
in_str_incomplete bool
// true while the scanner is inside a `${...}` interpolation
in_str_inter bool
// `{` / `}` nesting depth inside the current interpolation
str_inter_cbr_depth int
// quote char of the string being scanned, used to resume after an interpolation
str_quote u8
}
// new_scanner allocates a Scanner configured with `prefs` and `mode`.
// The `.skip_interpolation` flag is looked up once here and cached on the
// scanner. Call `init` before scanning to attach a file and its source.
pub fn new_scanner(prefs &pref.Preferences, mode Mode) &Scanner {
	skip_inter := mode.has(.skip_interpolation)
	unsafe {
		return &Scanner{
			pref:               prefs
			mode:               mode
			skip_interpolation: skip_inter
		}
	}
}
// init prepares the scanner to tokenize `src`, reporting line starts to `file`.
// All per-file state is cleared, so a scanner instance can be reused even when
// the previous file was not scanned through to `.eof` (e.g. scanning stopped
// inside a string interpolation or right after a token that sets `insert_semi`).
pub fn (mut s Scanner) init(file &token.File, src string) {
	// reset since scanner instance may be reused
	s.offset = 0
	s.pos = 0
	s.lit = ''
	// a stale `insert_semi` would turn the first newline of the new file
	// into a spurious `.semicolon`
	s.insert_semi = false
	// stale string state would make the first `scan` resume a string that
	// does not exist in the new source
	s.in_str_incomplete = false
	s.in_str_inter = false
	s.str_inter_cbr_depth = 0
	s.str_quote = 0
	// init
	s.file = unsafe { file }
	s.src = src
}
// scan returns the next token of the source and advances the scanner.
//
// After the call `s.pos` is the start offset of the token and `s.offset` is the
// offset just past it. `s.lit` holds the token text for names/keywords,
// numbers, strings, chars and comments, and is empty for all other tokens.
//
// Automatic semicolons: tokens that may end a statement set `s.insert_semi`;
// the next newline (or EOF) is then returned as `.semicolon`.
//
// String interpolation: when `string_literal` stops at `${`, the tokens of the
// interpolated expression are returned by subsequent calls; once the matching
// `}` is seen, the following call resumes the rest of the string.
//
// NOTE(review): bounds checks are disabled for this function. Single char
// lookahead may read index `s.src.len`; this presumably relies on the source
// string being NUL terminated - confirm. Reads further out than that are
// guarded explicitly below.
@[direct_array_access]
pub fn (mut s Scanner) scan() token.Token {
	// before whitespace call to keep whitespaces in string
	// NOTE: before start: simply for a little more efficiency
	// if !s.skip_interpolation && s.in_str_incomplete {
	if s.in_str_incomplete {
		s.in_str_incomplete = false
		s.pos = s.offset
		s.string_literal(false, s.str_quote)
		s.lit = s.src[s.pos..s.offset]
		return .string
	}
	start:
	s.whitespace()
	if s.offset == s.src.len {
		s.lit = ''
		if s.insert_semi {
			s.insert_semi = false
			return .semicolon
		}
		s.file.add_line(s.offset)
		return .eof
	}
	c := s.src[s.offset]
	s.pos = s.offset
	preserve_insert_semi := s.insert_semi
	s.insert_semi = false
	// `whitespace` only stops on a newline when a semicolon must be inserted
	if c == `\n` {
		s.lit = ''
		return .semicolon
	}
	// comment | `/=` | `/`
	else if c == `/` {
		c2 := s.src[s.offset + 1]
		// comment
		if c2 in [`/`, `*`] {
			// a comment must not swallow a pending automatic semicolon
			if preserve_insert_semi {
				s.insert_semi = true
			}
			s.comment()
			if !s.mode.has(.scan_comments) {
				unsafe {
					goto start
				}
			}
			s.lit = s.src[s.pos..s.offset]
			return .comment
		}
		// operators carry no literal; do not leak the previous token's text
		s.lit = ''
		// `/=`
		if c2 == `=` {
			s.offset += 2
			return .div_assign
		}
		s.offset++
		// `/`
		return .div
	}
	// number
	else if c >= `0` && c <= `9` {
		s.number()
		s.lit = s.src[s.pos..s.offset]
		s.insert_semi = true
		return .number
	}
	// keyword | name
	else if (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) || c in [`_`, `@`] {
		s.offset++
		// NOTE: I have made `@[` a token instead of using `@` and `[` because `@`
		// is not currently used as a token, and it is also easier to parse this way.
		// if/when `@` becomes used as a token of its own, then I may change this.
		if c == `@` && s.src[s.offset] == `[` {
			s.offset++
			s.lit = ''
			return .attribute
		}
		for s.offset < s.src.len {
			c3 := s.src[s.offset]
			if c3.is_alnum() || c3 == `_` {
				s.offset++
				continue
			}
			break
		}
		s.lit = s.src[s.pos..s.offset]
		tok := token.Token.from_string_tinyv(s.lit)
		if tok in [.key_break, .key_continue, .key_none, .key_return, .key_false, .key_true, .name] {
			s.insert_semi = true
		}
		return tok
	}
	// string
	else if c in [`'`, `"`] {
		s.offset++
		if !s.in_str_inter {
			s.str_quote = c
		}
		// TODO: I would prefer a better way to handle raw
		// the char before the quote is only inspected when there is one,
		// otherwise a string at offset 0 would read before the source
		is_raw := s.in_str_inter || (s.pos > 0 && s.src[s.pos - 1] == `r`)
		s.string_literal(is_raw, c)
		s.lit = s.src[s.pos..s.offset]
		s.insert_semi = true
		return .string
	}
	// byte (char) `a`
	else if c == `\`` {
		s.offset++
		// NOTE: if there is more than one char still scan it
		// we can error at a later stage. should we error now?
		// the loop is bounded so an unterminated literal stops at the end of
		// the source instead of running past it
		for s.offset < s.src.len {
			c2 := s.src[s.offset]
			if c2 == c {
				break
			} else if c2 == `\\` {
				// skip the escaped char as well
				s.offset += 2
				continue
			}
			s.offset++
		}
		if s.offset < s.src.len {
			// consume the closing backtick; lit excludes both backticks
			s.offset++
			s.lit = s.src[s.pos + 1..s.offset - 1]
		} else {
			// unterminated: take everything up to the end of the source
			s.offset = s.src.len
			s.lit = s.src[s.pos + 1..s.offset]
		}
		s.insert_semi = true
		return .char
	}
	// s.lit not set, as tokens below get converted directly to string
	// s.lit = c
	s.lit = ''
	s.offset++
	match c {
		`.` {
			c2 := s.src[s.offset]
			if c2 >= `0` && c2 <= `9` {
				// TODO: only really need decimal
				s.number()
				s.lit = s.src[s.pos..s.offset]
				return .number
			} else if c2 == `.` {
				s.offset++
				if s.src[s.offset] == `.` {
					s.offset++
					return .ellipsis
				}
				return .dotdot
			}
			return .dot
		}
		`:` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .decl_assign
			}
			return .colon
		}
		`!` {
			c2 := s.src[s.offset]
			if c2 == `=` {
				s.offset++
				return .ne
			} else if c2 == `i` && s.offset + 2 < s.src.len {
				// `!in` / `!is` need two more chars; only look when they exist
				c3 := s.src[s.offset + 1]
				c4_is_space := s.src[s.offset + 2] in [` `, `\t`]
				if c3 == `n` && c4_is_space {
					s.offset += 2
					return .not_in
				} else if c3 == `s` && c4_is_space {
					s.offset += 2
					return .not_is
				}
			}
			s.insert_semi = true
			return .not
		}
		`=` {
			c2 := s.src[s.offset]
			if c2 == `=` {
				s.offset++
				return .eq
			}
			return .assign
		}
		`+` {
			c2 := s.src[s.offset]
			if c2 == `+` {
				s.offset++
				return .inc
			} else if c2 == `=` {
				s.offset++
				return .plus_assign
			}
			return .plus
		}
		`-` {
			c2 := s.src[s.offset]
			if c2 == `-` {
				s.offset++
				return .dec
			} else if c2 == `=` {
				s.offset++
				return .minus_assign
			}
			return .minus
		}
		`%` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .mod_assign
			}
			return .mod
		}
		`*` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .mul_assign
			}
			return .mul
		}
		`^` {
			if s.src[s.offset] == `=` {
				s.offset++
				return .xor_assign
			}
			return .xor
		}
		`&` {
			c2 := s.src[s.offset]
			if c2 == `&` {
				// so that we parse &&Type as two .amp instead of .and
				// but this requires there is a space. we could check
				// for capital or some other way, this is simplest for now.
				if s.offset + 1 < s.src.len && s.src[s.offset + 1] in [` `, `\t`] {
					s.offset++
					return .and
				}
			} else if c2 == `=` {
				s.offset++
				return .and_assign
			}
			return .amp
		}
		`|` {
			c2 := s.src[s.offset]
			if c2 == `|` {
				s.offset++
				return .logical_or
			} else if c2 == `=` {
				s.offset++
				return .or_assign
			}
			return .pipe
		}
		`<` {
			c2 := s.src[s.offset]
			if c2 == `<` {
				s.offset++
				if s.src[s.offset] == `=` {
					s.offset++
					return .left_shift_assign
				}
				return .left_shift
			} else if c2 == `=` {
				s.offset++
				return .le
			} else if c2 == `-` {
				s.offset++
				return .arrow
			}
			return .lt
		}
		`>` {
			c2 := s.src[s.offset]
			if c2 == `>` {
				s.offset++
				c3 := s.src[s.offset]
				if c3 == `>` {
					s.offset++
					if s.src[s.offset] == `=` {
						s.offset++
						return .right_shift_unsigned_assign
					}
					return .right_shift_unsigned
				} else if c3 == `=` {
					s.offset++
					return .right_shift_assign
				}
				return .right_shift
			} else if c2 == `=` {
				s.offset++
				return .ge
			}
			return .gt
		}
		`#` {
			// if we choose to scan whole line
			// s.line()
			return .hash
		}
		// `@` { return .at }
		`~` {
			return .bit_not
		}
		`,` {
			return .comma
		}
		`$` {
			if s.in_str_inter {
				return .str_dollar
			}
			return .dollar
		}
		`{` {
			if s.in_str_inter {
				s.str_inter_cbr_depth++
			}
			return .lcbr
		}
		`}` {
			if s.in_str_inter {
				s.str_inter_cbr_depth--
				// the brace closing `${` was reached: the next call to
				// scan continues with the remainder of the string
				if s.str_inter_cbr_depth == 0 {
					s.in_str_incomplete = true
					s.in_str_inter = false
				}
			}
			s.insert_semi = true
			return .rcbr
		}
		`(` {
			return .lpar
		}
		`)` {
			s.insert_semi = true
			return .rpar
		}
		`[` {
			s.insert_semi = true
			return .lsbr
		}
		`]` {
			s.insert_semi = true
			return .rsbr
		}
		`;` {
			return .semicolon
		}
		`?` {
			s.insert_semi = true
			return .question
		}
		else {
			return .unknown
		}
	}
}
// whitespace advances `s.offset` past spaces, tabs and carriage returns.
// Newlines are skipped too (and reported to the file as line starts), unless
// `s.insert_semi` is set: then the scanner stops on the newline so that `scan`
// can turn it into a `.semicolon`.
@[direct_array_access]
fn (mut s Scanner) whitespace() {
	for s.offset < s.src.len {
		match s.src[s.offset] {
			` `, `\t`, `\r` {
				s.offset++
			}
			`\n` {
				if s.insert_semi {
					return
				}
				s.offset++
				s.file.add_line(s.offset)
			}
			else {
				// first non whitespace char
				return
			}
		}
	}
	// s.insert_semi = false
}
// line advances `s.offset` up to, but not past, the next newline
// (or to the end of the source when there is none).
@[direct_array_access]
fn (mut s Scanner) line() {
	// the newline itself is left in place: the next whitespace call
	// consumes it and records the line start, so it is not added here
	for s.offset < s.src.len && s.src[s.offset] != `\n` {
		s.offset++
	}
}
// comment skips over a comment. It expects `s.offset` to be on the leading `/`.
// `//` comments run up to (not including) the end of the line; `/* */` comments
// may nest and may span lines, in which case each line start is reported to
// the file.
@[direct_array_access]
fn (mut s Scanner) comment() {
	s.offset++
	kind := s.src[s.offset]
	// single line
	if kind == `/` {
		s.line()
		return
	}
	if kind != `*` {
		return
	}
	// multi line
	s.offset++
	mut depth := 1
	for s.offset < s.src.len {
		cur := s.src[s.offset]
		ahead := s.src[s.offset + 1]
		if cur == `\n` {
			s.offset++
			s.file.add_line(s.offset)
			continue
		}
		opens := cur == `/` && ahead == `*`
		closes := cur == `*` && ahead == `/`
		if !opens && !closes {
			s.offset++
			continue
		}
		// both delimiters are two chars wide
		s.offset += 2
		if opens {
			depth++
			continue
		}
		depth--
		if depth == 0 {
			break
		}
	}
}
// string_literal scans the body of a string, starting at `s.offset` (just after
// the opening quote, or just after an interpolation when resuming).
//
// - scan_as_raw: no escapes and no interpolation, scan straight to `c_quote`.
// - c_quote: the quote char that terminates the string.
//
// On return `s.offset` is just past the closing quote, or on the `$` of a
// `${` interpolation (when interpolations are not skipped), or at the end of
// the source when the string is unterminated. It never exceeds `s.src.len`,
// so the caller's `s.src[s.pos..s.offset]` slice stays inside the source.
// Every newline inside the string is reported to the file as a line start.
@[direct_array_access]
fn (mut s Scanner) string_literal(scan_as_raw bool, c_quote u8) {
	// shortcut, scan whole string
	if scan_as_raw {
		for s.offset < s.src.len {
			c := s.src[s.offset]
			if c == c_quote {
				break
			}
			s.offset++
			// keep line tracking in sync, same as for normal strings
			if c == `\n` {
				s.file.add_line(s.offset)
			}
		}
		// consume the closing quote only when there is one
		if s.offset < s.src.len {
			s.offset++
		}
		return
	}
	// normal strings
	for s.offset < s.src.len {
		c := s.src[s.offset]
		// escape `\\n` | `\'`
		if c == `\\` {
			s.offset += 2
			continue
		} else if c == `\n` {
			s.offset++
			s.file.add_line(s.offset)
			continue
		} else if c == `$` && s.src[s.offset + 1] == `{` {
			s.in_str_inter = true
			if s.skip_interpolation {
				s.str_inter_cbr_depth++
				s.offset += 2
				continue
			} else {
				// stop on the `$`; scan() returns the interpolation tokens next
				return
			}
		} else if s.skip_interpolation && s.in_str_inter {
			// track braces so a quote inside `${...}` does not end the string
			if c == `{` {
				s.str_inter_cbr_depth++
			} else if c == `}` {
				s.str_inter_cbr_depth--
				if s.str_inter_cbr_depth == 0 {
					s.in_str_inter = false
				}
			}
		} else if c == c_quote && !s.in_str_inter {
			s.offset++
			break
		}
		s.offset++
	}
	// a backslash as the very last char advances by two and overshoots
	if s.offset > s.src.len {
		s.offset = s.src.len
	}
}
// number scans a numeric literal starting at `s.offset` and leaves `s.offset`
// just past it. The caller slices the literal text out of the source.
//
// Recognised forms:
// - `0b` / `0B` binary, `0x` / `0X` hex, `0o` / `0O` octal
// - decimal with an optional fraction and an optional exponent, where the
//   exponent may carry a sign (`1e-5`, `2.5E+3`)
// `_` is accepted as a digit separator in every form.
// No validation is done here; malformed literals are scanned as far as they go.
@[direct_array_access]
fn (mut s Scanner) number() {
	if s.src[s.offset] == `0` {
		s.offset++
		// a lone `0` at the very end of the source has no prefix char
		if s.offset >= s.src.len {
			return
		}
		c := s.src[s.offset]
		// TODO: impl proper underscore support
		// 0b (binary)
		if c in [`b`, `B`] {
			s.offset++
			for s.offset < s.src.len {
				c2 := s.src[s.offset]
				if c2 in [`0`, `1`] || c2 == `_` {
					s.offset++
					continue
				}
				break
			}
			return
		}
		// 0x (hex)
		else if c in [`x`, `X`] {
			s.offset++
			for s.offset < s.src.len {
				c2 := s.src[s.offset]
				if (c2 >= `0` && c2 <= `9`) || (c2 >= `a` && c2 <= `f`)
					|| (c2 >= `A` && c2 <= `F`) || c2 == `_` {
					s.offset++
					continue
				}
				break
			}
			return
		}
		// 0o (octal)
		else if c in [`o`, `O`] {
			s.offset++
			for s.offset < s.src.len {
				c2 := s.src[s.offset]
				// `_` is accepted here as well, same as for binary and hex
				if (c2 >= `0` && c2 <= `7`) || c2 == `_` {
					s.offset++
					continue
				}
				break
			}
			return
		}
	}
	mut has_decimal := false
	mut has_exponent := false
	// TODO: proper impl of fraction / exponent
	// continue decimal
	for s.offset < s.src.len {
		c := s.src[s.offset]
		if (c >= `0` && c <= `9`) || c == `_` {
			s.offset++
			continue
		}
		// fraction (but not the start of a `..` range)
		else if !has_decimal && c == `.` && s.src[s.offset + 1] != `.` {
			has_decimal = true
			s.offset++
			continue
		}
		// exponent
		else if !has_exponent && c in [`e`, `E`] {
			has_exponent = true
			s.offset++
			// optional sign: only taken when a digit follows, so that
			// `1e-5` is one token while `1e - x` is left alone
			if s.offset + 1 < s.src.len && s.src[s.offset] in [`+`, `-`] {
				d := s.src[s.offset + 1]
				if d >= `0` && d <= `9` {
					s.offset++
				}
			}
			continue
		}
		break
	}
}