mirror of
https://github.com/vlang/v.git
synced 2025-09-10 16:00:31 -04:00
encoding.utf8: replace uchar with rune, deprecate get_uchar, is_uchar_punct, is_uchar_global_punct (#22560)
This commit is contained in:
parent
f814386c2a
commit
35079f115d
@ -33,7 +33,7 @@ pub fn display_width(s string, ambiguous_width int) int {
|
|||||||
|
|
||||||
// east_asian_width_property_at returns the East Asian Width property at string[index]
|
// east_asian_width_property_at returns the East Asian Width property at string[index]
|
||||||
pub fn east_asian_width_property_at(s string, index int) EastAsianWidthProperty {
|
pub fn east_asian_width_property_at(s string, index int) EastAsianWidthProperty {
|
||||||
codepoint := utf8.get_uchar(s, index)
|
codepoint := utf8.get_rune(s, index)
|
||||||
mut left, mut right := 0, east_asian_width_data.len - 1
|
mut left, mut right := 0, east_asian_width_data.len - 1
|
||||||
for left <= right {
|
for left <= right {
|
||||||
middle := left + ((right - left) / 2)
|
middle := left + ((right - left) / 2)
|
||||||
|
@ -27,8 +27,15 @@ pub fn len(s string) int {
|
|||||||
return count
|
return count
|
||||||
}
|
}
|
||||||
|
|
||||||
// get_uchar convert a unicode glyph in string[index] into a int unicode char
|
// get_uchar converts the UTF-8 encoded unicode codepoint at string[index] into a UTF-32 encoded int unicode char
|
||||||
|
@[deprecated: 'use `.get_rune(s string, index int)` instead']
|
||||||
|
@[deprecated_after: '2024-11-17']
|
||||||
pub fn get_uchar(s string, index int) int {
|
pub fn get_uchar(s string, index int) int {
|
||||||
|
return int(get_rune(s, index))
|
||||||
|
}
|
||||||
|
|
||||||
|
// get_rune converts the UTF-8 encoded unicode codepoint at string[index] into a UTF-32 encoded rune
|
||||||
|
pub fn get_rune(s string, index int) rune {
|
||||||
mut res := 0
|
mut res := 0
|
||||||
mut ch_len := 0
|
mut ch_len := 0
|
||||||
if s.len > 0 {
|
if s.len > 0 {
|
||||||
@ -81,7 +88,7 @@ pub fn raw_index(s string, index int) string {
|
|||||||
|
|
||||||
r << if ch_len > 0 {
|
r << if ch_len > 0 {
|
||||||
i += ch_len
|
i += ch_len
|
||||||
rune(get_uchar(s, i - ch_len))
|
rune(get_rune(s, i - ch_len))
|
||||||
} else {
|
} else {
|
||||||
rune(b)
|
rune(b)
|
||||||
}
|
}
|
||||||
@ -126,7 +133,7 @@ pub fn to_lower(s string) string {
|
|||||||
|
|
||||||
// is_punct return true if the string[index] byte is the start of a unicode western punctuation
|
// is_punct return true if the string[index] byte is the start of a unicode western punctuation
|
||||||
pub fn is_punct(s string, index int) bool {
|
pub fn is_punct(s string, index int) bool {
|
||||||
return is_uchar_punct(get_uchar(s, index))
|
return is_rune_punct(get_rune(s, index))
|
||||||
}
|
}
|
||||||
|
|
||||||
// is_control return true if the rune is control code
|
// is_control return true if the rune is control code
|
||||||
@ -175,20 +182,34 @@ pub fn is_number(r rune) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// is_uchar_punct return true if the input unicode is a western unicode punctuation
|
// is_uchar_punct return true if the input unicode is a western unicode punctuation
|
||||||
|
@[deprecated: 'use `.is_rune_punct(r rune)` instead']
|
||||||
|
@[deprecated_after: '2024-11-17']
|
||||||
pub fn is_uchar_punct(uchar int) bool {
|
pub fn is_uchar_punct(uchar int) bool {
|
||||||
return find_punct_in_table(uchar, unicode_punct_western) != 0
|
return is_rune_punct(rune(uchar))
|
||||||
|
}
|
||||||
|
|
||||||
|
// is_rune_punct returns true if the input rune is a western unicode punctuation
|
||||||
|
pub fn is_rune_punct(r rune) bool {
|
||||||
|
return find_punct_in_table(r, unicode_punct_western) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// Global
|
// Global
|
||||||
|
|
||||||
// is_global_punct returns true if the string[index] byte is the start of a global unicode punctuation
|
// is_global_punct returns true if the string[index] byte is the start of a global unicode punctuation
|
||||||
pub fn is_global_punct(s string, index int) bool {
|
pub fn is_global_punct(s string, index int) bool {
|
||||||
return is_uchar_global_punct(get_uchar(s, index))
|
return is_rune_global_punct(get_rune(s, index))
|
||||||
}
|
}
|
||||||
|
|
||||||
// is_uchar_global_punct return true if the input unicode is a global unicode punctuation
|
// is_uchar_global_punct return true if the input unicode is a global unicode punctuation
|
||||||
|
@[deprecated: 'use `.is_rune_global_punct(r rune)` instead']
|
||||||
|
@[deprecated_after: '2024-11-17']
|
||||||
pub fn is_uchar_global_punct(uchar int) bool {
|
pub fn is_uchar_global_punct(uchar int) bool {
|
||||||
return find_punct_in_table(uchar, unicode_punct) != 0
|
return is_rune_global_punct(rune(uchar))
|
||||||
|
}
|
||||||
|
|
||||||
|
// is_rune_global_punct returns true if the input rune is a global unicode punctuation
|
||||||
|
pub fn is_rune_global_punct(r rune) bool {
|
||||||
|
return find_punct_in_table(r, unicode_punct) != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// Private functions
|
// Private functions
|
||||||
@ -523,13 +544,13 @@ fn convert_case(s string, upper_flag bool) string {
|
|||||||
|
|
||||||
// find_punct_in_table looks for valid punctuation in table
|
// find_punct_in_table looks for valid punctuation in table
|
||||||
@[direct_array_access]
|
@[direct_array_access]
|
||||||
fn find_punct_in_table(in_code int, in_table []int) int {
|
fn find_punct_in_table(in_code rune, in_table []rune) rune {
|
||||||
// uses simple binary search
|
// uses simple binary search
|
||||||
|
|
||||||
mut first_index := 0
|
mut first_index := 0
|
||||||
mut last_index := (in_table.len)
|
mut last_index := (in_table.len)
|
||||||
mut index := 0
|
mut index := 0
|
||||||
mut x := 0
|
mut x := rune(0)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
index = (first_index + last_index) >> 1
|
index = (first_index + last_index) >> 1
|
||||||
@ -559,7 +580,7 @@ fn find_punct_in_table(in_code int, in_table []int) int {
|
|||||||
// Western punctuation mark
|
// Western punctuation mark
|
||||||
// Character Name Browser Image
|
// Character Name Browser Image
|
||||||
const unicode_punct_western = [
|
const unicode_punct_western = [
|
||||||
0x0021, // EXCLAMATION MARK !
|
rune(0x0021), // EXCLAMATION MARK !
|
||||||
0x0022, // QUOTATION MARK "
|
0x0022, // QUOTATION MARK "
|
||||||
0x0027, // APOSTROPHE '
|
0x0027, // APOSTROPHE '
|
||||||
0x002A, // ASTERISK *
|
0x002A, // ASTERISK *
|
||||||
@ -593,7 +614,7 @@ const unicode_punct_western = [
|
|||||||
// Unicode Characters in the 'Punctuation, Other' Category
|
// Unicode Characters in the 'Punctuation, Other' Category
|
||||||
// Character Name Browser Image
|
// Character Name Browser Image
|
||||||
const unicode_punct = [
|
const unicode_punct = [
|
||||||
0x0021, // EXCLAMATION MARK !
|
rune(0x0021), // EXCLAMATION MARK !
|
||||||
0x0022, // QUOTATION MARK "
|
0x0022, // QUOTATION MARK "
|
||||||
0x0023, // NUMBER SIGN #
|
0x0023, // NUMBER SIGN #
|
||||||
0x0025, // PERCENT SIGN %
|
0x0025, // PERCENT SIGN %
|
||||||
|
@ -23,7 +23,8 @@ fn test_utf8_util() {
|
|||||||
a := '.abc?abcòàè.'
|
a := '.abc?abcòàè.'
|
||||||
assert utf8.is_punct(a, 0) == true
|
assert utf8.is_punct(a, 0) == true
|
||||||
assert utf8.is_punct('b', 0) == false
|
assert utf8.is_punct('b', 0) == false
|
||||||
assert utf8.is_uchar_punct(0x002E) == true
|
assert utf8.is_uchar_punct(0x002E) == true // Test deprecated
|
||||||
|
assert utf8.is_rune_punct(0x002E) == true
|
||||||
assert utf8.is_punct(a, 4) == true // ?
|
assert utf8.is_punct(a, 4) == true // ?
|
||||||
assert utf8.is_punct(a, 14) == true // last .
|
assert utf8.is_punct(a, 14) == true // last .
|
||||||
assert utf8.is_punct(a, 12) == false // è
|
assert utf8.is_punct(a, 12) == false // è
|
||||||
@ -33,12 +34,18 @@ fn test_utf8_util() {
|
|||||||
b := '.ĂĂa. ÔÔ TESTO Æ€'
|
b := '.ĂĂa. ÔÔ TESTO Æ€'
|
||||||
assert utf8.is_global_punct(b, 0) == true
|
assert utf8.is_global_punct(b, 0) == true
|
||||||
assert utf8.is_global_punct('.', 0) == true
|
assert utf8.is_global_punct('.', 0) == true
|
||||||
assert utf8.is_uchar_punct(0x002E) == true
|
assert utf8.is_uchar_punct(0x002E) == true // Test deprecated
|
||||||
|
assert utf8.is_rune_punct(0x002E) == true
|
||||||
assert utf8.is_global_punct(b, 6) == true // .
|
assert utf8.is_global_punct(b, 6) == true // .
|
||||||
assert utf8.is_global_punct(b, 1) == false // a
|
assert utf8.is_global_punct(b, 1) == false // a
|
||||||
|
|
||||||
// test utility functions
|
// test utility functions
|
||||||
assert utf8.get_uchar(b, 0) == 0x002E
|
assert utf8.get_uchar(b, 0) == 0x002E // Test deprecated
|
||||||
|
c := 'a©★🚀'
|
||||||
|
assert utf8.get_rune(c, 0) == `a` // 1 byte
|
||||||
|
assert utf8.get_rune(c, 1) == `©` // 2 bytes
|
||||||
|
assert utf8.get_rune(c, 3) == `★` // 3 bytes
|
||||||
|
assert utf8.get_rune(c, 6) == `🚀` // 4 bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_raw_indexing() {
|
fn test_raw_indexing() {
|
||||||
@ -56,6 +63,13 @@ fn test_raw_indexing() {
|
|||||||
assert utf8.raw_index(a, 6) == 'n'
|
assert utf8.raw_index(a, 6) == 'n'
|
||||||
assert utf8.raw_index(a, 7) == 'g'
|
assert utf8.raw_index(a, 7) == 'g'
|
||||||
assert utf8.raw_index(a, 8) == '!'
|
assert utf8.raw_index(a, 8) == '!'
|
||||||
|
|
||||||
|
// test different utf8 byte lengths
|
||||||
|
c := 'a©★🚀'
|
||||||
|
assert utf8.raw_index(c, 0) == 'a' // 1 byte
|
||||||
|
assert utf8.raw_index(c, 1) == '©' // 2 bytes
|
||||||
|
assert utf8.raw_index(c, 2) == '★' // 3 bytes
|
||||||
|
assert utf8.raw_index(c, 3) == '🚀' // 4 bytes
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_reversed() {
|
fn test_reversed() {
|
||||||
|
@ -483,7 +483,7 @@ pub fn (mut bmp BitMap) get_chars_bbox(in_string string) []int {
|
|||||||
// manage unicode chars like latin greek etc
|
// manage unicode chars like latin greek etc
|
||||||
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
||||||
if c_len > 1 {
|
if c_len > 1 {
|
||||||
tmp_char := utf8.get_uchar(in_string, i)
|
tmp_char := utf8.get_rune(in_string, i)
|
||||||
// dprintln("tmp_char: ${tmp_char.hex()}")
|
// dprintln("tmp_char: ${tmp_char.hex()}")
|
||||||
chr = u16(tmp_char)
|
chr = u16(tmp_char)
|
||||||
}
|
}
|
||||||
@ -554,7 +554,7 @@ pub fn (mut bmp BitMap) get_bbox(in_string string) (int, int) {
|
|||||||
// manage unicode chars like latin greek etc
|
// manage unicode chars like latin greek etc
|
||||||
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
||||||
if c_len > 1 {
|
if c_len > 1 {
|
||||||
tmp_char := utf8.get_uchar(in_string, i)
|
tmp_char := utf8.get_rune(in_string, i)
|
||||||
// dprintln("tmp_char: ${tmp_char.hex()}")
|
// dprintln("tmp_char: ${tmp_char.hex()}")
|
||||||
chr = u16(tmp_char)
|
chr = u16(tmp_char)
|
||||||
}
|
}
|
||||||
@ -649,7 +649,7 @@ pub fn (mut bmp BitMap) draw_text(in_string string) (int, int) {
|
|||||||
// manage unicode chars like latin greek etc
|
// manage unicode chars like latin greek etc
|
||||||
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
c_len := ((0xe5000000 >> ((chr >> 3) & 0x1e)) & 3) + 1
|
||||||
if c_len > 1 {
|
if c_len > 1 {
|
||||||
tmp_char := utf8.get_uchar(in_string, i)
|
tmp_char := utf8.get_rune(in_string, i)
|
||||||
// dprintln("tmp_char: ${tmp_char.hex()}")
|
// dprintln("tmp_char: ${tmp_char.hex()}")
|
||||||
chr = u16(tmp_char)
|
chr = u16(tmp_char)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user